From e5557bc3cebd1b89ed25eb75b71f33a85a4ffef1 Mon Sep 17 00:00:00 2001 From: sunway513 Date: Tue, 14 Apr 2026 07:13:07 +0000 Subject: [PATCH 1/2] feat: add/retune BF16 GEMM configs with FlyDSL backend for 6 models Tuned on MI355X (gfx950) with all backends competing (ASM, hipBLASLt, Triton, FlyDSL). New tuned configs for Llama 70B, Llama 405B, Qwen 32B. Re-tuned existing configs for GPT-OSS, DSV3, Kimi-K2 to include FlyDSL. Backend wins across 708 total shapes: - hipBLASLt: 472 (66.7%) - ASM: 131 (18.5%) - FlyDSL: 70 (9.9%) - Triton: 7 (1.0%) - Mixed/other: 28 (4.0%) --- .../model_configs/dsv3_bf16_tuned_gemm.csv | 84 ++- .../model_configs/gptoss_bf16_tuned_gemm.csv | 186 ++---- .../model_configs/kimik2_bf16_tuned_gemm.csv | 546 ++++-------------- .../llama405B_bf16_tuned_gemm.csv | 157 +++++ .../llama70B_bf16_tuned_gemm.csv | 157 +++++ .../model_configs/qwen32B_bf16_tuned_gemm.csv | 157 +++++ 6 files changed, 711 insertions(+), 576 deletions(-) create mode 100644 aiter/configs/model_configs/llama405B_bf16_tuned_gemm.csv create mode 100644 aiter/configs/model_configs/llama70B_bf16_tuned_gemm.csv create mode 100644 aiter/configs/model_configs/qwen32B_bf16_tuned_gemm.csv diff --git a/aiter/configs/model_configs/dsv3_bf16_tuned_gemm.csv b/aiter/configs/model_configs/dsv3_bf16_tuned_gemm.csv index dee5e9b994..014f64c631 100644 --- a/aiter/configs/model_configs/dsv3_bf16_tuned_gemm.csv +++ b/aiter/configs/model_configs/dsv3_bf16_tuned_gemm.csv @@ -1,27 +1,59 @@ cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw -256,1,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,14,7.4498,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0117,0.49,494.63 -256,1,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,63.0464,auto,0.0,3.67,3675.33 -256,2,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,7.2696,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0234,1.01,508.93 -256,2,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,64.5459,auto,0.0,7.18,3590.67 -256,4,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,11,7.7755,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0322,1.89,479.64 -256,4,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,64.2502,auto,0.0,14.42,3608.65 -256,8,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,7.8799,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0303,3.73,480.82 -256,8,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,64.6376,auto,0.0,28.67,3589.91 -256,16,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,16,7.8452,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0312,7.48,498.09 -256,16,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,65.0337,auto,0.0,57.0,3573.78 -256,32,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,14,7.7283,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0288,15.2,536.36 -256,32,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,64.518,auto,0.0,114.9,3613.92 -256,48,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,14,7.9838,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0282,22.06,548.95 -256,48,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,65.2215,auto,0.0,170.5,3586.38 -256,64,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,16,7.9784,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0302,29.44,579.1 -256,64,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,65.4327,auto,0.0,226.6,3586.22 -256,80,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,14,8.5025,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0281,34.53,571.34 -256,80,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,84.1977,auto,0.0,220.12,2795.83 -256,96,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,16,8.6162,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0321,40.89,591.38 -256,96,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,86.0249,auto,0.0,258.53,2745.12 -256,112,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,12,9.9659,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0278,41.24,535.12 -256,112,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,86.1104,auto,0.0,301.32,2751.06 -256,128,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,12,10.1315,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0279,46.37,549.83 -256,128,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,86.9964,auto,0.0,340.86,2731.63 -256,256,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,8,12.4846,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0223,75.25,598.43 -256,256,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,120.6481,auto,0.0,491.57,2019.21 +256,1,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.4051,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0117,0.5,497.61 +256,2,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.4651,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0254,0.98,495.6 +256,4,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.4101,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0361,1.98,503.29 +256,8,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.5587,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0342,3.88,501.25 +256,16,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,7.6922,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0281,7.63,507.99 +256,32,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,14,7.7252,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.032,15.2,536.58 +256,48,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.9759,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0295,22.09,549.5 +256,64,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,16,8.4443,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0322,27.82,547.15 +256,80,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,16,8.669,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0322,33.87,560.37 +256,96,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,16,9.1151,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0325,38.65,559.01 +256,112,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,9.5249,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0315,43.15,559.9 +256,128,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.7811,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0221,48.03,569.52 +256,256,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441168,0,11.9774,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,78.44,623.77 +256,1,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,9.1742,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0303,3.3,3302.33 +256,2,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,292,16,9.1774,flydsl_gemm2_abf16_wbf16_bf16_t64x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0284,6.6,3303.2 +256,80,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,6,15.6657,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0183,154.62,2027.51 +256,112,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440339,0,17.7572,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB3_NTC1_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,190.97,1822.15 +256,128,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439770,0,18.1517,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA2_NTB1_NTC7_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,213.51,1798.91 +256,256,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440339,0,22.0156,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB3_NTC1_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,352.07,1591.1 +256,1,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439841,0,6.0249,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA6_NTB1_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,1.57,1567.89 +256,2,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440041,0,5.6299,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC2_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,3.35,1679.54 +256,4,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440906,0,5.7354,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,6.58,1651.85 +256,8,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440305,0,5.8171,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC3_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,12.98,1634.99 +256,16,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440041,0,6.3749,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC2_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,23.69,1503.5 +256,32,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440288,0,6.8235,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2,0.0,44.26,1426.26 +256,48,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439842,0,7.3297,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,61.8,1347.88 +256,64,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439842,0,7.2513,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,83.29,1382.79 +256,80,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440291,0,8.8113,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA6_NTB2_NTC3_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,85.68,1154.71 +256,96,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439773,0,8.9138,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA7_NTB1_NTC3_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,101.64,1157.97 +256,112,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440247,0,9.6383,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB6_NTC3_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,109.66,1086.23 +256,128,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440045,0,9.5386,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB1_NTC3_NTD7_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,126.64,1113.04 +256,256,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439779,0,10.6106,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,227.69,1111.76 +256,1,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439772,0,8.6754,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,3.38,3386.42 +256,2,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,194,8,8.4386,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0144,6.96,3483.63 +256,4,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440181,0,8.4056,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB0_NTC3_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,13.97,3501.7 +256,8,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439977,0,8.9689,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB2_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,26.19,3289.99 +256,16,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439704,0,9.284,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,50.6,3194.21 +256,32,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440111,0,9.6079,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB2_NTC2_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,97.79,3117.22 +256,48,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439773,0,10.8192,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA7_NTB1_NTC3_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,130.26,2795.48 +256,64,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439637,0,11.7601,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,159.78,2596.9 +256,80,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441443,0,12.4433,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,188.76,2478.02 +256,96,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439777,0,12.722,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA3_NTB1_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,221.55,2446.91 +256,112,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440890,0,13.776,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB0_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,238.7,2281.11 +256,128,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440890,0,13.8942,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB0_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,270.48,2282.93 +256,256,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440054,0,16.5758,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA2_NTB2_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,453.44,2055.93 +256,1,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440628,0,36.8269,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,6.29,6292.04 +256,2,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439704,0,36.7342,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,12.61,6309.19 +256,4,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440628,0,37.0902,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,24.98,6251.15 +256,8,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440628,0,36.9536,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,50.15,6279.31 +256,16,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440907,0,37.6556,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,98.44,6172.16 +256,32,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440891,0,38.4907,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC3_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,192.6,6057.64 +256,48,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440929,0,40.0143,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA4_NTB0_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,277.9,5845.64 +256,64,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440890,0,42.0445,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB0_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,352.65,5581.13 +256,80,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439982,0,44.2897,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x80x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_5_MO40_NTn1_NTA4_NTB1_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,418.46,5315.06 +256,96,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440514,0,44.5354,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA4_NTB0_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,499.38,5302.5 +256,112,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440209,0,46.6162,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA7_NTB2_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,556.61,5081.82 +256,128,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440209,0,46.3567,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA7_NTB2_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,639.69,5126.37 +256,256,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440459,0,73.0382,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA5_NTB3_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,812.01,3335.43 diff --git a/aiter/configs/model_configs/gptoss_bf16_tuned_gemm.csv b/aiter/configs/model_configs/gptoss_bf16_tuned_gemm.csv index d5ab3948d1..2498e04776 100644 --- a/aiter/configs/model_configs/gptoss_bf16_tuned_gemm.csv +++ b/aiter/configs/model_configs/gptoss_bf16_tuned_gemm.csv @@ -1,130 +1,58 @@ cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw -256,1,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,11,4.4021,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0156,0.17,168.85 -256,1,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,9.7108,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0082,1.52,1519.59 -256,1,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,5,8.6543,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0097,1.36,1364.22 -256,1,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,9.1855,auto,0.0,2.57,2570.02 -256,1,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,11.2012,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.008,2.63,2634.29 -256,2,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,11,4.4168,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0156,0.33,169.65 -256,2,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.1765,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0109,3.21,1609.26 -256,2,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,8.4399,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0102,2.8,1400.04 -256,2,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,9.4381,auto,0.0,5.0,2502.71 -256,2,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,11.402000000000001,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0075,5.17,2589.3 -256,4,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,9,4.4035,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0098,0.67,172.9 -256,4,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,9.9316,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0122,5.94,1489.1 -256,4,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,4,8.1855,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0091,5.76,1445.96 -256,4,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,9.7799,auto,0.0,9.65,2418.1 -256,4,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,11.5586,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0074,10.21,2556.99 -256,8,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,9,4.4643,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0127,1.32,175.93 -256,8,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,4,9.7728,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0104,12.07,1517.75 -256,8,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,8.6642,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0064,10.89,1370.62 -256,8,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,10.2907,auto,0.0,18.34,2303.5 -256,8,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,11.7116,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0077,20.14,2529.05 -256,16,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,10,4.8658,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0146,2.42,171.31 -256,16,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,4,9.8954,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.01,23.84,1507.74 -256,16,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,8.7178,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0062,21.65,1371.24 -256,16,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,10.4181,auto,0.0,36.23,2286.04 -256,16,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,12.0177,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0077,39.26,2475.28 -256,32,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,15,4.3956,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0234,5.37,211.53 -256,32,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.7246,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0099,48.52,1552.12 -256,32,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,9.5423,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0107,39.56,1269.28 -256,32,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,5,11.2956,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0138,66.84,2128.21 -256,32,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,12.2183,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0077,77.24,2455.6 -256,48,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,13,4.7396,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0199,7.47,216.48 -256,48,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,4,10.6698,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0101,66.34,1430.94 -256,48,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,4,9.5586,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.009000000000000001,59.24,1283.62 -256,48,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,11.8153,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0134,95.85,2053.49 -256,48,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,3,13.0321,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0077,108.62,2321.9 -256,64,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,15,4.9073,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.025,9.62,228.7 -256,64,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,10.5605,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0074,89.36,1462.23 -256,64,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,9.6563,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0067,78.18,1286.96 -256,64,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,12.3105,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0137,122.66,1989.02 -256,64,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,13.5485,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0077,139.31,2252.29 -256,80,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,9,4.8408,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0184,12.18,251.73 -256,80,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,12.6552,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0135,149.14,1952.49 -256,80,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,7,3,14.0135,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0076,168.36,2195.83 -256,96,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,8,15,5.1844,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.026000000000000002,13.65,253.61 -256,96,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,13.2504,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0137,170.93,1881.63 -256,96,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,8,3,14.5394,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0076,194.72,2134.01 -256,112,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,9,5.0344,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,16.4,280.29 -256,112,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,17.4414,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0048,151.5,1442.29 -256,112,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,19.2107,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,171.94,1628.43 -256,128,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,8,4.7402,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.016,19.91,317.99 -256,128,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,7,3,11.9095,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0074,158.48,1355.07 -256,128,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,11.2484,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0035,134.24,1160.88 -256,128,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,17.5461,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0048,172.11,1446.41 -256,128,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,19.2945,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,195.65,1634.62 -256,256,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,7,6.5053,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0157,29.01,350.08 -256,256,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,15.4372,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0041,244.53,1135.63 -256,256,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,15.4366,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,195.63,927.64 -304,1,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618599,0,17.0412,auto,0.0,1.73,1731.52 -304,1,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,618608,0,10.1359,auto,0.0,2.33,2329.04 -304,1,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618603,0,10.3001,auto,0.0,0.07,72.16 -304,2,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618591,0,17.2327,auto,0.0,3.42,1713.21 -304,2,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,618603,0,10.1022,auto,0.0,4.67,2338.19 -304,2,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618601,0,10.4159,auto,0.0,0.14,71.94 -304,4,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618599,0,17.3212,auto,0.0,6.81,1706.3 -304,4,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,618603,0,10.3169,auto,0.0,9.15,2292.24 -304,4,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618601,0,10.3779,auto,0.0,0.28,73.36 -304,8,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618599,0,17.2979,auto,0.0,13.64,1712.3 -304,8,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,618603,0,10.1527,auto,0.0,18.59,2334.81 -304,8,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618603,0,10.8012,auto,0.0,0.55,72.71 -304,16,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618599,0,17.3674,auto,0.0,27.17,1712.82 -304,16,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,618603,0,10.3801,auto,0.0,36.37,2294.41 -304,16,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618601,0,11.1528,auto,0.0,1.06,74.74 -304,32,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618591,0,18.8286,auto,0.0,50.12,1593.49 -304,32,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,618604,0,12.1128,auto,0.0,62.33,1984.63 -304,32,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618603,0,11.4643,auto,0.0,2.06,81.1 -304,48,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618578,0,21.9233,auto,0.0,64.57,1380.23 -304,48,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,618592,0,13.6643,auto,0.0,82.88,1775.62 -304,48,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618603,0,11.7001,auto,0.0,3.02,87.7 -304,64,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618571,0,22.6181,auto,0.0,83.45,1349.15 -304,64,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,618592,0,14.0454,auto,0.0,107.5,1743.34 -304,64,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618593,0,11.8327,auto,0.0,3.99,94.85 -304,80,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618571,0,22.5254,auto,0.0,104.74,1366.07 -304,80,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,618590,0,15.9991,auto,0.0,117.97,1544.41 -304,80,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618593,0,11.6769,auto,0.0,5.05,104.36 -304,96,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618571,0,22.5065,auto,0.0,125.79,1378.59 -304,96,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,618590,0,16.6728,auto,0.0,135.85,1495.39 -304,96,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618593,0,11.8916,auto,0.0,5.95,110.57 -304,112,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618570,0,24.6222,auto,0.0,134.15,1270.53 -304,112,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,618590,0,17.3106,auto,0.0,152.65,1453.19 -304,112,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618593,0,11.9001,auto,0.0,6.94,118.58 -304,128,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618570,0,24.8097,auto,0.0,152.15,1271.24 -304,128,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,618590,0,17.9002,auto,0.0,168.71,1417.8 -304,128,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618593,0,11.5906,auto,0.0,8.14,130.05 -80,1,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618599,0,17.0412,auto,0.0,1.73,1731.52 -80,1,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,618608,0,10.1359,auto,0.0,2.33,2329.04 -80,1,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618603,0,10.3001,auto,0.0,0.07,72.16 -80,2,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618591,0,17.2327,auto,0.0,3.42,1713.21 -80,2,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,618603,0,10.1022,auto,0.0,4.67,2338.19 -80,2,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618601,0,10.4159,auto,0.0,0.14,71.94 -80,4,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618599,0,17.3212,auto,0.0,6.81,1706.3 -80,4,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,618603,0,10.3169,auto,0.0,9.15,2292.24 -80,4,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618601,0,10.3779,auto,0.0,0.28,73.36 -80,8,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618599,0,17.2979,auto,0.0,13.64,1712.3 -80,8,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,618603,0,10.1527,auto,0.0,18.59,2334.81 -80,8,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618603,0,10.8012,auto,0.0,0.55,72.71 -80,16,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618599,0,17.3674,auto,0.0,27.17,1712.82 -80,16,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,618603,0,10.3801,auto,0.0,36.37,2294.41 -80,16,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618601,0,11.1528,auto,0.0,1.06,74.74 -80,32,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618591,0,18.8286,auto,0.0,50.12,1593.49 -80,32,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,618604,0,12.1128,auto,0.0,62.33,1984.63 -80,32,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618603,0,11.4643,auto,0.0,2.06,81.1 -80,48,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618578,0,21.9233,auto,0.0,64.57,1380.23 -80,48,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,618592,0,13.6643,auto,0.0,82.88,1775.62 -80,48,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618603,0,11.7001,auto,0.0,3.02,87.7 -80,64,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618571,0,22.6181,auto,0.0,83.45,1349.15 -80,64,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,618592,0,14.0454,auto,0.0,107.5,1743.34 -80,64,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618593,0,11.8327,auto,0.0,3.99,94.85 -80,80,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618571,0,22.5254,auto,0.0,104.74,1366.07 -80,80,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,618590,0,15.9991,auto,0.0,117.97,1544.41 -80,80,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618593,0,11.6769,auto,0.0,5.05,104.36 -80,96,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618571,0,22.5065,auto,0.0,125.79,1378.59 -80,96,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,618590,0,16.6728,auto,0.0,135.85,1495.39 -80,96,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618593,0,11.8916,auto,0.0,5.95,110.57 -80,112,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618570,0,24.6222,auto,0.0,134.15,1270.53 -80,112,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,618590,0,17.3106,auto,0.0,152.65,1453.19 -80,112,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618593,0,11.9001,auto,0.0,6.94,118.58 -80,128,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618570,0,24.8097,auto,0.0,152.15,1271.24 -80,128,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,618590,0,17.9002,auto,0.0,168.71,1417.8 -80,128,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,triton,618593,0,11.5906,auto,0.0,8.14,130.05 \ No newline at end of file +256,1,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,13,4.5547,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0234,0.16,163.19 +256,2,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,14,4.5902,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0195,0.32,163.24 +256,4,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,14,4.9463,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0117,0.6,153.92 +256,8,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,9,4.6722,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0137,1.26,168.1 +256,16,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,13,5.0352,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0176,2.34,165.54 +256,32,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,14,5.0532,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0256,4.67,184.0 +256,48,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,11,4.9391,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0192,7.17,207.74 +256,64,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,13,5.3586,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0211,8.81,209.44 +256,80,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,7,14,5.6366,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0209,10.46,216.19 +256,96,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,9,5.6999,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0149,12.42,230.67 +256,112,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,9,5.9801,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0172,13.81,235.96 +256,128,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,9,5.5627,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0175,16.97,270.97 +256,256,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,7.3707,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0129,25.61,308.98 +256,1,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.6921,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0117,1.52,1522.53 +256,2,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.7094,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0109,3.04,1520.93 +256,4,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.7085,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0114,6.08,1523.32 +256,8,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440911,0,9.8686,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,11.95,1503.01 +256,16,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,10.0588,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0098,23.46,1483.25 +256,32,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,10.1087,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0099,46.68,1493.15 +256,48,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,10.5885,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0073,66.85,1441.93 +256,64,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,10.6945,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0074,88.24,1443.91 +256,128,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,11.7291,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0074,160.92,1375.91 +256,256,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,15.4587,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0041,244.19,1134.05 +256,1,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440305,0,6.7208,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC3_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,1.76,1756.69 +256,1,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440041,0,8.6117,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC2_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,2.74,2741.26 +256,2,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439633,0,6.2546,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB2_NTC2_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,3.77,1889.2 +256,2,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439633,0,8.4128,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB2_NTC2_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,5.61,2807.73 +256,4,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439841,0,6.848,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA6_NTB1_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,6.89,1728.37 +256,4,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439841,0,8.4291,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA6_NTB1_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,11.2,2805.61 +256,8,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440305,0,6.4007,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC3_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,14.74,1855.32 +256,8,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441420,0,9.0233,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,20.92,2627.04 +256,16,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439841,0,6.9819,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA6_NTB1_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,27.03,1712.17 +256,16,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440041,0,9.3274,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC2_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,40.47,2553.36 +256,32,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439978,0,7.7251,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB2_NTC1_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,48.87,1567.86 +256,32,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440288,0,10.8176,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2,0.0,69.79,2222.25 +256,48,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440042,0,8.6938,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x48x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA6_NTB1_NTC0_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,65.13,1411.3 +256,48,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439773,0,11.9644,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA7_NTB1_NTC3_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,94.65,2027.9 +256,64,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440174,0,9.6016,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,78.63,1294.29 +256,64,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,12.3488,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0136,122.27,1982.86 +256,80,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,13.0497,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0137,144.63,1893.46 +256,96,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,13.4284,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0137,168.67,1856.69 +256,112,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439981,0,15.3173,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA4_NTB0_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,172.51,1642.3 +256,128,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440893,0,11.3399,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,133.15,1151.51 +256,128,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439981,0,15.1696,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA4_NTB0_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,199.08,1673.0 +256,256,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439779,0,12.4644,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,242.28,1148.84 +256,1,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440307,0,10.3932,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1,0.0,2.84,2839.09 +256,2,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440307,0,10.9392,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1,0.0,5.39,2698.84 +256,4,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440307,0,10.4556,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1,0.0,11.28,2826.73 +256,8,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440307,0,11.0667,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1,0.0,21.32,2676.43 +256,16,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440307,0,10.9761,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1,0.0,42.99,2710.18 +256,32,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,12.2181,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0077,77.24,2455.64 +256,48,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,3,12.9274,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0076,109.5,2340.7 +256,64,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,13.3072,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0077,141.84,2293.13 +256,80,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,7,3,13.9214,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0076,169.47,2210.35 +256,96,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,8,3,14.4736,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0075,195.61,2143.71 +256,112,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439917,0,17.0876,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA1_NTB1_NTC7_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,193.3,1830.75 +256,128,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441443,0,16.5912,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,227.52,1900.96 diff --git a/aiter/configs/model_configs/kimik2_bf16_tuned_gemm.csv b/aiter/configs/model_configs/kimik2_bf16_tuned_gemm.csv index ede474d05b..81ec422d62 100644 --- a/aiter/configs/model_configs/kimik2_bf16_tuned_gemm.csv +++ b/aiter/configs/model_configs/kimik2_bf16_tuned_gemm.csv @@ -1,422 +1,126 @@ cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw -256,1,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,39,16,9.3501,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,3.24,3240.19 -256,2,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,39,16,9.3766,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,6.46,3233.03 -256,4,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,39,16,9.7801,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,12.38,3103.42 -256,8,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,217,8,10.621,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,22.81,2864.71 -256,16,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,217,8,11.4248,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,42.4,2676.17 -256,24,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,217,8,12.3343,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,58.91,2490.87 -256,32,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,217,8,13.276,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,72.98,2325.36 -256,40,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,7,14.3139,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0194,84.61,2167.13 -256,48,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.6708,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.018,99.06,2124.53 -256,56,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,15.0293,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0176,112.82,2083.73 -256,64,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.6227,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.018,132.52,2151.82 -256,72,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,7,15.0844,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0198,144.52,2095.8 -256,80,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,6,15.8189,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0175,153.12,2007.88 -256,88,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,7,16.1244,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.02,165.24,1979.04 -256,96,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,6,16.1623,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0181,179.84,1983.59 -256,112,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,19.5749,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0104,173.24,1652.95 -256,128,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,19.9118,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0105,194.64,1639.9 -256,136,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,3,20.2326,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0106,203.52,1621.23 -256,144,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,3,20.2948,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0106,214.83,1623.58 -256,256,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,28.5227,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.006,271.75,1228.11 -256,360,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,43.3357,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,251.52,852.86 -256,368,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,44.0418,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,252.99,842.56 -256,376,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,43.5006,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,261.71,856.45 -256,384,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,44.2105,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,262.98,846.06 -256,1,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,37,8,6.6176,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,1.43,1427.47 -256,2,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,37,8,6.2737,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,3.01,1507.18 -256,4,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,37,8,6.3438,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,5.95,1493.43 -256,8,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,215,4,6.6059,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,11.43,1439.75 -256,16,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,215,4,6.7587,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,22.34,1418.12 -256,24,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,215,4,7.3113,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,30.98,1321.03 -256,32,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,215,4,7.9776,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,37.85,1219.92 -256,48,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,3,9.043,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0058,50.09,1092.51 -256,64,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,2,9.2107,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0027,65.57,1088.63 -256,80,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,9.3113,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0076,81.08,1092.7 -256,96,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,9.668,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0076,93.71,1067.64 -256,112,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,2,10.0235,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0028,105.45,1044.48 -256,128,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,10.3007,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0028,117.27,1030.69 -256,176,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,10.6186,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0028,156.42,1041.49 -256,184,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,11.3581,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0028,152.88,980.18 -256,192,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,11.3328,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0028,159.88,988.87 -256,256,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,12.3141,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,196.19,957.97 -256,1,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,4.4851,auto,0.0,0.94,937.22 -256,2,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,5.0614,auto,0.0,1.66,832.33 -256,24,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,4.4326,auto,0.0,22.71,996.14 -256,32,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,4.5855,auto,0.0,29.27,979.0 -256,40,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,5.5363,auto,0.0,30.3,824.19 -256,48,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,211,1,6.4199,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,31.36,722.24 -256,56,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,211,1,6.4325,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,36.51,732.28 -256,64,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,5.0343,auto,0.0,53.32,950.31 -256,80,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,211,1,5.9722,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,56.18,825.76 -256,88,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,211,1,6.0027,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,61.49,833.84 -256,96,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,211,1,6.2717,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,64.2,809.83 -256,104,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,211,1,6.3442,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,68.76,812.2 -256,112,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,asm,2,1,6.1142,_ZN5aiter37bf16gemm_fp32bf16_tn_32x64_pf3_splitkE,0.0,76.83,854.81 -256,120,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,211,1,6.4135,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,78.48,826.42 -256,128,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,5.7015,auto,0.0,94.16,942.55 -256,136,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,6.4072,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,89.03,850.24 -256,144,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,211,1,6.6452,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,90.89,830.88 -256,152,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,211,1,6.6893,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,95.31,836.44 -256,160,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,211,1,6.7174,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,99.9,843.91 -256,168,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,211,1,6.7783,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,103.96,847.21 -256,176,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,211,1,6.8942,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,107.08,843.66 -256,184,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,211,1,6.846,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,112.73,860.36 -256,192,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,211,1,6.9534,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,115.82,857.68 -256,200,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,7.149,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,117.34,844.53 -256,208,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,7.1306,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,122.35,857.04 -256,216,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,7.1338,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,127.0,867.0 -256,224,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,7.1669,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,131.09,873.27 -256,232,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,7.1636,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,135.84,883.97 -256,240,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,6.8718,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,146.49,932.24 -256,248,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,7.3464,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,141.59,882.04 -256,256,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,6.8851,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,155.95,951.85 -256,264,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,91,1,8.7723,flydsl_gemm2_abf16_wbf16_bf16_t64x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,126.23,755.48 -256,272,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,91,1,8.7224,flydsl_gemm2_abf16_wbf16_bf16_t64x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,130.8,768.26 -256,280,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,7.2951,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,160.99,928.68 -256,288,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,7.4172,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,162.86,923.33 -256,320,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,91,1,8.8313,flydsl_gemm2_abf16_wbf16_bf16_t64x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,151.98,808.88 -256,328,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,7.7361,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,177.83,932.92 -256,344,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,7.5668,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,190.68,973.28 -256,352,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,7.7114,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,191.46,964.59 -256,360,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,7.6848,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,196.49,977.52 -256,368,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,91,1,9.0153,flydsl_gemm2_abf16_wbf16_bf16_t64x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,171.21,841.43 -256,376,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,7.7297,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,204.03,990.92 -256,384,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,7.7945,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,206.64,992.15 -256,392,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,121,1,9.4617,flydsl_gemm2_abf16_wbf16_bf16_t96x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,173.77,825.12 -256,400,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,8.0566,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,208.24,978.16 -256,424,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,8.1302,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,218.74,996.52 -256,448,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,8.5161,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,220.65,977.33 -256,456,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,121,1,9.6885,flydsl_gemm2_abf16_wbf16_bf16_t96x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,197.41,866.67 -256,488,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,121,1,9.6501,flydsl_gemm2_abf16_wbf16_bf16_t96x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,212.1,900.68 -256,496,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,8.7233,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,238.49,1004.84 -256,1,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,47,8,9.1452,flydsl_gemm2_abf16_wbf16_bf16_t32x128x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,3.21,3212.46 -256,2,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,47,8,8.7896,flydsl_gemm2_abf16_wbf16_bf16_t32x128x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,6.68,3344.52 -256,4,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,215,4,8.961,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,13.11,3284.66 -256,8,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,215,4,9.4235,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,24.92,3131.27 -256,16,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,10.6246,auto,0.0,44.21,2791.17 -256,32,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,11.3815,auto,0.0,82.55,2631.46 -256,48,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,2,13.0225,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0034,108.22,2322.51 -256,64,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,11.9472,auto,0.0,157.28,2556.23 -256,80,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,2,14.0021,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0034,167.75,2202.15 -256,96,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,14.2839,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0034,197.33,2179.35 -256,112,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,16.1354,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,203.8,1947.55 -256,128,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,15.4031,auto,0.0,243.98,2059.29 -256,200,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,21.0906,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,278.42,1566.89 -256,208,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,21.1448,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,288.81,1569.84 -256,216,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,21.2766,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,298.06,1567.05 -256,224,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,21.381,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,307.59,1566.29 -256,232,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,21.7124,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,313.72,1549.18 -256,240,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,21.7644,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,323.76,1552.25 -256,248,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,21.782,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,334.28,1557.76 -256,256,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,22.1506,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,339.32,1538.5 -256,1,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,39,16,8.2673,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,0.67,667.7 -256,2,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,39,16,8.4853,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,1.3,652.33 -256,4,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,39,16,8.3665,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,2.63,665.2 -256,8,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,39,16,8.6306,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,5.1,651.85 -256,16,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,39,16,8.6867,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,10.14,661.55 -256,24,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,39,16,8.7444,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,15.11,671.0 -256,32,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,39,16,8.8861,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,19.82,673.9 -256,40,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,39,16,9.4432,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,23.32,646.94 -256,48,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,217,8,9.8912,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,26.71,629.85 -256,56,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,217,8,9.8987,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,31.14,641.58 -256,64,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,217,8,10.11,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,34.85,640.12 -256,72,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,217,8,10.0269,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,39.53,657.48 -256,80,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,277,8,10.8589,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,40.56,618.23 -256,88,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,217,8,10.9286,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,44.33,625.35 -256,96,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,277,8,11.4666,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,46.09,606.55 -256,104,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,217,8,10.8052,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,52.99,654.85 -256,112,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,217,8,10.9637,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,56.24,656.41 -256,120,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,217,8,10.9485,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,60.34,668.36 -256,136,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,8,12.5129,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0225,59.83,604.11 -256,168,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,10,12.8567,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0248,71.93,625.55 -256,176,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,8,12.9456,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0218,74.84,630.59 -256,184,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,8,12.4417,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0218,81.41,665.84 -256,192,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,9,13.1416,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.023,80.43,639.57 -256,200,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,277,8,13.3939,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,82.2,636.55 -256,208,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,277,8,13.6897,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,83.64,631.62 -256,216,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,277,8,14.0595,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,84.58,623.6 -256,224,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,7,13.7138,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0206,89.92,648.13 -256,232,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,13.7502,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0185,92.88,655.2 -256,240,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,13.205,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0184,100.05,691.4 -256,248,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,13.0434,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0188,104.67,709.23 -256,256,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,13.9448,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0188,101.06,672.05 -256,264,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,13.997,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0187,103.83,678.18 -256,272,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.0363,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0186,106.68,684.89 -256,280,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,7,14.0317,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0203,109.85,693.73 -256,288,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,7,14.1267,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.02,112.23,697.61 -256,296,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.0558,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0187,115.93,709.73 -256,304,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,7,14.1491,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0193,118.28,713.59 -256,312,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.0362,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0186,122.37,727.94 -256,320,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,13.5745,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.019,129.77,761.6 -256,328,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.1857,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0159,127.29,737.3 -256,336,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,13.618,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0189,135.83,776.91 -256,344,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.3843,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0185,131.65,743.92 -256,352,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.1549,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0187,136.9,764.51 -256,360,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.5417,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0161,136.28,752.49 -256,368,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,6,14.5374,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0185,139.35,761.02 -256,376,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,6,14.6245,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0186,141.54,764.75 -256,384,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.5723,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0188,145.06,775.78 -256,392,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.6161,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,147.64,781.73 -256,400,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.3612,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,153.33,804.02 -256,408,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.7061,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,152.73,793.38 -256,416,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.7836,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,154.91,797.39 -256,424,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.7904,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0164,157.81,805.19 -256,432,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.9919,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,158.63,802.43 -256,440,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.8858,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,162.72,816.27 -256,448,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.0895,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0161,163.44,813.26 -256,456,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.2088,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,165.06,814.82 -256,464,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.177,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,168.3,824.49 -256,472,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.3007,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,169.82,825.72 -256,480,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.346,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0161,172.19,831.16 -256,488,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.5019,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0163,173.3,830.59 -256,496,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.4961,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,176.21,838.7 -256,504,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.5376,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0162,178.57,844.24 -256,512,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.6923,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0161,179.61,843.62 -256,1,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,39,16,9.3379,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,1.57,1573.86 -256,2,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,39,16,8.8497,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,3.32,1662.53 -256,4,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,39,16,9.3436,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,6.28,1578.15 -256,8,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,39,16,8.9677,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,13.1,1651.61 -256,16,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,39,16,9.005,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,26.08,1659.33 -256,24,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,217,8,10.0591,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,35.02,1498.47 -256,32,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,217,8,10.3083,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,45.57,1474.96 -256,40,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,217,8,10.9422,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,53.66,1401.49 -256,72,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,13.7492,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0174,76.87,1153.5 -256,80,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,13.5885,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0179,86.43,1176.79 -256,88,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,5,13.9756,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0162,92.44,1153.57 -256,96,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,13.8999,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0181,101.39,1169.28 -256,104,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.1683,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0157,107.76,1156.38 -256,112,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,7,14.2858,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0203,115.09,1156.05 -256,120,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.4044,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0182,122.3,1155.63 -256,128,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,7,14.3206,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0202,131.21,1171.54 -256,136,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,7,14.5367,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0202,137.34,1163.14 -256,144,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.7263,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0162,143.55,1157.07 -256,152,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.8838,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,149.92,1153.63 -256,160,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.8739,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0159,157.91,1163.21 -256,168,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.2068,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.016,162.18,1146.37 -256,176,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.105,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,171.05,1162.77 -256,184,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.2228,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0161,177.44,1162.38 -256,192,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.6664,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,192.18,1215.42 -256,200,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.3582,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0161,191.17,1169.2 -256,208,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.6999,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0163,194.49,1152.11 -256,216,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.6515,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0161,202.59,1164.04 -256,224,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.7154,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0162,209.24,1167.65 -256,232,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.0842,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0162,211.75,1149.03 -256,240,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.9649,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0162,220.69,1165.82 -256,248,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.3657,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0162,222.46,1145.28 -256,256,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.2678,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0162,231.01,1160.23 -256,264,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.3531,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0164,236.99,1162.19 -256,272,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.465,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0162,242.51,1162.25 -256,280,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.6496,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0164,246.88,1157.24 -256,288,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,17.1636,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0162,246.33,1130.22 -256,296,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,17.3667,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0138,250.21,1124.55 -256,304,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,17.623,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0137,253.23,1115.63 -256,312,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,17.8372,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0138,256.78,1109.58 -256,320,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,17.7354,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0138,264.87,1123.34 -256,328,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.226,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0137,264.19,1100.3 -256,336,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.1608,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0139,271.6,1111.46 -256,344,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.5378,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0137,272.41,1095.93 -256,352,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.2392,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0137,283.31,1121.06 -256,360,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.675,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0137,282.99,1101.92 -256,368,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.8126,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0138,287.16,1100.82 -256,376,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.9182,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0139,291.77,1101.61 -256,384,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.9243,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0138,297.88,1108.18 -256,392,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,3,20.4007,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0105,282.08,1034.41 -256,400,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,3,20.3122,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0106,289.09,1045.37 -256,2,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,4.9943,auto,0.0,2.94,1475.83 -256,4,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,4.9934,auto,0.0,5.88,1482.25 -256,8,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,5.5511,auto,0.0,10.58,1344.4 -256,16,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,4.7742,auto,0.0,24.6,1588.91 -256,24,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,5.4124,auto,0.0,32.55,1424.26 -256,32,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,4.9819,auto,0.0,47.15,1572.0 -256,40,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,6.0779,auto,0.0,48.31,1308.75 -256,48,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,211,1,6.8419,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,51.49,1180.56 -256,56,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,211,1,6.4529,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,63.7,1270.77 -256,64,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,6.0235,auto,0.0,77.99,1381.77 -256,72,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,6.6035,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,80.03,1279.01 -256,80,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,211,1,6.8698,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,85.48,1247.32 -256,96,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,211,1,7.3467,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,95.91,1199.81 -256,104,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,7.4742,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,102.13,1195.77 -256,112,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,6.9158,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,118.87,1310.09 -256,128,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,6.8915,auto,0.0,136.33,1350.38 -256,136,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,7.6068,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,131.23,1239.54 -256,144,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,8.197,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,128.94,1165.28 -256,152,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,8.213,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,135.84,1177.99 -256,160,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,7.7499,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,151.54,1264.22 -256,168,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,7.951,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,155.09,1247.7 -256,192,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,7.9723,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,176.77,1290.61 -256,208,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,8.4509,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,180.66,1246.6 -256,224,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,8.5706,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,191.84,1257.86 -256,240,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,8.7346,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,201.68,1262.38 -256,248,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,331,1,8.8966,flydsl_gemm2_abf16_wbf16_bf16_t128x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,204.61,1253.21 -256,264,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,121,1,9.9865,flydsl_gemm2_abf16_wbf16_bf16_t96x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,194.04,1141.04 -256,272,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,121,1,9.9672,flydsl_gemm2_abf16_wbf16_bf16_t96x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,200.31,1155.58 -256,280,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,121,1,10.2274,flydsl_gemm2_abf16_wbf16_bf16_t96x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,200.95,1138.21 -256,288,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,9.0197,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,234.37,1304.22 -256,296,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,121,1,10.3616,flydsl_gemm2_abf16_wbf16_bf16_t96x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,209.68,1147.17 -256,304,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,9.6502,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,231.22,1244.47 -256,312,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,121,1,10.7268,flydsl_gemm2_abf16_wbf16_bf16_t96x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,213.49,1131.04 -256,320,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,9.3945,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,250.02,1304.51 -256,328,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,9.5916,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,251.0,1290.52 -256,336,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,121,1,10.4197,flydsl_gemm2_abf16_wbf16_bf16_t96x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,236.69,1199.75 -256,344,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,121,1,10.943,flydsl_gemm2_abf16_wbf16_bf16_t96x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,230.74,1153.6 -256,352,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,9.997,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,258.45,1275.05 -256,360,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,121,1,10.6719,flydsl_gemm2_abf16_wbf16_bf16_t96x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,247.6,1205.94 -256,368,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,10.1456,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,266.24,1280.6 -256,376,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,10.2371,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,269.59,1281.17 -256,384,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,10.0111,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,281.55,1322.36 -256,392,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,151,1,11.2829,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,255.01,1184.2 -256,400,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,151,1,11.445,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,256.53,1178.16 -256,408,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,151,1,11.167,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,268.18,1218.49 -256,416,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,151,1,11.2953,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,270.33,1215.53 -256,424,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,151,1,11.3604,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,273.95,1219.39 -256,432,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,151,1,11.8201,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,268.26,1182.35 -256,440,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,151,1,11.8181,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,273.28,1192.95 -256,448,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,151,1,11.7373,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,280.16,1211.64 -256,456,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,151,1,11.7334,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,285.26,1222.51 -256,464,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,151,1,11.7426,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,290.04,1232.02 -256,472,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,151,1,11.5632,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,299.61,1261.75 -256,480,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,151,1,12.0183,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,293.15,1224.2 -256,488,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,151,1,11.9779,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,299.05,1238.59 -256,496,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,151,1,12.0544,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,302.02,1240.92 -256,504,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,151,1,11.7306,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,315.36,1285.66 -256,512,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,151,1,11.9822,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,313.64,1268.91 -256,144,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,10,12.5466,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0241,63.18,612.12 -256,152,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,8,12.6565,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0222,66.11,616.35 -256,160,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,9,12.8035,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0224,68.79,618.71 -256,456,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,3,21.5821,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0107,310.17,1026.37 -256,464,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,3,21.6839,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0107,314.13,1027.59 -256,472,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,3,21.9029,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0107,316.35,1023.3 -256,480,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,3,21.9079,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0108,321.64,1029.05 -256,488,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,25.5097,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0061,280.83,888.9 -256,496,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,25.6402,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0061,283.98,889.48 -256,504,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,25.7917,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0061,286.87,889.34 -256,88,1536,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,8.1679,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0058,50.84,643.89 -256,96,1536,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,8.2325,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0058,55.02,644.81 -256,168,1536,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,8.8181,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0058,89.9,652.16 -256,176,1536,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,9.0191,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0057,92.08,643.07 -256,184,1536,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,9.0376,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0057,96.07,647.19 -256,192,1536,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,8.9939,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0058,100.73,655.8 -256,200,1536,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,3,9.2267,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0057,102.28,644.58 -256,208,1536,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,3,9.3175,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0058,105.34,643.58 -256,216,1536,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,3,9.2818,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0057,109.81,651.35 -256,224,1536,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,3,9.5115,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0057,111.12,640.79 -256,232,1536,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,3,9.6187,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0058,113.81,638.76 -256,240,1536,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,3,9.5099,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0057,119.08,651.23 -256,328,1536,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,2,10.3411,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0028,149.66,651.17 -256,120,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,19.5331,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0106,186.01,1664.09 -256,152,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,3,20.6393,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0107,222.98,1603.68 -256,160,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,3,20.8121,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0106,232.77,1597.5 -256,168,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,3,21.2508,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0106,239.36,1571.5 -256,176,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,3,21.4824,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0106,248.06,1561.47 -256,184,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,3,21.6938,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0107,256.81,1553.1 -256,192,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,3,21.8515,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0106,266.04,1548.69 -256,200,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,2,26.2125,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.006,231.02,1296.7 -256,208,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,2,26.5685,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.006,237.04,1284.91 -256,216,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,2,26.6477,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.006,245.42,1286.66 -256,224,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,2,26.8323,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.006,252.76,1283.34 -256,232,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,2,27.1967,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.006,258.28,1271.61 -256,240,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,2,27.3724,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.006,265.47,1268.87 -256,264,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,28.4038,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.006,281.42,1238.48 -256,272,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,28.5123,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0061,288.84,1238.97 -256,280,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,28.8775,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0061,293.58,1228.45 -256,288,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,28.9429,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0061,301.28,1230.8 -256,104,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,2,9.9897,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0028,98.25,1040.64 -256,136,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,2,10.6637,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0028,120.36,1002.52 -256,144,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,2,10.5831,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0027,128.41,1017.12 -256,152,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,2,11.0581,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0028,129.72,980.1 -256,160,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,10.9111,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0028,138.39,1000.06 -256,168,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,11.0756,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0028,143.15,991.86 -256,200,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,12.2001,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,154.71,924.61 -256,208,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,12.0106,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,163.43,945.34 -256,216,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,11.7401,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,173.63,973.4 -256,224,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,12.2504,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,172.56,938.87 -256,232,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,12.1948,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,179.54,949.2 -256,240,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,11.8776,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,190.69,980.76 -256,416,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,13.9463,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,281.5,951.58 -256,424,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,16.3511,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,244.72,816.14 -256,432,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,14.0366,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,290.45,955.96 -256,440,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,13.6186,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,304.9,990.72 -256,456,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,13.8104,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,311.6,987.64 -256,4,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,4.5506,auto,0.0,3.69,929.8 -256,8,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,4.5407,auto,0.0,7.39,939.95 -256,16,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,4.3534,auto,0.0,15.42,997.33 -256,72,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,211,1,6.2647,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,48.21,775.44 -256,296,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,91,1,8.868,flydsl_gemm2_abf16_wbf16_bf16_t64x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,140.0,780.59 -256,304,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,7.5545,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,168.78,926.06 -256,312,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,7.9431,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,164.75,890.04 -256,336,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,7.5014,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,187.87,971.94 -256,512,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,8.2983,auto,0.0,258.79,1074.06 -256,1,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,5.3382,auto,0.0,1.38,1377.88 -256,1,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,227,8,6.4796,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,2.27,2268.11 -256,2,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,237,8,6.8986,flydsl_gemm2_abf16_wbf16_bf16_t32x256x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,4.26,2132.73 -256,16,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,7.4158,auto,0.0,31.67,2014.92 -256,24,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,7.5691,auto,0.0,46.55,1991.42 -256,24,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,11.3968,auto,0.0,61.83,2614.99 -256,32,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,6.5764,auto,0.0,71.43,2311.96 -256,40,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,7.9803,auto,0.0,73.58,1921.66 -256,40,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,12.1517,auto,0.0,96.65,2476.81 -256,48,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,8.5938,auto,0.0,81.99,1799.73 -256,56,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,8.8118,auto,0.0,93.29,1770.08 -256,56,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,12.6431,auto,0.0,130.04,2403.87 -256,64,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,8.0737,auto,0.0,116.37,1948.13 -256,72,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,11.3646,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,93.01,1395.54 -256,80,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,11.4417,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,102.64,1397.59 -256,88,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,211,1,7.2019,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,89.69,1206.86 -256,88,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,11.327,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,114.05,1423.31 -256,96,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,11.5201,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,122.33,1410.83 -256,104,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,15.4702,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,197.38,2021.76 -256,120,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,6.9667,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,126.43,1318.16 -256,120,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,15.9439,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,220.98,1980.19 -256,136,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,17.2568,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,231.39,1846.63 -256,144,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,10.3748,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,203.76,1642.38 -256,144,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,17.4208,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,242.69,1837.71 -256,152,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,16.9128,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,263.87,1901.62 -256,160,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,17.185,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,273.36,1880.08 -256,168,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,10.5421,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,233.94,1653.62 -256,168,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,17.6956,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,278.74,1834.17 -256,176,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,8.2681,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,156.25,1214.72 -256,176,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,11.5344,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,224.0,1522.72 -256,176,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,17.4199,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,296.64,1871.66 -256,184,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,11.3243,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,238.53,1562.54 -256,184,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,17.2726,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,312.76,1896.16 -256,192,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,11.2561,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,250.4,1583.66 -256,192,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,17.9259,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,314.47,1835.28 -256,256,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,8.7892,auto,0.0,213.79,1282.5 -256,1,1536,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,36,8,6.0032,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,0.79,787.04 -256,2,1536,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,36,8,5.6343,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,1.67,839.66 -256,32,1536,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,215,4,6.5514,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,23.05,750.25 -256,40,1536,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,215,4,6.9801,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,27.04,711.21 -256,48,1536,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,215,4,7.3909,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,30.64,678.33 -256,56,1536,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,215,4,7.7636,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,34.04,652.1 -256,152,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,11.208,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,199.09,1531.98 -256,160,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,12.0205,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,195.4,1439.33 -256,264,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,13.0316,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,297.39,1458.41 -256,272,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,13.1377,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,303.93,1456.61 -256,296,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,13.164,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,330.09,1483.57 -256,304,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,13.4057,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,332.9,1466.61 -256,312,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,13.5965,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,336.87,1455.66 -256,320,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,13.4864,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,348.32,1477.26 -256,328,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,13.5958,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,354.16,1475.01 -256,336,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,13.7168,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,359.6,1471.56 -256,344,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,13.8203,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,365.4,1470.02 -256,352,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,14.1414,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,365.41,1445.91 -256,360,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,14.1018,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,374.76,1459.27 -256,368,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,14.6026,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,369.95,1418.2 -256,376,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,14.543,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,379.54,1433.02 -256,408,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,8.3591,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,204.72,951.59 -256,416,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,8.3326,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,209.4,963.47 -256,464,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,121,1,9.6804,flydsl_gemm2_abf16_wbf16_bf16_t96x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,201.04,875.02 -256,472,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,121,1,9.6761,flydsl_gemm2_abf16_wbf16_bf16_t96x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,204.6,883.03 -256,480,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,8.7323,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,230.55,986.91 -256,504,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,8.4812,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,249.25,1042.21 -256,128,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,217,8,11.5909,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,60.79,641.74 -256,200,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,301,1,8.505,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,172.6,1224.22 -256,232,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,8.6378,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,197.14,1262.31 -256,24,1536,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,215,4,6.8803,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,16.46,707.25 -256,432,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,8.1125,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,223.35,1007.78 -256,440,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,121,1,9.56,flydsl_gemm2_abf16_wbf16_bf16_t96x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,193.04,862.9 -256,4,7168,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,45,4,7.3877,flydsl_gemm2_abf16_wbf16_bf16_t32x128x64_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,7.95,1995.96 -256,184,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,271,1,7.7607,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,174.03,1309.97 -256,216,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,91,1,9.8614,flydsl_gemm2_abf16_wbf16_bf16_t64x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,160.77,1080.76 -256,48,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,277,8,12.0728,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,58.37,1281.1 +256,8,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.8407,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.027,5.62,717.52 +256,16,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.0143,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0309,10.99,717.05 +256,24,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.1591,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0341,16.19,719.14 +256,32,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.1403,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0317,21.64,735.64 +256,40,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.0707,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0302,27.28,756.96 +256,48,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.569,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0293,30.84,727.04 +256,56,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,9.5081,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0316,32.42,667.94 +256,64,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,9.516,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0292,37.02,680.08 +256,72,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,9.0712,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0314,43.69,726.75 +256,80,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.4329,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0217,46.69,711.69 +256,88,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,14,10.5793,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0287,45.79,646.0 +256,96,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.051,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0228,52.58,691.97 +256,104,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.3236,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0215,55.46,685.4 +256,112,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.7718,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0218,63.1,736.47 +256,120,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.9195,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0228,66.6,737.69 +256,128,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.7789,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0216,65.37,690.08 +256,136,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,11.1736,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0221,67.0,676.52 +256,144,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441425,0,11.3039,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,70.13,679.41 +256,168,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,177,8,13.0017,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,71.13,618.57 +256,176,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,10,12.9091,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0241,75.05,632.37 +256,184,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,10,13.0368,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0241,77.7,635.44 +256,192,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,9,13.1531,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0226,80.36,639.01 +256,200,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,12.8957,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0226,85.38,661.14 +256,208,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,287,8,13.2158,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0218,86.64,654.27 +256,216,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,7,13.4535,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0201,88.38,651.69 +256,224,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,7,13.554,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0203,90.98,655.77 +256,232,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.3604,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0219,95.59,674.32 +256,240,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.4529,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0216,98.21,678.66 +256,248,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.7249,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,99.47,674.02 +256,256,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,7,13.6667,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0205,103.12,685.73 +256,264,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.0371,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.019,103.53,676.24 +256,272,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,13.7697,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0183,108.74,698.15 +256,280,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.2137,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0186,108.45,684.84 +256,288,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440174,0,13.943,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,113.71,706.8 +256,296,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.254,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0186,114.32,699.86 +256,304,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.1592,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0187,118.19,713.08 +256,312,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,13.8787,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0186,123.76,736.2 +256,320,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.0558,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0184,125.33,735.52 +256,328,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.2261,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0164,126.93,735.21 +256,336,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.6004,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0187,126.69,724.64 +256,344,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439768,0,14.7288,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB0_NTC4_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,128.57,726.52 +256,352,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.3162,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0188,135.35,755.9 +256,360,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.5766,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0163,135.96,750.69 +256,368,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.6534,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,138.25,755.0 +256,376,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.5889,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0162,141.88,766.62 +256,384,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.5123,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0184,145.66,778.99 +256,392,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,6,14.9855,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0185,144.0,762.46 +256,400,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.6799,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,150.0,786.56 +256,408,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.6347,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0161,153.47,797.25 +256,416,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.862,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0164,154.09,793.18 +256,424,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,6,15.3921,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0188,151.64,773.72 +256,432,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.9275,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,159.31,805.89 +256,440,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.957,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0161,161.94,812.38 +256,448,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.266,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0163,161.55,803.85 +256,456,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.4627,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0162,162.34,801.44 +256,464,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.3917,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,165.96,812.99 +256,472,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.3552,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0163,169.22,822.79 +256,480,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.4679,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,170.83,824.61 +256,488,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.5389,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0162,172.89,828.62 +256,496,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.6968,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,173.95,827.98 +256,504,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.6448,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0162,177.35,838.45 +256,512,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.667,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0161,179.91,844.98 +256,1,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.3967,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0283,1.75,1750.26 +256,2,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.0166,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0298,3.66,1835.3 +256,8,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.0761,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0295,14.54,1833.95 +256,16,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.7294,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0303,26.91,1711.71 +256,24,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.8754,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0217,35.68,1526.35 +256,40,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.3428,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0209,56.77,1482.71 +256,48,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,11.8915,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0205,59.26,1300.63 +256,56,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.9143,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0217,75.32,1429.09 +256,64,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440911,0,12.1182,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,77.53,1297.94 +256,72,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,13.2099,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0214,80.01,1200.59 +256,80,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,13.2676,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0213,88.52,1205.25 +256,88,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,13.6796,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0214,94.44,1178.53 +256,96,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.0885,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.018,100.03,1153.63 +256,104,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.1454,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0177,107.93,1158.26 +256,112,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.3991,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0161,114.19,1146.95 +256,120,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.5062,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0159,121.44,1147.52 +256,128,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.5238,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0163,129.38,1155.15 +256,136,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.6787,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0163,136.01,1151.89 +256,144,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.633,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0161,144.46,1164.45 +256,152,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.802,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,150.75,1160.01 +256,160,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.0656,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,155.91,1148.41 +256,168,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.0442,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,163.93,1158.76 +256,176,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.2535,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,169.38,1151.45 +256,184,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.1892,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,177.83,1164.95 +256,192,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.3065,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,184.14,1164.59 +256,200,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.4285,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0163,190.3,1163.88 +256,208,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.5531,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0162,196.32,1162.98 +256,216,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.7414,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0162,201.44,1157.39 +256,224,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,16.0465,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0162,204.93,1143.56 +256,232,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,16.011,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0161,212.71,1154.28 +256,240,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.9839,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0163,220.42,1164.44 +256,248,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.2255,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0165,224.38,1155.18 +256,256,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.5219,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0163,227.46,1142.38 +256,264,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.3885,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0161,236.48,1159.68 +256,272,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.5055,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0162,241.92,1159.4 +256,280,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.8484,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0162,243.96,1143.59 +256,288,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.6915,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0165,253.29,1162.19 +256,296,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,17.5214,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0137,248.0,1114.62 +256,304,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,17.5882,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0139,253.73,1117.84 +256,312,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439939,0,17.6727,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA5_NTB1_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,259.17,1119.91 +256,320,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,17.8718,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0138,262.85,1114.77 +256,328,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.2496,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0137,263.84,1098.87 +256,336,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.1987,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0139,271.04,1109.15 +256,344,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439801,0,18.2568,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA7_NTB3_NTC2_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,276.61,1112.8 +256,376,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.8005,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0138,293.59,1108.5 +256,384,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.8832,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0138,298.53,1110.59 +256,4,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,9.3715,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0286,12.92,3238.74 +256,8,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,9.7582,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0288,24.82,3118.0 +256,16,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.7144,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0212,45.21,2853.6 +256,24,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,12.8635,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,56.49,2388.39 +256,32,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.0746,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0219,74.1,2361.19 +256,40,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,14.5861,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0217,83.03,2126.68 +256,48,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.8189,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0181,98.07,2103.29 +256,56,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440034,0,15.2733,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA1_NTB3_NTC3_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,111.01,2050.44 +256,72,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,7,15.5887,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0197,139.84,2028.0 +256,88,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,6,16.3983,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0179,162.48,1945.99 +256,360,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440054,0,7.306,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA2_NTB2_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,206.67,1028.2 +256,1,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,4.9485,auto,0.0,1.48,1486.39 +256,48,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439773,0,5.878,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA7_NTB1_NTC3_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,59.94,1374.16 +256,72,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,6.5814,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,80.3,1283.31 +256,80,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,6.6122,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,88.81,1295.91 +256,120,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,280,1,6.8966,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,127.72,1331.56 +256,128,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,6.8938,auto,0.0,136.29,1349.92 diff --git a/aiter/configs/model_configs/llama405B_bf16_tuned_gemm.csv b/aiter/configs/model_configs/llama405B_bf16_tuned_gemm.csv new file mode 100644 index 0000000000..96e4fc2d08 --- /dev/null +++ b/aiter/configs/model_configs/llama405B_bf16_tuned_gemm.csv @@ -0,0 +1,157 @@ +cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw +256,1,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,167,16,15.5965,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0343,4.84,4843.06 +256,16,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,167,16,17.691,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0337,68.28,4301.37 +256,32,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440300,0,20.6808,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA6_NTB3_NTC1_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1,0.0,116.82,3708.44 +256,64,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439712,0,22.8339,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC3_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,211.61,3411.14 +256,128,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439782,0,27.4051,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,352.62,2929.44 +256,256,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440199,0,35.505,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,544.36,2395.88 +256,512,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440417,0,50.9414,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA1_NTB2_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,758.81,1857.7 +256,1024,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440657,0,82.9033,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,932.53,1372.33 +256,2048,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440214,0,152.5136,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB1_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1013.8,996.92 +256,4096,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439533,0,255.1222,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB1_NTC7_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1212.12,896.0 +256,8192,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440614,0,490.0125,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1262.16,778.92 +256,16384,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440195,0,913.2465,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB3_NTC7_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1354.45,753.21 +256,32768,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440526,0,1647.3102,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC5_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1501.78,789.31 +256,1,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439912,0,28.5502,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA4_NTB3_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1,0.0,5.29,5290.22 +256,16,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,32.9676,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0344,73.28,4600.48 +256,32,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441403,0,32.4566,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,148.87,4693.6 +256,64,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441380,0,36.2393,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,266.66,4240.75 +256,128,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440128,0,41.3924,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA2_NTB2_NTC1_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,466.93,3777.72 +256,256,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441285,0,57.4974,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,672.29,2813.05 +256,512,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440587,0,86.7189,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,891.49,1989.08 +256,1024,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440436,0,153.3786,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB3_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1008.09,1264.76 +256,2048,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439533,0,244.429,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB1_NTC7_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1265.14,969.52 +256,4096,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440625,0,477.1746,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA3_NTB2_NTC1_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1296.12,676.82 +256,8192,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440614,0,908.5186,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1361.5,544.76 +256,16384,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440520,0,1679.2573,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1473.21,499.54 +256,32768,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440620,0,3238.9311,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1527.6,471.37 +256,1,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440308,0,50.3359,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1,0.0,6.0,6000.51 +256,16,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440308,0,50.8541,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1,0.0,95.01,5954.47 +256,32,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439915,0,53.3772,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA5_NTB2_NTC1_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,181.05,5688.35 +256,64,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440176,0,58.9864,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA3_NTB0_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,327.66,5175.2 +256,128,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440779,0,68.3547,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,565.5,4513.86 +256,256,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440842,0,94.4602,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT144x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,818.43,3335.77 +256,512,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440842,0,152.756,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT144x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,1012.19,2148.55 +256,1024,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439533,0,262.6958,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB1_NTC7_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1177.17,1349.16 +256,2048,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440282,0,494.0612,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1251.82,823.48 +256,4096,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439452,0,937.4531,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1319.48,545.85 +256,8192,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440624,0,1638.2233,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1510.11,440.37 +256,16384,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440534,0,3231.1473,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1531.28,353.08 +256,32768,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440454,0,6501.6217,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB1_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1522.02,304.5 +256,1,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440628,0,14.4737,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,4.64,4639.15 +256,1,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,26.6447,auto,0.0,5.04,5038.85 +256,1,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440628,0,43.1686,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,6.22,6219.44 +256,16,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439704,0,13.9792,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,76.81,4842.82 +256,16,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439843,0,27.698,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB2_NTC6_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,77.53,4869.42 +256,16,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440554,0,43.2882,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB1_NTC2_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,99.22,6219.29 +256,32,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,14.6606,auto,0.0,146.48,4657.96 +256,32,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,25.4147,auto,0.0,169.0,5332.68 +256,32,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439912,0,47.9929,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA4_NTB3_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1,0.0,178.98,5626.01 +256,64,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441387,0,17.5039,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,245.37,3968.72 +256,64,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439385,0,28.853,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC2_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,297.71,4742.63 +256,64,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440890,0,50.3817,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB0_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,340.99,5390.47 +256,128,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439853,0,19.9532,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB0_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,430.5,3599.8 +256,128,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439564,0,40.9366,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA3_NTB2_NTC7_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,419.67,3406.75 +256,128,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440858,0,60.7737,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,565.37,4520.49 +256,256,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440209,0,24.2433,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA7_NTB2_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,708.64,3157.41 +256,256,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440456,0,42.8134,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB1_NTC2_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,802.55,3379.86 +256,256,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440459,0,78.48,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA5_NTB3_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,875.63,3580.76 +256,512,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440203,0,34.9957,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB3_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,981.83,2456.97 +256,512,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441313,0,67.55,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1017.31,2297.4 +256,512,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441313,0,126.5963,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1085.65,2319.19 +256,1024,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440452,0,54.7768,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC4_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1254.54,1914.27 +256,1024,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440548,0,107.5718,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1277.65,1637.61 +256,1024,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,193.6001,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1419.82,1646.52 +256,2048,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439532,0,115.7963,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1186.9,1231.53 +256,2048,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440491,0,199.3694,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC5_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1378.74,1093.97 +256,2048,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440425,0,378.0526,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1454.18,976.32 +256,4096,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440528,0,212.4243,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1294.0,1026.74 +256,4096,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440575,0,374.6172,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC7_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1467.51,806.13 +256,4096,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439532,0,729.5152,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1507.18,643.94 +256,8192,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440578,0,390.9707,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1406.13,944.06 +256,8192,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,741.8084,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1482.2,633.27 +256,8192,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440483,0,1412.7149,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1556.59,475.03 +256,16384,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440507,0,785.7076,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC4_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1399.39,854.12 +256,16384,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440490,0,1462.6083,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC3_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1503.49,550.6 +256,16384,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439464,0,2787.2101,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1577.94,385.24 +256,32768,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440624,0,1524.068,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1442.86,836.62 +256,32768,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441460,0,2882.9014,Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950,0.0,1525.56,512.12 +256,32768,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440453,0,6064.912,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1450.32,309.82 +256,1,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440628,0,68.6404,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,6.35,6355.83 +256,16,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440291,0,70.933,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA6_NTB2_NTC3_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,98.39,6162.97 +256,32,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440891,0,71.8762,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC3_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,194.2,6095.32 +256,64,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440855,0,80.5537,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,346.57,5462.3 +256,128,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440551,0,95.4707,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA5_NTB2_NTC5_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,584.83,4648.65 +256,256,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440226,0,129.7321,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA4_NTB2_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,860.77,3479.57 +256,512,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440398,0,205.6306,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1086.11,2269.2 +256,1024,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,334.1007,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1336.95,1487.65 +256,2048,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440534,0,640.1636,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1395.51,871.41 +256,4096,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439532,0,1277.3354,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1398.78,531.95 +256,8192,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,2356.24,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1516.57,391.62 +256,16384,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,4603.5971,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1552.44,306.13 +256,32768,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,9181.1834,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1556.84,259.48 +256,1,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439704,0,35.2163,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,6.19,6194.57 +256,1,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440628,0,64.8462,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,6.73,6727.72 +256,1,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440146,0,129.1826,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1,0.0,6.75,6754.02 +256,16,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440907,0,35.6376,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,97.92,6140.74 +256,16,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440291,0,65.4229,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA6_NTB2_NTC3_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,106.68,6682.03 +256,16,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439876,0,130.1849,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1,0.0,107.22,6711.93 +256,32,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439637,0,37.104,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,188.1,5917.92 +256,32,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440891,0,67.9373,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC3_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,205.46,6448.71 +256,32,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440860,0,131.496,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,212.31,6655.47 +256,64,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440860,0,40.4189,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,345.35,5469.05 +256,64,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440860,0,71.7118,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,389.3,6135.79 +256,64,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440860,0,135.5425,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,411.93,6477.08 +256,128,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440068,0,47.8218,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA6_NTB2_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,583.78,4684.1 +256,128,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440209,0,87.6494,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA7_NTB2_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,637.02,5063.47 +256,128,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440459,0,157.008,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA5_NTB3_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,711.23,5626.63 +256,256,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440459,0,67.7717,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA5_NTB3_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,823.86,3392.28 +256,256,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439559,0,118.6429,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA7_NTB1_NTC0_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,941.22,3804.8 +256,256,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440767,0,210.7823,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1059.57,4243.41 +256,512,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440203,0,103.243,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB3_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1081.61,2341.05 +256,512,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440584,0,192.4251,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1160.65,2424.92 +256,512,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440226,0,344.0903,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA4_NTB2_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1298.14,2663.42 +256,1024,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,160.8002,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1388.92,1649.81 +256,1024,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,302.0495,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1478.82,1645.51 +256,1024,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,589.1786,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1516.27,1630.23 +256,2048,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,307.7937,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1451.22,1015.21 +256,2048,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440588,0,584.2325,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1529.11,954.83 +256,2048,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440587,0,1141.2861,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1565.52,918.77 +256,4096,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440508,0,587.1169,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1521.59,692.96 +256,4096,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440508,0,1130.843,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1579.98,600.86 +256,4096,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440588,0,2253.5103,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1585.71,543.48 +256,8192,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440624,0,1140.9894,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1565.93,522.0 +256,8192,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440624,0,2243.6419,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1592.68,411.27 +256,8192,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440624,0,4521.6701,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1580.57,348.78 +256,16384,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439467,0,2278.6755,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC3_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1568.2,427.04 +256,16384,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440588,0,4505.5017,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1586.24,312.79 +256,16384,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440624,0,9044.7089,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1580.33,252.27 +256,32768,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439467,0,4536.6066,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC3_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1575.37,380.91 +256,32768,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440624,0,9028.2301,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1583.22,263.88 +256,32768,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440588,0,18162.3989,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1573.98,203.22 +256,1,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440308,0,126.776,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1,0.0,6.88,6882.23 +256,16,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440308,0,128.632,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1,0.0,108.52,6792.96 +256,32,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439915,0,135.9529,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA5_NTB2_NTC1_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,205.35,6437.29 +256,64,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440860,0,141.6186,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,394.26,6199.19 +256,128,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440459,0,163.1593,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA5_NTB3_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,684.42,5414.5 +256,256,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439557,0,226.6243,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT14_4_MO40_NTn1_NTA0_NTB2_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,985.5,3946.78 +256,512,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,349.7922,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1276.98,2620.0 +256,1024,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,655.5621,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1362.73,1465.15 +256,2048,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1315.3294,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1358.37,797.2 +256,4096,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,2340.3623,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1526.86,523.31 +256,8192,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,4560.5649,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1567.09,345.8 +256,16384,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,9129.8796,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1565.59,249.92 +256,32768,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,18289.8411,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1563.02,201.81 +256,1,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440308,0,251.18,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1,0.0,6.95,6947.09 +256,16,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440308,0,251.5826,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1,0.0,110.97,6944.27 +256,32,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439912,0,258.6972,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA4_NTB3_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1,0.0,215.83,6761.91 +256,64,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440771,0,281.3671,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,396.88,6232.94 +256,128,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440779,0,286.8183,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,778.68,6145.55 +256,256,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440767,0,391.3087,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1141.49,4550.07 +256,512,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,695.1809,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1285.07,2612.46 +256,1024,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1321.0685,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1352.47,1428.72 +256,2048,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439532,0,2386.3828,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1497.42,850.68 +256,4096,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,4533.2258,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1576.54,510.73 +256,8192,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,9070.8882,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1575.77,318.13 +256,16384,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,18214.2616,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1569.5,221.06 +256,32768,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,36485.4726,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1567.05,172.9 diff --git a/aiter/configs/model_configs/llama70B_bf16_tuned_gemm.csv b/aiter/configs/model_configs/llama70B_bf16_tuned_gemm.csv new file mode 100644 index 0000000000..fe2c088d96 --- /dev/null +++ b/aiter/configs/model_configs/llama70B_bf16_tuned_gemm.csv @@ -0,0 +1,157 @@ +cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw +256,64,192,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,3.7986,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0124,6.63,144.49 +256,1,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,167,16,7.3327,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0305,2.86,2862.58 +256,16,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,105,16,8.7513,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0287,38.34,2431.02 +256,32,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,177,8,12.0725,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0214,55.59,1787.35 +256,64,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.6099,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0216,98.62,1629.99 +256,128,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,15.0191,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.018,178.73,1557.77 +256,256,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439930,0,17.7489,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA3_NTB1_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,302.48,1454.8 +256,512,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440320,0,23.6725,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_12_MO40_NTn1_NTA0_NTB0_NTC1_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,453.58,1295.63 +256,1024,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440366,0,33.9749,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA3_NTB7_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,632.08,1188.24 +256,2048,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441305,0,52.6186,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,816.25,1135.89 +256,4096,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440465,0,89.5077,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA3_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,959.69,1101.2 +256,8192,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440465,0,147.843,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA3_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1162.03,1191.54 +256,16384,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440417,0,267.8494,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA1_NTB2_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1282.8,1237.08 +256,32768,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440527,0,490.6775,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1400.5,1307.84 +256,1,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,167,16,10.3178,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0312,4.07,4067.2 +256,16,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,93,8,12.6898,flydsl_gemm2_abf16_wbf16_bf16_t16x128x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0214,52.88,3332.37 +256,32,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.6964,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0221,97.99,3112.58 +256,64,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441380,0,15.5877,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,172.21,2779.07 +256,128,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439851,0,18.0634,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT6_2_MO40_NTn1_NTA3_NTB1_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,297.21,2474.37 +256,256,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441367,0,24.629,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,435.97,1926.51 +256,512,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440459,0,34.5297,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA5_NTB3_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,621.92,1533.55 +256,1024,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440417,0,53.207,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA1_NTB2_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,807.22,1202.16 +256,2048,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439562,0,91.6693,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA0_NTB3_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,937.06,937.97 +256,4096,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441305,0,138.3141,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1242.09,940.06 +256,8192,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440452,0,268.4801,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC4_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1279.79,812.36 +256,16384,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440606,0,484.6137,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA1_NTB0_NTC7_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1418.03,813.56 +256,32768,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440587,0,909.8416,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1510.58,820.57 +256,1,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,103,8,6.3379,flydsl_gemm2_abf16_wbf16_bf16_t16x256x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0112,2.65,2650.03 +256,1,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440304,0,8.761,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB0_NTC2_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,3.83,3832.32 +256,1,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440286,0,23.1091,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB2_NTC3_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,5.81,5809.42 +256,16,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439772,0,6.5729,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,40.84,2597.35 +256,16,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439977,0,9.1174,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB2_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,58.88,3716.2 +256,16,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440553,0,24.1955,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,88.76,5568.89 +256,32,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,7.1053,auto,0.0,75.56,2444.24 +256,32,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440174,0,9.8977,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,108.48,3456.34 +256,32,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439842,0,27.1079,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,158.44,4989.92 +256,64,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441407,0,12.0362,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,178.42,2896.69 +256,64,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439986,0,30.9767,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA3_NTB3_NTC7_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,277.3,4400.56 +256,128,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440262,0,9.1681,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB3_NTC5_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,234.23,2087.29 +256,128,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441404,0,13.5214,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,317.64,2675.45 +256,128,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439646,0,37.1858,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC7_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,462.0,3722.17 +256,256,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440196,0,11.4458,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA4_NTB2_NTC3_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,375.24,1878.05 +256,256,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440054,0,17.9759,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA2_NTB2_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,477.86,2158.3 +256,256,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440197,0,47.9098,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA5_NTB0_NTC6_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,717.18,2976.56 +256,512,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439871,0,15.129,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC3_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,567.78,1732.73 +256,512,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440209,0,22.6168,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA7_NTB2_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,759.61,1947.23 +256,512,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440298,0,75.6048,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA4_NTB3_NTC6_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,908.93,1997.16 +256,1024,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440203,0,20.4515,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB3_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,840.03,1743.23 +256,1024,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440412,0,34.7945,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB3_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,987.5,1567.09 +256,1024,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440491,0,121.6111,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC5_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1130.15,1379.58 +256,2048,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440620,0,31.4274,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1093.31,1734.98 +256,2048,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440543,0,54.003,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1272.51,1398.02 +256,2048,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440503,0,189.4633,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC5_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1450.82,1062.62 +256,4096,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439452,0,61.2463,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1122.02,1506.62 +256,4096,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440335,0,104.9472,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1309.6,1119.04 +256,4096,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440333,0,363.0173,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC5_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1514.41,739.46 +256,8192,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,122.5229,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1121.74,1369.31 +256,8192,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440543,0,204.6234,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1343.34,983.89 +256,8192,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440588,0,710.6239,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1547.25,566.62 +256,16384,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439452,0,232.4281,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1182.64,1371.47 +256,16384,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440544,0,395.3056,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB0_NTC7_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1390.71,933.7 +256,16384,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440502,0,1407.8259,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1562.0,476.68 +256,32768,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440528,0,453.3644,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1212.61,1369.22 +256,32768,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440624,0,779.1373,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1411.19,904.39 +256,32768,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,3026.2195,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1453.31,399.16 +256,1,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440307,0,29.3707,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1,0.0,5.71,5713.48 +256,16,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439912,0,30.8036,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA4_NTB3_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1,0.0,87.14,5465.66 +256,32,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439912,0,31.1015,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA4_NTB3_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1,0.0,172.62,5432.27 +256,64,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439915,0,34.2047,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA5_NTB2_NTC1_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,313.92,4973.92 +256,128,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440299,0,40.4695,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA5_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,530.64,4262.24 +256,256,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440791,0,57.6827,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,744.58,3072.14 +256,512,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439562,0,92.2564,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA0_NTB3_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,931.09,2023.13 +256,1024,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441305,0,149.9052,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1146.05,1371.01 +256,2048,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440465,0,293.505,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA3_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1170.67,828.84 +256,4096,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440588,0,475.9574,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1443.82,669.74 +256,8192,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440450,0,899.5932,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1527.79,522.19 +256,16384,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,1791.5084,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1534.34,430.78 +256,32768,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440534,0,3495.0812,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1572.94,393.62 +256,1,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440303,0,20.5725,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB1_NTC3_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,5.71,5710.11 +256,16,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439704,0,22.3125,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,84.22,5285.47 +256,32,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441408,0,25.9201,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,144.99,4568.79 +256,64,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439593,0,29.8084,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB3_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,252.15,4005.8 +256,128,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439908,0,35.1308,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA2_NTB0_NTC6_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,427.9,3454.88 +256,256,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440341,0,45.2671,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB1_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,664.16,2768.12 +256,512,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440657,0,71.5815,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,840.02,1860.39 +256,1024,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440203,0,116.622,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB3_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1031.19,1276.76 +256,2048,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,180.181,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1334.87,1000.97 +256,4096,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,339.9066,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1415.2,715.7 +256,8192,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440518,0,648.7957,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC5_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1482.86,568.9 +256,16384,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441281,0,1222.0475,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1574.53,507.96 +256,32768,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440524,0,2436.0256,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1579.74,461.44 +256,1,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439977,0,12.1146,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB2_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,4.85,4849.01 +256,1,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440286,0,20.3313,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB2_NTC3_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,5.78,5777.85 +256,1,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440305,0,72.551,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC3_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,6.47,6475.94 +256,16,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439704,0,12.6982,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,73.99,4653.97 +256,16,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440553,0,21.2748,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,88.32,5543.27 +256,16,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440306,0,73.9187,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC1_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2,0.0,101.68,6371.08 +256,32,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440111,0,13.6521,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB2_NTC2_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,137.64,4356.39 +256,32,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440174,0,24.2667,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,154.87,4880.08 +256,32,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440860,0,76.3662,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,196.85,6182.33 +256,64,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441407,0,16.7778,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,223.99,3589.72 +256,64,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439593,0,27.857,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB3_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,269.81,4286.41 +256,64,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440860,0,82.9978,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,362.24,5716.79 +256,128,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439919,0,20.4535,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB3_NTC0_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,367.48,3018.31 +256,128,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440108,0,33.0489,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA2_NTB1_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,454.85,3672.52 +256,128,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440459,0,93.8224,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA5_NTB3_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,640.89,5107.51 +256,256,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440341,0,26.5094,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB1_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,567.06,2442.51 +256,256,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440341,0,43.8206,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB1_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,686.09,2859.5 +256,256,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439559,0,128.3281,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA7_NTB1_NTC0_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,937.12,3807.71 +256,512,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440209,0,35.1572,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA7_NTB2_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,855.15,2013.21 +256,512,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440459,0,66.7104,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA5_NTB3_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,901.35,1996.23 +256,512,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440453,0,208.9587,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1151.03,2428.76 +256,1024,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439453,0,56.7205,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB1_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,1060.1,1460.45 +256,1024,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440483,0,113.4358,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1060.15,1312.62 +256,1024,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440440,0,353.5274,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1360.68,1542.34 +256,2048,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439467,0,92.7091,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC3_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1297.17,1153.66 +256,2048,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,172.3823,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1395.26,1046.25 +256,2048,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440505,0,618.4741,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC4_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1555.56,1003.69 +256,4096,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,177.0024,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1358.84,876.76 +256,4096,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,324.9569,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1480.31,748.62 +256,4096,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439452,0,1216.919,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1581.16,634.19 +256,8192,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,333.1274,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1444.0,755.44 +256,8192,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441281,0,627.6619,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1532.79,588.05 +256,8192,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,2437.242,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1578.95,440.56 +256,16384,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440588,0,648.9517,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1482.5,685.1 +256,16384,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440334,0,1222.429,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC4_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1574.03,507.81 +256,16384,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439464,0,4914.4576,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1566.11,341.38 +256,32768,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439467,0,1264.7815,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC3_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1521.33,656.61 +256,32768,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440588,0,2428.4901,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1584.64,462.87 +256,32768,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439464,0,10067.9005,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1528.93,286.62 +256,1,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440628,0,38.458,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,6.11,6108.64 +256,16,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440628,0,39.1379,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,96.02,6019.79 +256,32,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440891,0,40.6542,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC3_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,184.88,5813.0 +256,64,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440552,0,45.351,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA5_NTB3_NTC5_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,331.47,5242.76 +256,128,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440456,0,52.2152,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB1_NTC2_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,575.79,4608.78 +256,256,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440799,0,75.2632,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,798.92,3274.05 +256,512,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440799,0,119.4186,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1007.04,2160.05 +256,1024,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,181.075,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1328.28,1551.94 +256,2048,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439532,0,345.903,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1390.67,945.8 +256,4096,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440483,0,653.8876,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1471.31,641.44 +256,8192,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440508,0,1221.8425,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1574.79,494.32 +256,16384,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440624,0,2425.2544,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1586.76,401.23 +256,32768,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440588,0,4832.0372,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1592.82,354.15 +256,1,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440894,0,139.7265,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,6.72,6724.96 +256,16,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440894,0,138.785,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,108.31,6784.75 +256,32,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440894,0,150.5846,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,199.65,6267.03 +256,64,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440848,0,154.0302,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,390.38,6154.07 +256,128,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440564,0,170.6136,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA6_NTB2_NTC2_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,704.86,5605.07 +256,256,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439452,0,214.4058,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1121.79,4538.49 +256,512,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440226,0,377.8892,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA4_NTB2_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1272.96,2663.83 +256,1024,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,693.132,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1388.01,1549.12 +256,2048,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439532,0,1246.7409,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1543.34,968.89 +256,4096,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439452,0,2455.0899,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1567.47,601.36 +256,8192,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439464,0,4823.8046,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1595.54,417.36 +256,16384,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440588,0,9740.0474,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1580.4,316.94 +256,32768,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440397,0,20959.1243,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB1_NTC6_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1468.87,249.75 diff --git a/aiter/configs/model_configs/qwen32B_bf16_tuned_gemm.csv b/aiter/configs/model_configs/qwen32B_bf16_tuned_gemm.csv new file mode 100644 index 0000000000..7e64b7133a --- /dev/null +++ b/aiter/configs/model_configs/qwen32B_bf16_tuned_gemm.csv @@ -0,0 +1,157 @@ +cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw +256,1,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441420,0,7.8782,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,0.13,131.3 +256,16,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439772,0,7.8731,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,2.08,151.28 +256,32,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441420,0,8.6024,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,3.81,157.87 +256,64,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441420,0,8.5931,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,7.63,196.92 +256,128,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440159,0,8.272,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB7_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,15.85,285.34 +256,256,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440159,0,8.44,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB7_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,31.06,437.99 +256,512,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440159,0,8.4448,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB7_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,62.08,754.23 +256,1024,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439552,0,11.4461,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB6_NTC2_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,91.61,1023.45 +256,2048,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440043,0,17.0309,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC6_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,123.14,1315.56 +256,4096,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440247,0,19.8299,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB6_NTC3_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,211.51,2208.09 +256,8192,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439712,0,26.1299,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC3_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,321.03,3312.24 +256,16384,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440266,0,40.5725,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB1_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,413.51,4241.12 +256,32768,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440677,0,64.2196,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,522.5,5342.95 +256,1,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439633,0,7.9325,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB2_NTC2_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,0.26,259.52 +256,16,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440303,0,8.2209,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB1_NTC3_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,3.99,269.83 +256,32,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441420,0,8.6174,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,7.61,277.17 +256,64,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441420,0,8.5341,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,15.36,319.77 +256,128,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441420,0,8.6099,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,30.45,396.05 +256,256,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440159,0,8.8701,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB7_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,59.11,537.97 +256,512,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440162,0,13.1134,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB7_NTC3_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2,0.0,79.96,571.6 +256,1024,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440174,0,17.8971,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,117.18,723.21 +256,2048,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439385,0,19.9217,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC2_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,210.54,1196.62 +256,4096,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440263,0,23.8764,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,351.33,1911.07 +256,8192,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439924,0,35.2051,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB3_NTC7_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,476.56,2534.03 +256,16384,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440678,0,54.3605,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,617.26,3244.52 +256,32768,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441036,0,89.8285,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,747.08,3904.1 +256,1,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440287,0,8.2747,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC2_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,0.99,991.44 +256,16,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440041,0,8.5973,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC2_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,15.25,974.89 +256,32,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440181,0,9.3413,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB0_NTC3_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,28.06,917.53 +256,64,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441420,0,11.5678,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,45.32,773.68 +256,128,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440641,0,16.4653,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,63.68,589.57 +256,256,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440599,0,17.4227,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA1_NTB1_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,120.37,644.16 +256,512,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439593,0,19.6213,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB3_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,213.76,726.46 +256,1024,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440109,0,23.1102,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_2_MO40_NTn1_NTA2_NTB4_NTC2_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,362.98,879.1 +256,2048,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439924,0,31.3993,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB3_NTC7_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,534.32,1033.15 +256,4096,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440199,0,46.6004,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,720.05,1216.48 +256,8192,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441307,0,76.0186,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,882.8,1383.68 +256,16384,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441354,0,130.4167,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT208x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT13_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,1029.15,1550.25 +256,32768,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441289,0,243.9808,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1100.23,1623.76 +256,1,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,asm,2,1,6.9902,_ZN5aiter37bf16gemm_fp32bf16_tn_32x64_pf3_splitkE,0.0,0.94,939.19 +256,1,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440554,0,7.5725,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB1_NTC2_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,1.73,1732.59 +256,1,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,11.6919,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0076,2.8,2804.05 +256,1,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,122,16,12.4189,flydsl_gemm2_abf16_wbf16_bf16_t32x128x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0281,4.22,4223.34 +256,1,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,17.6089,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0096,3.72,3723.06 +256,1,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440307,0,44.5404,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1,0.0,5.89,5886.91 +256,16,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,7.2193,auto,0.0,14.52,933.32 +256,16,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439704,0,7.5339,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,27.84,1766.95 +256,16,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,12.2735,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0078,42.72,2691.51 +256,16,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.905,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0203,60.33,3794.07 +256,16,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439634,0,18.082,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC1_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1,0.0,57.99,3644.76 +256,16,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439912,0,45.7963,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA4_NTB3_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1,0.0,91.59,5745.6 +256,32,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,2,7.4363,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0014,28.2,930.87 +256,32,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439705,0,8.1447,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,51.5,1659.58 +256,32,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,12.8238,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0078,81.77,2596.77 +256,32,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,16.1933,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0091,103.61,3278.16 +256,32,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,19.8551,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.01,105.62,3337.85 +256,32,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439912,0,47.4404,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA4_NTB3_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1,0.0,176.82,5567.2 +256,64,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,asm,2,1,7.9464,_ZN5aiter37bf16gemm_fp32bf16_tn_32x64_pf3_splitkE,0.0,52.78,917.51 +256,64,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,8.95,auto,0.0,93.73,1556.02 +256,64,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,13.6874,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.008,153.22,2471.83 +256,64,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,17.4742,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0093,192.02,3075.36 +256,64,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,20.0034,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.01,209.68,3349.96 +256,64,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440860,0,50.9005,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,329.61,5227.38 +256,128,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,7.3244,auto,0.0,114.53,1096.08 +256,128,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,10.9761,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,152.85,1343.43 +256,128,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439640,0,18.3007,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB3_NTC3_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,229.19,1906.92 +256,128,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440177,0,21.2988,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,315.08,2584.66 +256,128,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440330,0,24.364,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA1_NTB2_NTC3_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,344.3,2810.91 +256,128,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441367,0,60.0469,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,558.8,4496.62 +256,256,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439439,0,8.6844,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC1_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,193.19,1094.23 +256,256,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,12.642,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,265.42,1296.0 +256,256,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440513,0,20.6512,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA6_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,406.2,1793.01 +256,256,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440177,0,27.57,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,486.83,2091.83 +256,256,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440177,0,31.4887,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,532.8,2268.57 +256,256,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441305,0,82.4325,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,814.11,3370.91 +256,512,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,295,1,11.9433,flydsl_gemm2_abf16_wbf16_bf16_t64x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,280.95,1042.58 +256,512,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,16.5314,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,405.95,1189.3 +256,512,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439930,0,26.3863,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA3_NTB1_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,635.83,1564.74 +256,512,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440128,0,40.582,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA2_NTB2_NTC1_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,661.46,1550.31 +256,512,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439720,0,48.0074,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB1_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,698.94,1610.84 +256,512,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440587,0,132.0767,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1016.21,2222.96 +256,1024,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,15.2257,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,440.76,1205.2 +256,1024,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440010,0,21.0484,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_2_MO40_NTn1_NTA2_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,637.66,1245.43 +256,1024,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440211,0,39.4204,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB3_NTC7_NTD6_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,851.19,1263.49 +256,1024,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441315,0,64.7422,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,829.24,1133.73 +256,1024,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439562,0,75.3599,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA0_NTB3_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,890.51,1182.71 +256,1024,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440791,0,241.1581,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1113.11,1347.91 +256,2048,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440012,0,19.2509,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,697.2,1565.98 +256,2048,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439563,0,28.7925,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA6_NTB1_NTC2_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,932.31,1365.69 +256,2048,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440570,0,63.4339,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA3_NTB2_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1057.93,1053.8 +256,2048,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441305,0,104.1995,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1030.47,905.68 +256,2048,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440465,0,125.3499,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA3_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1070.74,899.26 +256,2048,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439562,0,430.228,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA0_NTB3_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,1247.88,901.78 +256,4096,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440075,0,33.5611,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA1_NTB1_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,799.84,1601.24 +256,4096,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439667,0,55.7664,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB3_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,962.71,1175.19 +256,4096,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439667,0,126.9173,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB3_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,1057.52,795.21 +256,4096,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440570,0,191.3972,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA3_NTB2_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1122.0,712.21 +256,4096,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439667,0,235.1316,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB3_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,1141.64,680.08 +256,4096,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439562,0,842.7454,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA0_NTB3_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,1274.1,609.68 +256,8192,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439738,0,58.607,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB3_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,916.05,1722.07 +256,8192,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440012,0,100.1325,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1072.32,1178.09 +256,8192,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,214.5098,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1251.39,788.23 +256,8192,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,325.1496,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1320.92,677.23 +256,8192,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,390.312,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1375.49,651.48 +256,8192,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440614,0,1432.3923,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1499.23,534.39 +256,16384,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440622,0,112.4117,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,955.19,1737.34 +256,16384,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440282,0,177.9873,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1206.54,1251.9 +256,16384,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,383.8016,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1398.82,795.72 +256,16384,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440528,0,578.0575,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1486.0,671.17 +256,16384,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441281,0,707.7525,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1517.11,625.96 +256,16384,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440333,0,2737.4956,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC5_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1568.94,463.48 +256,32768,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441460,0,209.1223,Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950,0.0,1026.9,1836.44 +256,32768,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440588,0,340.8883,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1259.93,1268.85 +256,32768,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440587,0,734.3333,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1462.2,787.15 +256,32768,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441281,0,1129.886,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1520.5,640.35 +256,32768,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440282,0,1391.9543,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1542.78,589.47 +256,32768,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440333,0,5550.579,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC5_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1547.57,409.94 +256,1,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,119,8,13.88,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0175,4.72,4723.27 +256,16,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,15.9358,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0123,65.8,4135.63 +256,32,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,18.6325,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.012,112.55,3556.86 +256,64,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439711,0,21.4847,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA1_NTB0_NTC3_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,195.22,3118.99 +256,128,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440339,0,24.1627,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB3_NTC1_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,347.17,2834.33 +256,256,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441293,0,33.4677,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,501.3,2134.42 +256,512,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439861,0,47.4796,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x112x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB3_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,706.71,1628.75 +256,1024,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441300,0,71.3668,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,940.34,1248.89 +256,2048,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439452,0,116.1196,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1155.86,970.74 +256,4096,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440507,0,221.2645,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC4_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1213.19,722.7 +256,8192,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440656,0,410.1949,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1308.82,619.9 +256,16384,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440271,0,759.8253,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1413.14,583.06 +256,32768,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440587,0,1433.3354,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1498.24,572.45 +256,1,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439843,0,24.7644,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB2_NTC6_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,5.29,5294.21 +256,16,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440291,0,23.8402,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA6_NTB2_NTC3_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,87.97,5521.99 +256,32,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440651,0,26.2863,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,159.56,5029.95 +256,64,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440263,0,30.706,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,273.19,4343.31 +256,128,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440341,0,38.9554,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB1_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,430.68,3482.43 +256,256,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440199,0,52.1604,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,643.29,2688.76 +256,512,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441293,0,80.6816,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,831.77,1852.0 +256,1024,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,117.1051,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1146.13,1432.66 +256,2048,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,220.2568,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1218.74,928.34 +256,4096,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,418.3034,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1283.45,664.28 +256,8192,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,756.7141,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1418.95,561.21 +256,16384,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440587,0,1427.7239,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1504.13,503.09 +256,32768,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441281,0,2787.5053,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1540.79,468.33 +256,1,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439704,0,80.714,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,6.5,6497.02 +256,16,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440849,0,81.1557,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,103.36,6482.48 +256,32,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440849,0,81.4925,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,205.87,6477.8 +256,64,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440848,0,83.8,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,400.41,6342.45 +256,128,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440779,0,92.0317,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,729.19,5853.48 +256,256,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439471,0,127.7778,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA4_NTB1_NTC5_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1050.4,4328.79 +256,512,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,229.3064,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1170.64,2537.91 +256,1024,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,443.2279,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1211.28,1443.12 +256,2048,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439532,0,785.4766,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1366.99,961.17 +256,4096,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,1438.3188,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1493.05,685.29 +256,8192,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439609,0,2778.2855,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC2_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1545.91,520.84 +256,16384,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439464,0,5567.7519,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1542.8,425.63 +256,32768,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440454,0,11384.5906,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB1_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1509.05,370.26 From 1eb918047180217c36e95f4b845b0b2129c99d2b Mon Sep 17 00:00:00 2001 From: sunway513 Date: Tue, 14 Apr 2026 18:11:51 +0000 Subject: [PATCH 2/2] feat: retune BF16 GEMM without hipBLASLt, add GLM-5 and 3 new models Re-tuned all BF16 GEMM configs on MI355X (gfx950) with --libtype asm,triton,flydsl (no hipBLASLt). Added GLM-5 (88 shapes from CI log) and new configs for Llama 70B, Llama 405B, Qwen 32B. Backend wins across 796 total shapes (7 models): - ASM: 437 (54.9%) - FlyDSL: 224 (28.1%) - Triton: 135 (17.0%) Per-model breakdown: - GPT-OSS (57): asm=54, triton=3 (bias=True, no FlyDSL support) - DSV3 (58): flydsl=22, triton=18, asm=18 - Kimi-K2 (125): asm=77, flydsl=46, triton=2 - GLM-5 (88): asm=42, flydsl=30, triton=16 - Llama 70B (156): asm=84, flydsl=49, triton=23 - Llama 405B (156): asm=89, flydsl=43, triton=24 - Qwen 32B (156): asm=73, triton=49, flydsl=34 Tuning time without hipBLASLt: 4h total (long pole: 405B @ 4h) vs with hipBLASLt: 10h+ total (long pole: 405B @ 8h+) --- .../model_configs/dsv3_bf16_tuned_gemm.csv | 116 +++---- .../model_configs/glm5_bf16_tuned_gemm.csv | 89 +++++ .../model_configs/glm5_untuned_gemm_bf16.csv | 89 +++++ .../model_configs/gptoss_bf16_tuned_gemm.csv | 114 +++---- .../model_configs/kimik2_bf16_tuned_gemm.csv | 250 +++++++------- .../llama405B_bf16_tuned_gemm.csv | 312 +++++++++--------- .../llama70B_bf16_tuned_gemm.csv | 312 +++++++++--------- .../model_configs/qwen32B_bf16_tuned_gemm.csv | 312 +++++++++--------- 8 files changed, 886 insertions(+), 708 deletions(-) create mode 100644 aiter/configs/model_configs/glm5_bf16_tuned_gemm.csv create mode 100644 aiter/configs/model_configs/glm5_untuned_gemm_bf16.csv diff --git a/aiter/configs/model_configs/dsv3_bf16_tuned_gemm.csv b/aiter/configs/model_configs/dsv3_bf16_tuned_gemm.csv index 014f64c631..36e2d91b5e 100644 --- a/aiter/configs/model_configs/dsv3_bf16_tuned_gemm.csv +++ b/aiter/configs/model_configs/dsv3_bf16_tuned_gemm.csv @@ -1,59 +1,59 @@ cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw -256,1,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.4051,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0117,0.5,497.61 -256,2,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.4651,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0254,0.98,495.6 -256,4,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.4101,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0361,1.98,503.29 -256,8,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.5587,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0342,3.88,501.25 -256,16,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,7.6922,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0281,7.63,507.99 -256,32,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,14,7.7252,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.032,15.2,536.58 -256,48,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.9759,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0295,22.09,549.5 -256,64,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,16,8.4443,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0322,27.82,547.15 -256,80,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,16,8.669,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0322,33.87,560.37 -256,96,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,16,9.1151,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0325,38.65,559.01 -256,112,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,9.5249,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0315,43.15,559.9 -256,128,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.7811,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0221,48.03,569.52 -256,256,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441168,0,11.9774,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,78.44,623.77 -256,1,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,9.1742,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0303,3.3,3302.33 -256,2,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,292,16,9.1774,flydsl_gemm2_abf16_wbf16_bf16_t64x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0284,6.6,3303.2 -256,80,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,6,15.6657,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0183,154.62,2027.51 -256,112,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440339,0,17.7572,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB3_NTC1_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,190.97,1822.15 -256,128,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439770,0,18.1517,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA2_NTB1_NTC7_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,213.51,1798.91 -256,256,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440339,0,22.0156,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB3_NTC1_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,352.07,1591.1 -256,1,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439841,0,6.0249,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA6_NTB1_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,1.57,1567.89 -256,2,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440041,0,5.6299,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC2_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,3.35,1679.54 -256,4,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440906,0,5.7354,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,6.58,1651.85 -256,8,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440305,0,5.8171,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC3_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,12.98,1634.99 -256,16,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440041,0,6.3749,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC2_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,23.69,1503.5 -256,32,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440288,0,6.8235,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2,0.0,44.26,1426.26 -256,48,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439842,0,7.3297,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,61.8,1347.88 -256,64,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439842,0,7.2513,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,83.29,1382.79 -256,80,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440291,0,8.8113,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA6_NTB2_NTC3_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,85.68,1154.71 -256,96,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439773,0,8.9138,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA7_NTB1_NTC3_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,101.64,1157.97 -256,112,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440247,0,9.6383,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB6_NTC3_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,109.66,1086.23 -256,128,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440045,0,9.5386,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB1_NTC3_NTD7_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,126.64,1113.04 -256,256,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439779,0,10.6106,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,227.69,1111.76 -256,1,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439772,0,8.6754,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,3.38,3386.42 -256,2,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,194,8,8.4386,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0144,6.96,3483.63 -256,4,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440181,0,8.4056,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB0_NTC3_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,13.97,3501.7 -256,8,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439977,0,8.9689,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB2_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,26.19,3289.99 -256,16,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439704,0,9.284,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,50.6,3194.21 -256,32,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440111,0,9.6079,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB2_NTC2_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,97.79,3117.22 -256,48,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439773,0,10.8192,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA7_NTB1_NTC3_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,130.26,2795.48 -256,64,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439637,0,11.7601,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,159.78,2596.9 -256,80,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441443,0,12.4433,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,188.76,2478.02 -256,96,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439777,0,12.722,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA3_NTB1_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,221.55,2446.91 -256,112,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440890,0,13.776,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB0_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,238.7,2281.11 -256,128,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440890,0,13.8942,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB0_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,270.48,2282.93 -256,256,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440054,0,16.5758,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA2_NTB2_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,453.44,2055.93 -256,1,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440628,0,36.8269,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,6.29,6292.04 -256,2,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439704,0,36.7342,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,12.61,6309.19 -256,4,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440628,0,37.0902,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,24.98,6251.15 -256,8,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440628,0,36.9536,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,50.15,6279.31 -256,16,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440907,0,37.6556,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,98.44,6172.16 -256,32,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440891,0,38.4907,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC3_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,192.6,6057.64 -256,48,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440929,0,40.0143,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA4_NTB0_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,277.9,5845.64 -256,64,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440890,0,42.0445,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB0_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,352.65,5581.13 -256,80,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439982,0,44.2897,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x80x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_5_MO40_NTn1_NTA4_NTB1_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,418.46,5315.06 -256,96,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440514,0,44.5354,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA4_NTB0_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,499.38,5302.5 -256,112,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440209,0,46.6162,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA7_NTB2_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,556.61,5081.82 -256,128,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440209,0,46.3567,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA7_NTB2_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,639.69,5126.37 -256,256,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440459,0,73.0382,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA5_NTB3_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,812.01,3335.43 +256,1,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.4782,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0195,0.49,492.75 +256,2,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.4609,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0332,0.98,495.88 +256,4,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.5195,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0293,1.95,495.96 +256,8,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.5756,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0273,3.88,500.13 +256,16,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.6807,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0308,7.65,508.75 +256,32,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.8114,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0326,15.03,530.65 +256,48,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,14,8.172,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0277,21.56,536.31 +256,64,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,16,8.4622,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0312,27.76,545.99 +256,80,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,16,8.7848,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0322,33.42,552.98 +256,96,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,16,8.9146,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0306,39.52,571.58 +256,112,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,9.5363,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0296,43.1,559.23 +256,128,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.2913,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0224,45.65,541.29 +256,256,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,287,8,12.8749,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.023,72.97,580.28 +256,1,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,9.2196,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0265,3.28,3286.06 +256,2,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,9.2149,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0282,6.57,3289.75 +256,80,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,6,15.4972,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0178,156.3,2049.56 +256,112,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,19.3948,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0106,174.85,1668.3 +256,128,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,19.5107,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0106,198.64,1673.61 +256,256,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,28.0092,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.006,276.73,1250.62 +256,1,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,194,8,6.3014,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0169,1.5,1499.1 +256,2,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,194,8,6.4095,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0163,2.94,1475.25 +256,4,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,194,8,6.5084,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0136,5.8,1455.66 +256,8,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,6.6916,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0064,11.28,1421.32 +256,16,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,286,4,6.7833,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0077,22.26,1412.98 +256,32,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,7.7348,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0077,39.04,1258.22 +256,48,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,3,8.9019,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0058,50.89,1109.83 +256,64,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,2,8.9485,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0027,67.5,1120.52 +256,80,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,9.4262,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0077,80.09,1079.38 +256,96,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,2,9.8468,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0027,92.01,1048.25 +256,112,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,10.092,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0028,104.73,1037.39 +256,128,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,10.0569,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0028,120.11,1055.68 +256,256,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,12.3425,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,195.74,955.76 +256,1,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,119,8,8.3747,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0151,3.51,3508.01 +256,2,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,119,8,8.5564,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0158,6.86,3435.67 +256,4,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,244,8,9.2613,flydsl_gemm2_abf16_wbf16_bf16_t64x128x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0139,12.68,3178.16 +256,8,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,114,4,9.2997,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k4_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0084,25.26,3172.96 +256,16,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,286,4,9.4278,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0088,49.83,3145.49 +256,32,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,11.3011,auto,0.0,83.14,2650.18 +256,48,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,12.3303,auto,0.0,114.29,2452.89 +256,64,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,12.0008,auto,0.0,156.58,2544.81 +256,80,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,2,13.9539,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0034,168.33,2209.75 +256,96,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,14.1437,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0034,199.28,2200.95 +256,112,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,15.4791,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,212.44,2030.13 +256,128,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,15.4483,auto,0.0,243.27,2053.26 +256,256,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,21.9002,auto,0.0,343.2,1556.09 +256,1,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,45.8589,auto,0.0,5.05,5052.81 +256,2,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,45.5282,auto,0.0,10.18,5090.54 +256,4,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,45.1821,auto,0.0,20.51,5131.6 +256,8,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,45.1407,auto,0.0,41.06,5140.44 +256,16,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,39.1932,auto,0.0,94.58,5930.01 +256,32,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,41.086,auto,0.0,180.44,5674.99 +256,48,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,65.4891,auto,0.0,169.8,3571.73 +256,64,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,66.227,auto,0.0,223.88,3543.2 +256,80,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,75.8314,auto,0.0,244.41,3104.28 +256,96,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,75.5926,auto,0.0,294.21,3123.97 +256,112,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,77.7086,auto,0.0,333.9,3048.51 +256,128,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,77.9492,auto,0.0,380.42,3048.67 +256,256,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,135.7516,auto,0.0,436.88,1794.55 diff --git a/aiter/configs/model_configs/glm5_bf16_tuned_gemm.csv b/aiter/configs/model_configs/glm5_bf16_tuned_gemm.csv new file mode 100644 index 0000000000..e865d7c8ac --- /dev/null +++ b/aiter/configs/model_configs/glm5_bf16_tuned_gemm.csv @@ -0,0 +1,89 @@ +cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw +256,1,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.045,auto,0.0,0.02,17.6 +256,2,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.0376,auto,0.0,0.03,18.14 +256,4,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.0018,auto,0.0,0.07,19.24 +256,8,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.0751,auto,0.0,0.14,21.32 +256,16,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,16.6651,auto,0.0,0.38,35.45 +256,32,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,18.2784,auto,0.0,0.69,43.14 +256,48,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,22.8772,auto,0.0,0.83,43.1 +256,64,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.1289,auto,0.0,1.09,51.18 +256,128,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,34.4386,auto,0.0,1.46,57.33 +256,256,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,55.8106,auto,0.0,1.8,63.7 +256,16384,32,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,116.3523,auto,0.0,55.37,1742.71 +256,1,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,5.388,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0469,0.29,294.25 +256,2,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,5.4358,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0273,0.58,293.97 +256,4,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,5.433,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0293,1.16,298.74 +256,8,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,5.4742,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0322,2.3,305.65 +256,16,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,6.0986,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0288,4.13,290.82 +256,32,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,6.0842,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0237,8.27,324.49 +256,48,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,16,5.9414,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.028,12.71,366.07 +256,64,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,16,6.8395,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0316,14.72,347.35 +256,128,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,16,7.8346,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0308,25.7,405.7 +256,256,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,12,9.9652,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0276,40.41,480.08 +256,16384,128,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,280,1,65.0045,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,396.43,3185.84 +256,1,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,12,7.3627,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0391,0.43,428.99 +256,2,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,13,7.5667,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0273,0.83,419.12 +256,4,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,13,7.2016,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0254,1.75,443.92 +256,8,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,7.3839,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0303,3.41,439.89 +256,16,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,7.7579,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0291,6.49,431.89 +256,32,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,13,7.6427,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0272,13.17,465.19 +256,48,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,13,7.7081,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0267,19.59,487.82 +256,64,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,15,8.0088,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0294,25.14,495.07 +256,128,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,12,10.0663,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0257,40.0,475.26 +256,256,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,8,12.1464,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0203,66.3,528.76 +256,16384,256,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,86.0749,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,598.78,2472.97 +256,1,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,362,16,12.1556,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0278,2.65,2654.02 +256,2,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,11.1667,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.028,5.77,2890.63 +256,4,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,292,16,11.8259,flydsl_gemm2_abf16_wbf16_bf16_t64x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0291,10.91,2732.46 +256,8,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,277,16,12.4589,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0304,20.7,2599.27 +256,16,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,11.6223,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0203,44.39,2798.44 +256,32,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.5445,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0204,76.18,2422.01 +256,48,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.2686,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0176,108.47,2318.76 +256,64,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.6224,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0175,141.13,2281.84 +256,128,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,18.7174,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0101,220.5,1842.58 +256,256,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,25.7364,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0057,320.73,1427.28 +256,16384,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,75,1,758.9775,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,696.04,421.03 +256,1,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.3424,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.009,1.8,1797.13 +256,2,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.0829,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0078,3.69,1849.83 +256,4,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.3991,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0079,7.14,1790.21 +256,8,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.2796,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0091,14.46,1818.56 +256,16,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.7359,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0088,27.57,1743.43 +256,32,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.7262,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0088,55.2,1765.38 +256,48,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,4,10.1698,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0089,79.19,1707.71 +256,64,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,4,10.4146,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0089,103.1,1686.44 +256,128,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,11.8652,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0034,180.99,1546.55 +256,256,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,16.5474,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,259.56,1203.99 +256,16384,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,213.9689,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1284.66,1019.32 +256,1,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,82,8,12.0812,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0163,3.12,3126.11 +256,1,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,194,8,12.0447,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0176,4.18,4180.44 +256,1,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,16.2005,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0205,4.66,4661.71 +256,2,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,290,16,11.5991,flydsl_gemm2_abf16_wbf16_bf16_t64x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0295,6.51,3257.63 +256,2,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,85,16,12.6809,flydsl_gemm2_abf16_wbf16_bf16_t16x128x128_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0273,7.94,3972.32 +256,2,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,16.3454,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0201,9.24,4621.89 +256,4,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,387,8,12.6496,flydsl_gemm2_abf16_wbf16_bf16_t96x64x64_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0177,11.94,2990.01 +256,4,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,12.3794,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0199,16.26,4072.38 +256,4,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,16.5427,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0203,18.26,4569.74 +256,8,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,114,4,12.8874,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k4_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0106,23.43,2940.56 +256,8,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,13.2214,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.012,30.45,3819.22 +256,8,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,16.9831,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0213,35.56,4457.02 +256,16,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,12.6886,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0179,47.6,2998.25 +256,16,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,91,4,13.4396,flydsl_gemm2_abf16_wbf16_bf16_t16x128x128_split_k4_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0119,59.92,3769.41 +256,16,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,17.8209,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0131,67.78,4258.52 +256,32,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,13.4164,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0106,90.04,2857.59 +256,32,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,15.3685,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0118,104.8,3317.63 +256,32,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,19.7695,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0132,122.2,3858.67 +256,48,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,2,15.8977,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0042,113.97,2430.13 +256,48,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,286,4,19.2217,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0116,125.69,2669.62 +256,48,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,286,4,23.8084,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0133,152.21,3220.59 +256,64,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,16.398,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.004,147.33,2373.97 +256,64,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,19.8091,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0049,162.61,2607.0 +256,64,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,25.8376,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0056,187.01,2982.88 +256,128,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,21.581,auto,0.0,223.89,1858.49 +256,128,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,27.5675,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,233.7,1920.85 +256,128,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,38.984,auto,0.0,247.89,2017.32 +256,256,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,29.8896,auto,0.0,323.31,1420.81 +256,256,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,37.6774,auto,0.0,341.98,1475.01 +256,256,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,55.4977,auto,0.0,348.26,1473.74 +256,16384,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,441.2049,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1401.79,770.02 +256,16384,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,824.3818,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1500.46,580.01 +256,1,38720,6144,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,170,1,89.7892,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,5.3,5299.98 diff --git a/aiter/configs/model_configs/glm5_untuned_gemm_bf16.csv b/aiter/configs/model_configs/glm5_untuned_gemm_bf16.csv new file mode 100644 index 0000000000..4756918a0f --- /dev/null +++ b/aiter/configs/model_configs/glm5_untuned_gemm_bf16.csv @@ -0,0 +1,89 @@ +M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle +128,128,6144,False,torch.bfloat16,torch.bfloat16,False,False +128,256,6144,False,torch.bfloat16,torch.bfloat16,False,False +128,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False +128,32,6144,False,torch.bfloat16,torch.bfloat16,False,False +128,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False +128,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False +128,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False +128,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False +16384,128,6144,False,torch.bfloat16,torch.bfloat16,False,False +16384,256,6144,False,torch.bfloat16,torch.bfloat16,False,False +16384,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False +16384,32,6144,False,torch.bfloat16,torch.bfloat16,False,False +16384,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False +16384,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False +16384,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False +16,128,6144,False,torch.bfloat16,torch.bfloat16,False,False +16,256,6144,False,torch.bfloat16,torch.bfloat16,False,False +16,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False +16,32,6144,False,torch.bfloat16,torch.bfloat16,False,False +16,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False +16,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False +16,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False +16,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False +1,128,6144,False,torch.bfloat16,torch.bfloat16,False,False +1,256,6144,False,torch.bfloat16,torch.bfloat16,False,False +1,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False +1,32,6144,False,torch.bfloat16,torch.bfloat16,False,False +1,38720,6144,False,torch.bfloat16,torch.bfloat16,False,False +1,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False +1,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False +1,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False +1,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False +256,128,6144,False,torch.bfloat16,torch.bfloat16,False,False +256,256,6144,False,torch.bfloat16,torch.bfloat16,False,False +256,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False +256,32,6144,False,torch.bfloat16,torch.bfloat16,False,False +256,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False +256,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False +256,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False +256,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False +2,128,6144,False,torch.bfloat16,torch.bfloat16,False,False +2,256,6144,False,torch.bfloat16,torch.bfloat16,False,False +2,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False +2,32,6144,False,torch.bfloat16,torch.bfloat16,False,False +2,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False +2,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False +2,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False +2,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False +32,128,6144,False,torch.bfloat16,torch.bfloat16,False,False +32,256,6144,False,torch.bfloat16,torch.bfloat16,False,False +32,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False +32,32,6144,False,torch.bfloat16,torch.bfloat16,False,False +32,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False +32,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False +32,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False +32,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False +48,128,6144,False,torch.bfloat16,torch.bfloat16,False,False +48,256,6144,False,torch.bfloat16,torch.bfloat16,False,False +48,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False +48,32,6144,False,torch.bfloat16,torch.bfloat16,False,False +48,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False +48,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False +48,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False +48,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False +4,128,6144,False,torch.bfloat16,torch.bfloat16,False,False +4,256,6144,False,torch.bfloat16,torch.bfloat16,False,False +4,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False +4,32,6144,False,torch.bfloat16,torch.bfloat16,False,False +4,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False +4,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False +4,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False +4,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False +64,128,6144,False,torch.bfloat16,torch.bfloat16,False,False +64,256,6144,False,torch.bfloat16,torch.bfloat16,False,False +64,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False +64,32,6144,False,torch.bfloat16,torch.bfloat16,False,False +64,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False +64,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False +64,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False +64,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False +8,128,6144,False,torch.bfloat16,torch.bfloat16,False,False +8,256,6144,False,torch.bfloat16,torch.bfloat16,False,False +8,2624,6144,False,torch.bfloat16,torch.bfloat16,False,False +8,32,6144,False,torch.bfloat16,torch.bfloat16,False,False +8,4096,2048,False,torch.bfloat16,torch.bfloat16,False,False +8,6144,3072,False,torch.bfloat16,torch.bfloat16,False,False +8,6144,4096,False,torch.bfloat16,torch.bfloat16,False,False +8,6144,6144,False,torch.bfloat16,torch.bfloat16,False,False diff --git a/aiter/configs/model_configs/gptoss_bf16_tuned_gemm.csv b/aiter/configs/model_configs/gptoss_bf16_tuned_gemm.csv index 2498e04776..d6f8b7baad 100644 --- a/aiter/configs/model_configs/gptoss_bf16_tuned_gemm.csv +++ b/aiter/configs/model_configs/gptoss_bf16_tuned_gemm.csv @@ -1,58 +1,58 @@ cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw -256,1,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,13,4.5547,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0234,0.16,163.19 -256,2,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,14,4.5902,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0195,0.32,163.24 -256,4,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,14,4.9463,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0117,0.6,153.92 -256,8,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,9,4.6722,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0137,1.26,168.1 -256,16,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,13,5.0352,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0176,2.34,165.54 -256,32,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,14,5.0532,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0256,4.67,184.0 -256,48,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,11,4.9391,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0192,7.17,207.74 -256,64,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,13,5.3586,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0211,8.81,209.44 -256,80,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,7,14,5.6366,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0209,10.46,216.19 -256,96,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,9,5.6999,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0149,12.42,230.67 -256,112,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,9,5.9801,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0172,13.81,235.96 -256,128,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,9,5.5627,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0175,16.97,270.97 -256,256,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,7.3707,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0129,25.61,308.98 -256,1,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.6921,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0117,1.52,1522.53 -256,2,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.7094,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0109,3.04,1520.93 -256,4,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.7085,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0114,6.08,1523.32 -256,8,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440911,0,9.8686,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,11.95,1503.01 -256,16,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,10.0588,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0098,23.46,1483.25 -256,32,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,10.1087,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0099,46.68,1493.15 -256,48,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,10.5885,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0073,66.85,1441.93 -256,64,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,10.6945,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0074,88.24,1443.91 -256,128,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,11.7291,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0074,160.92,1375.91 -256,256,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,15.4587,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0041,244.19,1134.05 -256,1,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440305,0,6.7208,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC3_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,1.76,1756.69 -256,1,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440041,0,8.6117,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC2_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,2.74,2741.26 -256,2,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439633,0,6.2546,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB2_NTC2_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,3.77,1889.2 -256,2,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439633,0,8.4128,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB2_NTC2_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,5.61,2807.73 -256,4,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439841,0,6.848,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA6_NTB1_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,6.89,1728.37 -256,4,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439841,0,8.4291,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA6_NTB1_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,11.2,2805.61 -256,8,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440305,0,6.4007,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC3_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,14.74,1855.32 -256,8,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441420,0,9.0233,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,20.92,2627.04 -256,16,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439841,0,6.9819,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA6_NTB1_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,27.03,1712.17 -256,16,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440041,0,9.3274,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC2_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,40.47,2553.36 -256,32,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439978,0,7.7251,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB2_NTC1_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,48.87,1567.86 -256,32,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440288,0,10.8176,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2,0.0,69.79,2222.25 -256,48,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440042,0,8.6938,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x48x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA6_NTB1_NTC0_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,65.13,1411.3 -256,48,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439773,0,11.9644,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA7_NTB1_NTC3_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,94.65,2027.9 -256,64,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440174,0,9.6016,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,78.63,1294.29 -256,64,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,12.3488,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0136,122.27,1982.86 -256,80,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,13.0497,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0137,144.63,1893.46 -256,96,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,13.4284,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0137,168.67,1856.69 -256,112,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439981,0,15.3173,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA4_NTB0_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,172.51,1642.3 -256,128,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440893,0,11.3399,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,133.15,1151.51 -256,128,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439981,0,15.1696,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA4_NTB0_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,199.08,1673.0 -256,256,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439779,0,12.4644,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,242.28,1148.84 -256,1,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440307,0,10.3932,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1,0.0,2.84,2839.09 -256,2,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440307,0,10.9392,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1,0.0,5.39,2698.84 -256,4,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440307,0,10.4556,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1,0.0,11.28,2826.73 -256,8,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440307,0,11.0667,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1,0.0,21.32,2676.43 -256,16,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440307,0,10.9761,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1,0.0,42.99,2710.18 -256,32,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,12.2181,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0077,77.24,2455.64 -256,48,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,3,12.9274,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0076,109.5,2340.7 -256,64,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,13.3072,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0077,141.84,2293.13 -256,80,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,7,3,13.9214,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0076,169.47,2210.35 -256,96,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,8,3,14.4736,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0075,195.61,2143.71 -256,112,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439917,0,17.0876,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA1_NTB1_NTC7_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,193.3,1830.75 -256,128,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441443,0,16.5912,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,227.52,1900.96 +256,1,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,14,4.9558,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0234,0.15,149.99 +256,2,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,13,4.9466,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0195,0.3,151.48 +256,4,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,14,4.9687,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0156,0.59,153.23 +256,8,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,14,4.9927,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0176,1.18,157.31 +256,16,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,13,5.031,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0171,2.34,165.68 +256,32,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,13,4.6354,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0203,5.09,200.59 +256,48,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,14,5.2547,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0212,6.73,195.26 +256,64,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,13,5.3561,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.021,8.81,209.54 +256,80,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,7,13,5.6419,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0218,10.45,215.98 +256,96,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,9,5.7166,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0163,12.38,230.0 +256,112,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,9,5.9183,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0165,13.95,238.43 +256,128,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,9,5.97,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0172,15.81,252.48 +256,256,128,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,7.0187,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0126,26.89,324.47 +256,1,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.6772,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0105,1.52,1524.87 +256,2,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.8371,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0107,3.0,1501.19 +256,4,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.8551,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0118,5.98,1500.66 +256,8,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.9035,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0101,11.91,1497.72 +256,16,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,10.0897,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.01,23.38,1478.7 +256,32,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,10.2307,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.01,46.12,1475.34 +256,48,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,10.7334,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0073,65.94,1422.46 +256,64,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,10.6753,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0074,88.4,1446.51 +256,128,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,11.6504,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0074,162.01,1385.21 +256,256,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,15.3742,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0041,245.53,1140.28 +256,1,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.0917,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0083,1.3,1298.58 +256,1,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,10.0871,auto,0.0,2.34,2340.31 +256,2,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,9.2607,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0056,2.55,1275.95 +256,2,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,9.6663,auto,0.0,4.88,2443.63 +256,4,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.1797,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0091,5.14,1289.36 +256,4,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,10.1637,auto,0.0,9.29,2326.79 +256,8,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,9.2315,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0092,10.22,1286.39 +256,8,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,5,11.4653,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0132,16.46,2067.51 +256,16,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,9.3253,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0062,20.24,1281.91 +256,16,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,5,11.591,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0136,32.57,2054.71 +256,32,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,9.2671,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0065,40.73,1306.98 +256,32,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,5,11.7821,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0137,64.08,2040.33 +256,48,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,4,9.8145,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.009,57.69,1250.15 +256,48,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,12.1519,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0136,93.19,1996.61 +256,64,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,4,10.1075,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.009,74.69,1229.51 +256,64,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,12.6689,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0137,119.19,1932.76 +256,80,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,13.1447,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0137,143.59,1879.78 +256,96,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,13.5787,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0136,166.8,1836.14 +256,112,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,17.4374,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0048,151.54,1442.62 +256,128,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,11.3635,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0035,132.88,1149.12 +256,128,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,17.3233,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0048,174.33,1465.01 +256,256,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,15.5453,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,194.26,921.15 +256,1,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,11.5513,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0078,2.55,2554.45 +256,2,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,11.4061,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0075,5.17,2588.37 +256,4,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,12.0536,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0075,9.79,2451.98 +256,8,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,11.96,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0077,19.73,2476.52 +256,16,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,12.2044,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0077,38.66,2437.42 +256,32,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,12.3457,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0078,76.44,2430.26 +256,48,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,3,3,13.0203,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0076,108.72,2324.0 +256,64,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,13.4435,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0077,140.4,2269.89 +256,80,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,7,3,14.2374,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0076,165.71,2161.29 +256,96,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,8,3,14.4747,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0076,195.59,2143.55 +256,112,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,19.1801,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,172.21,1631.02 +256,128,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,19.3026,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,195.56,1633.94 diff --git a/aiter/configs/model_configs/kimik2_bf16_tuned_gemm.csv b/aiter/configs/model_configs/kimik2_bf16_tuned_gemm.csv index 81ec422d62..dbf7cda83d 100644 --- a/aiter/configs/model_configs/kimik2_bf16_tuned_gemm.csv +++ b/aiter/configs/model_configs/kimik2_bf16_tuned_gemm.csv @@ -1,126 +1,126 @@ cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw -256,8,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.8407,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.027,5.62,717.52 -256,16,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.0143,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0309,10.99,717.05 -256,24,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.1591,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0341,16.19,719.14 -256,32,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.1403,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0317,21.64,735.64 -256,40,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.0707,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0302,27.28,756.96 -256,48,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.569,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0293,30.84,727.04 -256,56,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,9.5081,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0316,32.42,667.94 -256,64,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,9.516,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0292,37.02,680.08 -256,72,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,9.0712,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0314,43.69,726.75 -256,80,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.4329,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0217,46.69,711.69 -256,88,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,14,10.5793,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0287,45.79,646.0 -256,96,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.051,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0228,52.58,691.97 -256,104,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.3236,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0215,55.46,685.4 -256,112,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.7718,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0218,63.1,736.47 -256,120,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.9195,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0228,66.6,737.69 -256,128,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.7789,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0216,65.37,690.08 -256,136,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,11.1736,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0221,67.0,676.52 -256,144,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441425,0,11.3039,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,70.13,679.41 -256,168,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,177,8,13.0017,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,71.13,618.57 -256,176,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,10,12.9091,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0241,75.05,632.37 -256,184,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,10,13.0368,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0241,77.7,635.44 -256,192,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,9,13.1531,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0226,80.36,639.01 -256,200,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,12.8957,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0226,85.38,661.14 -256,208,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,287,8,13.2158,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0218,86.64,654.27 -256,216,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,7,13.4535,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0201,88.38,651.69 -256,224,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,7,13.554,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0203,90.98,655.77 -256,232,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.3604,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0219,95.59,674.32 -256,240,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.4529,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0216,98.21,678.66 -256,248,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.7249,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,99.47,674.02 -256,256,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,7,13.6667,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0205,103.12,685.73 -256,264,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.0371,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.019,103.53,676.24 -256,272,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,13.7697,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0183,108.74,698.15 -256,280,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.2137,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0186,108.45,684.84 -256,288,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440174,0,13.943,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,113.71,706.8 -256,296,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.254,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0186,114.32,699.86 -256,304,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.1592,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0187,118.19,713.08 -256,312,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,13.8787,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0186,123.76,736.2 -256,320,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.0558,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0184,125.33,735.52 -256,328,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.2261,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0164,126.93,735.21 -256,336,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.6004,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0187,126.69,724.64 -256,344,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439768,0,14.7288,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB0_NTC4_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,128.57,726.52 -256,352,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.3162,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0188,135.35,755.9 -256,360,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.5766,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0163,135.96,750.69 -256,368,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.6534,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,138.25,755.0 -256,376,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.5889,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0162,141.88,766.62 -256,384,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.5123,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0184,145.66,778.99 -256,392,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,6,14.9855,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0185,144.0,762.46 -256,400,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.6799,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,150.0,786.56 -256,408,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.6347,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0161,153.47,797.25 -256,416,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.862,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0164,154.09,793.18 -256,424,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,6,15.3921,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0188,151.64,773.72 -256,432,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.9275,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,159.31,805.89 -256,440,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.957,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0161,161.94,812.38 -256,448,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.266,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0163,161.55,803.85 -256,456,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.4627,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0162,162.34,801.44 -256,464,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.3917,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,165.96,812.99 -256,472,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.3552,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0163,169.22,822.79 -256,480,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.4679,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,170.83,824.61 -256,488,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.5389,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0162,172.89,828.62 -256,496,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.6968,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,173.95,827.98 -256,504,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.6448,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0162,177.35,838.45 -256,512,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.667,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0161,179.91,844.98 -256,1,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.3967,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0283,1.75,1750.26 -256,2,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.0166,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0298,3.66,1835.3 -256,8,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.0761,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0295,14.54,1833.95 -256,16,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.7294,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0303,26.91,1711.71 -256,24,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.8754,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0217,35.68,1526.35 -256,40,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.3428,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0209,56.77,1482.71 -256,48,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,11.8915,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0205,59.26,1300.63 -256,56,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.9143,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0217,75.32,1429.09 -256,64,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440911,0,12.1182,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,77.53,1297.94 -256,72,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,13.2099,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0214,80.01,1200.59 -256,80,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,13.2676,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0213,88.52,1205.25 -256,88,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,13.6796,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0214,94.44,1178.53 -256,96,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.0885,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.018,100.03,1153.63 -256,104,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.1454,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0177,107.93,1158.26 -256,112,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.3991,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0161,114.19,1146.95 -256,120,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.5062,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0159,121.44,1147.52 -256,128,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.5238,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0163,129.38,1155.15 -256,136,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.6787,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0163,136.01,1151.89 -256,144,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.633,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0161,144.46,1164.45 -256,152,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.802,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,150.75,1160.01 -256,160,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.0656,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,155.91,1148.41 -256,168,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.0442,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,163.93,1158.76 -256,176,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.2535,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,169.38,1151.45 -256,184,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.1892,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,177.83,1164.95 -256,192,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.3065,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,184.14,1164.59 -256,200,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.4285,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0163,190.3,1163.88 -256,208,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.5531,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0162,196.32,1162.98 -256,216,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.7414,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0162,201.44,1157.39 -256,224,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,16.0465,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0162,204.93,1143.56 -256,232,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,16.011,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0161,212.71,1154.28 -256,240,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.9839,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0163,220.42,1164.44 -256,248,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.2255,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0165,224.38,1155.18 -256,256,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.5219,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0163,227.46,1142.38 -256,264,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.3885,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0161,236.48,1159.68 -256,272,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.5055,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0162,241.92,1159.4 -256,280,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.8484,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0162,243.96,1143.59 -256,288,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.6915,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0165,253.29,1162.19 -256,296,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,17.5214,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0137,248.0,1114.62 -256,304,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,17.5882,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0139,253.73,1117.84 -256,312,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439939,0,17.6727,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA5_NTB1_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,259.17,1119.91 -256,320,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,17.8718,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0138,262.85,1114.77 -256,328,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.2496,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0137,263.84,1098.87 -256,336,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.1987,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0139,271.04,1109.15 -256,344,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439801,0,18.2568,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA7_NTB3_NTC2_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,276.61,1112.8 -256,376,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.8005,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0138,293.59,1108.5 -256,384,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.8832,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0138,298.53,1110.59 -256,4,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,9.3715,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0286,12.92,3238.74 -256,8,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,9.7582,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0288,24.82,3118.0 -256,16,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.7144,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0212,45.21,2853.6 -256,24,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,12.8635,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,56.49,2388.39 -256,32,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.0746,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0219,74.1,2361.19 -256,40,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,14.5861,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0217,83.03,2126.68 -256,48,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.8189,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0181,98.07,2103.29 -256,56,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440034,0,15.2733,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA1_NTB3_NTC3_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,111.01,2050.44 -256,72,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,7,15.5887,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0197,139.84,2028.0 -256,88,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,6,16.3983,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0179,162.48,1945.99 -256,360,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440054,0,7.306,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA2_NTB2_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,206.67,1028.2 -256,1,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,4.9485,auto,0.0,1.48,1486.39 -256,48,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439773,0,5.878,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA7_NTB1_NTC3_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,59.94,1374.16 -256,72,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,6.5814,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,80.3,1283.31 -256,80,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,6.6122,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,88.81,1295.91 -256,120,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,280,1,6.8966,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,127.72,1331.56 -256,128,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,6.8938,auto,0.0,136.29,1349.92 +256,8,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.8287,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0316,5.63,718.62 +256,16,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.9866,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0308,11.03,719.54 +256,24,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.0995,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0318,16.31,724.43 +256,32,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.1003,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0298,21.75,739.28 +256,40,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.1775,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0303,26.93,747.07 +256,48,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.1109,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0225,29.0,683.8 +256,56,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,14,9.3551,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0279,32.95,678.86 +256,64,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.8653,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0312,39.74,730.0 +256,72,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.603,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,41.27,686.51 +256,80,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.2927,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0216,47.39,722.43 +256,88,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,14,9.9299,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0281,48.79,688.24 +256,96,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,14,10.5219,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0291,50.23,661.0 +256,104,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.7243,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0213,53.39,659.8 +256,112,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,11.0639,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0217,55.73,650.46 +256,120,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.5619,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0213,62.55,692.82 +256,128,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,12.0315,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0225,58.57,618.24 +256,136,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,11.9587,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,62.61,632.11 +256,144,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,9,12.4755,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0224,63.54,615.61 +256,168,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,12.5098,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0228,73.93,642.9 +256,176,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,12.823,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,75.56,636.62 +256,184,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,12.9575,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,78.17,639.33 +256,192,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.1135,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,80.6,640.94 +256,200,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,13.004,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0218,84.67,655.63 +256,208,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,12.9955,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0221,88.11,665.36 +256,216,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,13.5066,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0216,88.04,649.13 +256,224,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,13.6487,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,90.35,651.22 +256,232,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.4444,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0221,95.0,670.1 +256,240,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.5075,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,97.81,675.92 +256,248,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.6196,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0227,100.24,679.23 +256,256,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,13.8836,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0185,101.51,675.02 +256,264,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.0505,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0182,103.44,675.6 +256,272,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.142,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0185,105.88,679.77 +256,280,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.0565,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0184,109.66,692.5 +256,288,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.17,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.019,111.89,695.48 +256,296,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.0038,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0186,116.36,712.36 +256,304,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.3644,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0187,116.51,702.89 +256,312,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.2119,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0189,120.85,718.94 +256,320,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.0163,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0187,125.68,737.59 +256,328,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.1727,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0188,127.4,737.98 +256,336,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.2772,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0162,129.56,741.04 +256,344,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.3755,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0184,131.73,744.38 +256,352,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.5901,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,132.81,741.71 +256,360,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.4607,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0187,137.05,756.7 +256,368,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.5298,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0186,139.43,761.42 +256,376,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.5702,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0161,142.06,767.6 +256,384,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.6341,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0188,144.45,772.51 +256,392,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.7176,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,146.63,776.34 +256,400,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.7254,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,149.54,784.13 +256,408,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.8111,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0161,151.65,787.75 +256,416,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.8643,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,154.07,793.06 +256,424,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.0387,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,155.21,791.9 +256,432,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.9946,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,158.6,802.29 +256,440,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.9368,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,162.16,813.48 +256,448,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.0012,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,164.4,818.04 +256,456,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.275,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0161,164.34,811.29 +256,464,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.3249,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0164,166.68,816.53 +256,472,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.4675,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,167.99,816.82 +256,480,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.4382,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,171.16,826.19 +256,488,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.4672,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0164,173.69,832.46 +256,496,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.5009,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,176.15,838.44 +256,504,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.5875,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,178.0,841.54 +256,512,384,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.5896,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.016,180.8,849.17 +256,1,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.4015,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0322,1.75,1749.26 +256,2,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.9899,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0269,3.67,1841.43 +256,8,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.6843,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0272,13.52,1705.51 +256,16,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,8.3784,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0298,28.03,1783.42 +256,24,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.5427,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0212,36.92,1579.56 +256,40,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,9.9028,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0215,59.3,1548.59 +256,48,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.8668,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0211,64.84,1423.28 +256,56,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.662,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0208,77.1,1462.91 +256,64,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,177,8,12.0685,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0211,77.85,1303.28 +256,72,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,13.1302,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0213,80.5,1207.88 +256,80,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.4925,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0213,87.04,1185.16 +256,88,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,13.6987,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0213,94.3,1176.89 +256,96,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,5,14.0943,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0158,99.99,1153.16 +256,104,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.2129,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.018,107.42,1152.76 +256,112,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,7,14.2435,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0194,115.43,1159.48 +256,120,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.3959,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0181,122.37,1156.31 +256,128,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.5892,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0161,128.8,1149.98 +256,136,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.714,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0162,135.69,1149.13 +256,144,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,5,14.569,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0161,145.1,1169.56 +256,152,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,7,14.7479,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0206,151.3,1164.26 +256,160,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,6,14.9606,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0184,157.0,1156.47 +256,168,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,14.9201,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,165.3,1168.4 +256,176,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.243,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,169.5,1152.24 +256,184,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.2612,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0162,176.99,1159.46 +256,192,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,15.3106,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0163,184.09,1164.28 +256,200,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.415,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0162,190.46,1164.9 +256,208,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.5644,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0162,196.18,1162.14 +256,216,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.6173,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0162,203.04,1166.59 +256,224,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.9662,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0163,205.96,1149.31 +256,232,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.8927,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0161,214.3,1162.87 +256,240,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,5,15.9019,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0161,221.56,1170.44 +256,248,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.1733,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0162,225.1,1158.9 +256,256,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.642,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0163,225.82,1134.14 +256,264,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.4312,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0162,235.86,1156.67 +256,272,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.4581,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0161,242.61,1162.74 +256,280,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.5508,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0163,248.35,1164.15 +256,288,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,5,16.6374,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0162,254.12,1165.97 +256,296,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,17.3769,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0138,250.06,1123.89 +256,304,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,17.5554,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0138,254.21,1119.93 +256,312,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,17.6566,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0137,259.4,1120.93 +256,320,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,17.8904,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0138,262.58,1113.61 +256,328,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.1946,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0138,264.64,1102.2 +256,336,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.2072,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0139,270.91,1108.63 +256,344,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.2544,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0138,276.64,1112.95 +256,376,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.8057,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0139,293.51,1108.2 +256,384,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.8317,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0139,299.34,1113.63 +256,4,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,8.82,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0279,13.73,3441.26 +256,8,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.9139,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0185,22.19,2787.83 +256,16,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.4057,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0223,46.56,2938.25 +256,24,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,12.8415,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0218,56.59,2392.48 +256,32,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,13.4509,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,72.03,2295.13 +256,40,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,14.577,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0216,83.08,2128.01 +256,48,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.7667,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0181,98.42,2110.73 +256,56,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,15.4235,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0181,109.93,2030.47 +256,72,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,6,15.6998,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0177,138.85,2013.65 +256,88,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,7,16.2598,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.02,163.87,1962.56 +256,360,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,280,1,7.4475,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,202.75,1008.67 +256,1,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,5.3991,auto,0.0,1.36,1362.34 +256,48,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,170,1,6.2363,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,56.5,1295.21 +256,72,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,6.6171,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,79.87,1276.38 +256,80,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,6.5744,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,89.32,1303.36 +256,120,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,280,1,6.9203,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,127.28,1327.0 +256,128,7168,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,6.3033,auto,0.0,149.05,1476.39 diff --git a/aiter/configs/model_configs/llama405B_bf16_tuned_gemm.csv b/aiter/configs/model_configs/llama405B_bf16_tuned_gemm.csv index 96e4fc2d08..f1b1e1eb26 100644 --- a/aiter/configs/model_configs/llama405B_bf16_tuned_gemm.csv +++ b/aiter/configs/model_configs/llama405B_bf16_tuned_gemm.csv @@ -1,157 +1,157 @@ cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw -256,1,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,167,16,15.5965,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0343,4.84,4843.06 -256,16,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,167,16,17.691,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0337,68.28,4301.37 -256,32,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440300,0,20.6808,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA6_NTB3_NTC1_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1,0.0,116.82,3708.44 -256,64,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439712,0,22.8339,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC3_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,211.61,3411.14 -256,128,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439782,0,27.4051,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,352.62,2929.44 -256,256,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440199,0,35.505,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,544.36,2395.88 -256,512,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440417,0,50.9414,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA1_NTB2_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,758.81,1857.7 -256,1024,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440657,0,82.9033,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,932.53,1372.33 -256,2048,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440214,0,152.5136,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB1_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1013.8,996.92 -256,4096,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439533,0,255.1222,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB1_NTC7_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1212.12,896.0 -256,8192,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440614,0,490.0125,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1262.16,778.92 -256,16384,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440195,0,913.2465,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB3_NTC7_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1354.45,753.21 -256,32768,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440526,0,1647.3102,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC5_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1501.78,789.31 -256,1,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439912,0,28.5502,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA4_NTB3_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1,0.0,5.29,5290.22 -256,16,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,32.9676,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0344,73.28,4600.48 -256,32,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441403,0,32.4566,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,148.87,4693.6 -256,64,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441380,0,36.2393,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,266.66,4240.75 -256,128,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440128,0,41.3924,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA2_NTB2_NTC1_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,466.93,3777.72 -256,256,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441285,0,57.4974,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,672.29,2813.05 -256,512,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440587,0,86.7189,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,891.49,1989.08 -256,1024,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440436,0,153.3786,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB3_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1008.09,1264.76 -256,2048,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439533,0,244.429,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB1_NTC7_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1265.14,969.52 -256,4096,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440625,0,477.1746,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA3_NTB2_NTC1_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1296.12,676.82 -256,8192,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440614,0,908.5186,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1361.5,544.76 -256,16384,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440520,0,1679.2573,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1473.21,499.54 -256,32768,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440620,0,3238.9311,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1527.6,471.37 -256,1,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440308,0,50.3359,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1,0.0,6.0,6000.51 -256,16,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440308,0,50.8541,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1,0.0,95.01,5954.47 -256,32,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439915,0,53.3772,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA5_NTB2_NTC1_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,181.05,5688.35 -256,64,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440176,0,58.9864,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA3_NTB0_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,327.66,5175.2 -256,128,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440779,0,68.3547,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,565.5,4513.86 -256,256,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440842,0,94.4602,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT144x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,818.43,3335.77 -256,512,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440842,0,152.756,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT144x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,1012.19,2148.55 -256,1024,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439533,0,262.6958,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB1_NTC7_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1177.17,1349.16 -256,2048,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440282,0,494.0612,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1251.82,823.48 -256,4096,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439452,0,937.4531,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1319.48,545.85 -256,8192,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440624,0,1638.2233,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1510.11,440.37 -256,16384,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440534,0,3231.1473,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1531.28,353.08 -256,32768,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440454,0,6501.6217,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB1_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1522.02,304.5 -256,1,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440628,0,14.4737,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,4.64,4639.15 -256,1,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,26.6447,auto,0.0,5.04,5038.85 -256,1,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440628,0,43.1686,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,6.22,6219.44 -256,16,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439704,0,13.9792,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,76.81,4842.82 -256,16,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439843,0,27.698,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB2_NTC6_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,77.53,4869.42 -256,16,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440554,0,43.2882,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB1_NTC2_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,99.22,6219.29 -256,32,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,14.6606,auto,0.0,146.48,4657.96 -256,32,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,25.4147,auto,0.0,169.0,5332.68 -256,32,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439912,0,47.9929,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA4_NTB3_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1,0.0,178.98,5626.01 -256,64,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441387,0,17.5039,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,245.37,3968.72 -256,64,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439385,0,28.853,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC2_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,297.71,4742.63 -256,64,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440890,0,50.3817,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB0_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,340.99,5390.47 -256,128,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439853,0,19.9532,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB0_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,430.5,3599.8 -256,128,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439564,0,40.9366,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA3_NTB2_NTC7_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,419.67,3406.75 -256,128,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440858,0,60.7737,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,565.37,4520.49 -256,256,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440209,0,24.2433,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA7_NTB2_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,708.64,3157.41 -256,256,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440456,0,42.8134,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB1_NTC2_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,802.55,3379.86 -256,256,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440459,0,78.48,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA5_NTB3_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,875.63,3580.76 -256,512,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440203,0,34.9957,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB3_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,981.83,2456.97 -256,512,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441313,0,67.55,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1017.31,2297.4 -256,512,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441313,0,126.5963,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1085.65,2319.19 -256,1024,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440452,0,54.7768,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC4_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1254.54,1914.27 -256,1024,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440548,0,107.5718,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1277.65,1637.61 -256,1024,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,193.6001,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1419.82,1646.52 -256,2048,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439532,0,115.7963,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1186.9,1231.53 -256,2048,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440491,0,199.3694,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC5_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1378.74,1093.97 -256,2048,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440425,0,378.0526,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1454.18,976.32 -256,4096,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440528,0,212.4243,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1294.0,1026.74 -256,4096,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440575,0,374.6172,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC7_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1467.51,806.13 -256,4096,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439532,0,729.5152,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1507.18,643.94 -256,8192,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440578,0,390.9707,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1406.13,944.06 -256,8192,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,741.8084,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1482.2,633.27 -256,8192,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440483,0,1412.7149,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1556.59,475.03 -256,16384,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440507,0,785.7076,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC4_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1399.39,854.12 -256,16384,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440490,0,1462.6083,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC3_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1503.49,550.6 -256,16384,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439464,0,2787.2101,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1577.94,385.24 -256,32768,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440624,0,1524.068,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1442.86,836.62 -256,32768,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441460,0,2882.9014,Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950,0.0,1525.56,512.12 -256,32768,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440453,0,6064.912,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1450.32,309.82 -256,1,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440628,0,68.6404,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,6.35,6355.83 -256,16,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440291,0,70.933,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA6_NTB2_NTC3_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,98.39,6162.97 -256,32,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440891,0,71.8762,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC3_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,194.2,6095.32 -256,64,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440855,0,80.5537,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,346.57,5462.3 -256,128,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440551,0,95.4707,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA5_NTB2_NTC5_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,584.83,4648.65 -256,256,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440226,0,129.7321,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA4_NTB2_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,860.77,3479.57 -256,512,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440398,0,205.6306,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1086.11,2269.2 -256,1024,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,334.1007,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1336.95,1487.65 -256,2048,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440534,0,640.1636,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1395.51,871.41 -256,4096,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439532,0,1277.3354,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1398.78,531.95 -256,8192,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,2356.24,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1516.57,391.62 -256,16384,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,4603.5971,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1552.44,306.13 -256,32768,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,9181.1834,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1556.84,259.48 -256,1,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439704,0,35.2163,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,6.19,6194.57 -256,1,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440628,0,64.8462,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,6.73,6727.72 -256,1,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440146,0,129.1826,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1,0.0,6.75,6754.02 -256,16,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440907,0,35.6376,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,97.92,6140.74 -256,16,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440291,0,65.4229,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA6_NTB2_NTC3_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,106.68,6682.03 -256,16,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439876,0,130.1849,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1,0.0,107.22,6711.93 -256,32,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439637,0,37.104,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,188.1,5917.92 -256,32,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440891,0,67.9373,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC3_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,205.46,6448.71 -256,32,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440860,0,131.496,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,212.31,6655.47 -256,64,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440860,0,40.4189,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,345.35,5469.05 -256,64,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440860,0,71.7118,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,389.3,6135.79 -256,64,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440860,0,135.5425,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,411.93,6477.08 -256,128,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440068,0,47.8218,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA6_NTB2_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,583.78,4684.1 -256,128,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440209,0,87.6494,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA7_NTB2_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,637.02,5063.47 -256,128,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440459,0,157.008,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA5_NTB3_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,711.23,5626.63 -256,256,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440459,0,67.7717,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA5_NTB3_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,823.86,3392.28 -256,256,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439559,0,118.6429,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA7_NTB1_NTC0_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,941.22,3804.8 -256,256,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440767,0,210.7823,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1059.57,4243.41 -256,512,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440203,0,103.243,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB3_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1081.61,2341.05 -256,512,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440584,0,192.4251,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1160.65,2424.92 -256,512,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440226,0,344.0903,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA4_NTB2_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1298.14,2663.42 -256,1024,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,160.8002,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1388.92,1649.81 -256,1024,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,302.0495,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1478.82,1645.51 -256,1024,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,589.1786,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1516.27,1630.23 -256,2048,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,307.7937,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1451.22,1015.21 -256,2048,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440588,0,584.2325,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1529.11,954.83 -256,2048,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440587,0,1141.2861,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1565.52,918.77 -256,4096,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440508,0,587.1169,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1521.59,692.96 -256,4096,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440508,0,1130.843,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1579.98,600.86 -256,4096,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440588,0,2253.5103,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1585.71,543.48 -256,8192,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440624,0,1140.9894,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1565.93,522.0 -256,8192,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440624,0,2243.6419,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1592.68,411.27 -256,8192,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440624,0,4521.6701,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1580.57,348.78 -256,16384,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439467,0,2278.6755,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC3_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1568.2,427.04 -256,16384,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440588,0,4505.5017,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1586.24,312.79 -256,16384,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440624,0,9044.7089,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1580.33,252.27 -256,32768,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439467,0,4536.6066,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC3_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1575.37,380.91 -256,32768,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440624,0,9028.2301,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1583.22,263.88 -256,32768,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440588,0,18162.3989,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1573.98,203.22 -256,1,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440308,0,126.776,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1,0.0,6.88,6882.23 -256,16,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440308,0,128.632,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1,0.0,108.52,6792.96 -256,32,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439915,0,135.9529,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA5_NTB2_NTC1_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,205.35,6437.29 -256,64,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440860,0,141.6186,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,394.26,6199.19 -256,128,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440459,0,163.1593,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA5_NTB3_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,684.42,5414.5 -256,256,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439557,0,226.6243,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT14_4_MO40_NTn1_NTA0_NTB2_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,985.5,3946.78 -256,512,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,349.7922,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1276.98,2620.0 -256,1024,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,655.5621,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1362.73,1465.15 -256,2048,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1315.3294,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1358.37,797.2 -256,4096,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,2340.3623,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1526.86,523.31 -256,8192,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,4560.5649,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1567.09,345.8 -256,16384,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,9129.8796,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1565.59,249.92 -256,32768,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,18289.8411,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1563.02,201.81 -256,1,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440308,0,251.18,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1,0.0,6.95,6947.09 -256,16,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440308,0,251.5826,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1,0.0,110.97,6944.27 -256,32,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439912,0,258.6972,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA4_NTB3_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1,0.0,215.83,6761.91 -256,64,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440771,0,281.3671,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,396.88,6232.94 -256,128,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440779,0,286.8183,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,778.68,6145.55 -256,256,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440767,0,391.3087,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1141.49,4550.07 -256,512,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,695.1809,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1285.07,2612.46 -256,1024,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1321.0685,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1352.47,1428.72 -256,2048,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439532,0,2386.3828,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1497.42,850.68 -256,4096,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,4533.2258,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1576.54,510.73 -256,8192,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,9070.8882,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1575.77,318.13 -256,16384,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,18214.2616,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1569.5,221.06 -256,32768,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,36485.4726,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1567.05,172.9 +256,1,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,167,16,15.8275,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0321,4.77,4772.38 +256,16,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,167,16,17.7837,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0354,67.93,4278.95 +256,32,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,167,16,21.104,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0337,114.48,3634.07 +256,64,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,7,23.9613,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0226,201.65,3250.64 +256,128,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,35.5248,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.013,272.03,2259.87 +256,256,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,371,4,55.1808,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0161,350.26,1541.58 +256,512,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,98.3036,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,393.22,962.67 +256,1024,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,145.2088,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,532.4,783.5 +256,2048,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,238.9957,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,646.95,636.18 +256,4096,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,327.4554,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,944.37,698.08 +256,8192,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,677.7405,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,912.55,563.17 +256,16384,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1109.5955,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1114.78,619.92 +256,32768,2304,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1981.8193,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1248.3,656.08 +256,1,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,29.5571,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0354,5.11,5110.0 +256,16,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,33.1405,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0335,72.9,4576.48 +256,32,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,38.2897,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0158,126.19,3978.57 +256,64,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,42.02,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0131,229.98,3657.35 +256,128,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,84,8,66.3889,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0244,291.12,2355.35 +256,256,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,331,4,100.3019,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0161,385.38,1612.56 +256,512,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,145.7193,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,530.54,1183.72 +256,1024,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,251.3842,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,615.07,771.67 +256,2048,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,298.7333,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1035.16,793.28 +256,4096,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,598.5374,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1033.31,539.58 +256,8192,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,953.3962,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1297.42,519.12 +256,16384,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1739.3206,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1422.34,482.29 +256,32768,4608,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,3408.2792,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1451.7,447.95 +256,1,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,195,16,65.2799,flydsl_gemm2_abf16_wbf16_bf16_t48x128x128_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.031,4.63,4626.86 +256,16,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,68.6305,auto,0.0,70.4,4412.16 +256,32,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,73.2489,auto,0.0,131.93,4145.16 +256,64,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,241,4,84.768,flydsl_gemm2_abf16_wbf16_bf16_t64x128x64_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0159,228.0,3601.2 +256,128,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,26,4,110.2749,flydsl_gemm2_abf16_wbf16_bf16_t128x128x64_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0159,350.53,2797.95 +256,256,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,153.5055,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,503.63,2052.68 +256,512,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,262.2584,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,589.57,1251.45 +256,1024,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,286.2339,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1080.37,1238.21 +256,2048,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,597.7434,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1034.68,680.64 +256,4096,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,944.2369,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1310.0,541.92 +256,8192,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1710.6163,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1446.2,421.73 +256,16384,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,3321.8968,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1489.45,343.43 +256,32768,9216,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,6749.5084,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1466.12,293.31 +256,1,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,195,16,82.8507,flydsl_gemm2_abf16_wbf16_bf16_t48x128x128_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0333,5.26,5265.7 +256,16,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,78.0998,auto,0.0,89.36,5597.43 +256,32,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,81.5625,auto,0.0,171.14,5371.44 +256,64,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,221,2,109.5182,flydsl_gemm2_abf16_wbf16_bf16_t64x128x128_split_k2_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0074,254.91,4017.68 +256,128,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,6,2,135.6846,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k2_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0075,411.5,3270.89 +256,256,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,203.5247,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,548.68,2217.97 +256,512,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,281.769,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,792.63,1656.02 +256,1024,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,335.4867,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1331.43,1481.5 +256,2048,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,643.543,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1388.18,866.83 +256,4096,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1298.0192,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1376.49,523.47 +256,8192,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2452.1446,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1457.26,376.3 +256,16384,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,4823.026,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1481.81,292.2 +256,32768,13312,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,9990.3785,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1430.74,238.47 +256,1,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,89,2,14.8968,flydsl_gemm2_abf16_wbf16_bf16_t16x128x128_split_k2_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0028,4.5,4507.39 +256,1,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,116,4,25.8661,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0123,5.19,5190.53 +256,1,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,116,4,38.7993,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.014,5.62,5622.52 +256,1,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,50.4794,auto,0.0,5.32,5318.7 +256,1,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,119,8,73.9631,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0237,5.9,5898.44 +256,1,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,131,4,144.3838,flydsl_gemm2_abf16_wbf16_bf16_t32x128x64_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.019,6.04,6042.93 +256,16,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,14.4467,auto,0.0,74.32,4686.1 +256,16,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.7443,auto,0.0,90.44,5680.23 +256,16,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,37.3119,auto,0.0,93.53,5865.18 +256,16,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,44.7779,auto,0.0,95.92,6012.38 +256,16,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,70.9555,auto,0.0,98.36,6161.01 +256,16,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,116,4,146.0885,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0175,95.55,5981.25 +256,32,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,14.6697,auto,0.0,146.39,4655.07 +256,32,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,24.9617,auto,0.0,172.06,5429.46 +256,32,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,39.4472,auto,0.0,176.93,5566.39 +256,32,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,47.1387,auto,0.0,182.23,5727.95 +256,32,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,75.4473,auto,0.0,185.01,5806.81 +256,32,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,116,4,152.3247,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0173,183.27,5745.41 +256,64,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,18.9896,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,226.17,3658.22 +256,64,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,33.4354,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,256.91,4092.64 +256,64,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,50.5166,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,276.32,4375.85 +256,64,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,66.9741,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,256.52,4055.02 +256,64,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,94.2083,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,296.34,4670.59 +256,64,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,241,4,172.4583,flydsl_gemm2_abf16_wbf16_bf16_t64x128x64_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0172,323.76,5090.62 +256,128,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.3261,auto,0.0,368.25,3079.27 +256,128,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,42.8585,auto,0.0,400.85,3253.98 +256,128,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,69.0302,auto,0.0,404.42,3244.99 +256,128,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,81.6344,auto,0.0,420.9,3365.33 +256,128,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,6,2,123.6218,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k2_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0071,451.66,3590.06 +256,128,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,6,2,218.6242,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k2_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0082,510.78,4040.84 +256,256,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,32.8485,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,523.0,2330.28 +256,256,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,61.6719,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,557.14,2346.34 +256,256,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,94.7682,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,589.17,2425.92 +256,256,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,114.8498,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,598.34,2446.83 +256,256,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,179.7454,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,621.26,2511.4 +256,256,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,349.3089,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,639.37,2560.59 +256,512,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,47.2723,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,726.85,1818.89 +256,512,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,83.3091,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,824.87,1862.81 +256,512,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,127.3264,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,877.03,1898.25 +256,512,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,155.7206,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,882.6,1885.44 +256,512,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,235.6216,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,947.87,1980.36 +256,512,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,459.4132,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,972.28,1994.84 +256,1024,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,60.4762,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1136.31,1733.87 +256,1024,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,107.397,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1279.73,1640.28 +256,1024,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,164.2949,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1359.37,1614.72 +256,1024,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,197.3416,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1392.9,1615.31 +256,1024,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,312.555,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1429.11,1590.2 +256,1024,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,596.8639,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1496.75,1609.24 +256,2048,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,115.295,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1192.06,1236.88 +256,2048,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,201.0475,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1367.23,1084.84 +256,2048,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,312.2677,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1430.43,1000.67 +256,2048,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,370.4202,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1484.14,996.43 +256,2048,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,588.2507,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1518.66,948.31 +256,2048,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1158.5257,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1542.22,905.1 +256,4096,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,209.7933,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1310.23,1039.61 +256,4096,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,384.44,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1430.02,785.53 +256,4096,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,594.3976,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1502.96,684.47 +256,4096,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,715.0195,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1537.74,656.99 +256,4096,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1180.0684,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1514.07,575.79 +256,4096,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2315.9959,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1542.93,528.82 +256,8192,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,402.4127,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1366.15,917.21 +256,8192,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,735.8522,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1494.2,638.39 +256,8192,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1172.104,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1524.36,508.14 +256,8192,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1418.3458,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1550.41,473.15 +256,8192,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2350.1553,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1520.5,392.63 +256,8192,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,4679.1879,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1527.36,337.04 +256,16384,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,791.7086,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1388.78,847.65 +256,16384,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1468.7844,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1497.17,548.28 +256,16384,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2338.6747,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1527.96,416.08 +256,16384,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2840.907,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1548.11,377.96 +256,16384,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,4751.9617,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1503.97,296.57 +256,16384,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,9811.8509,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1456.77,232.55 +256,32768,16384,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1567.4338,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1402.94,813.48 +256,32768,16384,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2904.4945,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1514.22,508.31 +256,32768,16384,6656,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,4677.1636,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1528.03,369.47 +256,32768,16384,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,5762.8133,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1526.35,326.06 +256,32768,16384,13312,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,9939.0603,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1438.13,239.7 +256,32768,16384,26624,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,20923.7314,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1366.26,176.4 +256,1,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,137,16,161.3276,flydsl_gemm2_abf16_wbf16_bf16_t32x256x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0338,5.41,5408.26 +256,16,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,150.2708,auto,0.0,92.89,5814.78 +256,32,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,159.5509,auto,0.0,174.97,5485.19 +256,64,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,253,2,184.9658,flydsl_gemm2_abf16_wbf16_bf16_t64x256x128_split_k2_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0074,301.86,4746.39 +256,128,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,224.7686,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,496.82,3930.38 +256,256,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,20,1,314.5993,flydsl_gemm2_abf16_wbf16_bf16_t128x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,709.91,2843.09 +256,512,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,354.3556,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1260.53,2586.26 +256,1024,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,656.6545,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1360.46,1462.71 +256,2048,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1299.8911,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1374.5,806.66 +256,4096,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2451.9553,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1457.37,499.49 +256,8192,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,4861.9861,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1469.94,324.37 +256,16384,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,9935.8309,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1438.6,229.64 +256,32768,26624,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,20992.3261,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1361.8,175.83 +256,1,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,300.7058,auto,0.0,5.8,5802.91 +256,16,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,291.4735,auto,0.0,95.78,5993.89 +256,32,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,310.0882,auto,0.0,180.06,5641.26 +256,64,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,250,1,321.9447,flydsl_gemm2_abf16_wbf16_bf16_t64x256x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,346.86,5447.34 +256,128,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,362.9834,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,615.29,4856.02 +256,256,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,404.1413,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1105.25,4405.59 +256,512,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,692.1352,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1290.72,2623.96 +256,1024,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1298.7276,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1375.74,1453.3 +256,2048,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2445.8219,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1461.03,830.0 +256,4096,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,4796.6589,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1489.96,482.68 +256,8192,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,9742.2812,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1467.18,296.2 +256,16384,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,20953.6725,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1364.31,192.16 +256,32768,53248,16384,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,39969.08,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1430.47,157.83 diff --git a/aiter/configs/model_configs/llama70B_bf16_tuned_gemm.csv b/aiter/configs/model_configs/llama70B_bf16_tuned_gemm.csv index fe2c088d96..6c50c6bf1e 100644 --- a/aiter/configs/model_configs/llama70B_bf16_tuned_gemm.csv +++ b/aiter/configs/model_configs/llama70B_bf16_tuned_gemm.csv @@ -1,157 +1,157 @@ cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw -256,64,192,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,3.7986,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0124,6.63,144.49 -256,1,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,167,16,7.3327,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0305,2.86,2862.58 -256,16,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,105,16,8.7513,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0287,38.34,2431.02 -256,32,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,177,8,12.0725,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0214,55.59,1787.35 -256,64,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.6099,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0216,98.62,1629.99 -256,128,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,15.0191,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.018,178.73,1557.77 -256,256,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439930,0,17.7489,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA3_NTB1_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,302.48,1454.8 -256,512,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440320,0,23.6725,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_12_MO40_NTn1_NTA0_NTB0_NTC1_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,453.58,1295.63 -256,1024,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440366,0,33.9749,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA3_NTB7_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,632.08,1188.24 -256,2048,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441305,0,52.6186,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,816.25,1135.89 -256,4096,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440465,0,89.5077,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA3_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,959.69,1101.2 -256,8192,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440465,0,147.843,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA3_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1162.03,1191.54 -256,16384,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440417,0,267.8494,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA1_NTB2_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1282.8,1237.08 -256,32768,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440527,0,490.6775,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1400.5,1307.84 -256,1,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,167,16,10.3178,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0312,4.07,4067.2 -256,16,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,93,8,12.6898,flydsl_gemm2_abf16_wbf16_bf16_t16x128x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0214,52.88,3332.37 -256,32,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.6964,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0221,97.99,3112.58 -256,64,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441380,0,15.5877,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,172.21,2779.07 -256,128,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439851,0,18.0634,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT6_2_MO40_NTn1_NTA3_NTB1_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,297.21,2474.37 -256,256,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441367,0,24.629,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,435.97,1926.51 -256,512,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440459,0,34.5297,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA5_NTB3_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,621.92,1533.55 -256,1024,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440417,0,53.207,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA1_NTB2_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,807.22,1202.16 -256,2048,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439562,0,91.6693,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA0_NTB3_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,937.06,937.97 -256,4096,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441305,0,138.3141,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1242.09,940.06 -256,8192,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440452,0,268.4801,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC4_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1279.79,812.36 -256,16384,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440606,0,484.6137,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA1_NTB0_NTC7_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1418.03,813.56 -256,32768,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440587,0,909.8416,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1510.58,820.57 -256,1,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,103,8,6.3379,flydsl_gemm2_abf16_wbf16_bf16_t16x256x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0112,2.65,2650.03 -256,1,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440304,0,8.761,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB0_NTC2_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,3.83,3832.32 -256,1,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440286,0,23.1091,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB2_NTC3_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,5.81,5809.42 -256,16,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439772,0,6.5729,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,40.84,2597.35 -256,16,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439977,0,9.1174,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB2_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,58.88,3716.2 -256,16,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440553,0,24.1955,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,88.76,5568.89 -256,32,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,7.1053,auto,0.0,75.56,2444.24 -256,32,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440174,0,9.8977,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,108.48,3456.34 -256,32,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439842,0,27.1079,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,158.44,4989.92 -256,64,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441407,0,12.0362,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,178.42,2896.69 -256,64,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439986,0,30.9767,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA3_NTB3_NTC7_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,277.3,4400.56 -256,128,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440262,0,9.1681,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB3_NTC5_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,234.23,2087.29 -256,128,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441404,0,13.5214,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,317.64,2675.45 -256,128,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439646,0,37.1858,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC7_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,462.0,3722.17 -256,256,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440196,0,11.4458,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA4_NTB2_NTC3_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,375.24,1878.05 -256,256,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440054,0,17.9759,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA2_NTB2_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,477.86,2158.3 -256,256,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440197,0,47.9098,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA5_NTB0_NTC6_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,717.18,2976.56 -256,512,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439871,0,15.129,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC3_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,567.78,1732.73 -256,512,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440209,0,22.6168,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA7_NTB2_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,759.61,1947.23 -256,512,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440298,0,75.6048,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA4_NTB3_NTC6_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,908.93,1997.16 -256,1024,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440203,0,20.4515,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB3_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,840.03,1743.23 -256,1024,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440412,0,34.7945,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB3_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,987.5,1567.09 -256,1024,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440491,0,121.6111,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC5_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1130.15,1379.58 -256,2048,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440620,0,31.4274,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1093.31,1734.98 -256,2048,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440543,0,54.003,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1272.51,1398.02 -256,2048,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440503,0,189.4633,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC5_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1450.82,1062.62 -256,4096,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439452,0,61.2463,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1122.02,1506.62 -256,4096,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440335,0,104.9472,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1309.6,1119.04 -256,4096,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440333,0,363.0173,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC5_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1514.41,739.46 -256,8192,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,122.5229,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1121.74,1369.31 -256,8192,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440543,0,204.6234,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1343.34,983.89 -256,8192,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440588,0,710.6239,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1547.25,566.62 -256,16384,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439452,0,232.4281,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1182.64,1371.47 -256,16384,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440544,0,395.3056,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB0_NTC7_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1390.71,933.7 -256,16384,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440502,0,1407.8259,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1562.0,476.68 -256,32768,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440528,0,453.3644,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1212.61,1369.22 -256,32768,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440624,0,779.1373,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1411.19,904.39 -256,32768,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,3026.2195,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1453.31,399.16 -256,1,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440307,0,29.3707,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1,0.0,5.71,5713.48 -256,16,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439912,0,30.8036,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA4_NTB3_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1,0.0,87.14,5465.66 -256,32,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439912,0,31.1015,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA4_NTB3_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1,0.0,172.62,5432.27 -256,64,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439915,0,34.2047,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA5_NTB2_NTC1_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,313.92,4973.92 -256,128,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440299,0,40.4695,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA5_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,530.64,4262.24 -256,256,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440791,0,57.6827,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,744.58,3072.14 -256,512,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439562,0,92.2564,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA0_NTB3_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,931.09,2023.13 -256,1024,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441305,0,149.9052,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1146.05,1371.01 -256,2048,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440465,0,293.505,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA3_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1170.67,828.84 -256,4096,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440588,0,475.9574,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1443.82,669.74 -256,8192,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440450,0,899.5932,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1527.79,522.19 -256,16384,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,1791.5084,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1534.34,430.78 -256,32768,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440534,0,3495.0812,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1572.94,393.62 -256,1,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440303,0,20.5725,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB1_NTC3_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,5.71,5710.11 -256,16,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439704,0,22.3125,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,84.22,5285.47 -256,32,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441408,0,25.9201,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,144.99,4568.79 -256,64,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439593,0,29.8084,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB3_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,252.15,4005.8 -256,128,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439908,0,35.1308,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA2_NTB0_NTC6_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,427.9,3454.88 -256,256,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440341,0,45.2671,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB1_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,664.16,2768.12 -256,512,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440657,0,71.5815,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,840.02,1860.39 -256,1024,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440203,0,116.622,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB3_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1031.19,1276.76 -256,2048,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,180.181,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1334.87,1000.97 -256,4096,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,339.9066,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1415.2,715.7 -256,8192,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440518,0,648.7957,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC5_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1482.86,568.9 -256,16384,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441281,0,1222.0475,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1574.53,507.96 -256,32768,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440524,0,2436.0256,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1579.74,461.44 -256,1,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439977,0,12.1146,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB2_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,4.85,4849.01 -256,1,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440286,0,20.3313,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB2_NTC3_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,5.78,5777.85 -256,1,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440305,0,72.551,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC3_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,6.47,6475.94 -256,16,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439704,0,12.6982,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,73.99,4653.97 -256,16,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440553,0,21.2748,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,88.32,5543.27 -256,16,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440306,0,73.9187,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC1_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2,0.0,101.68,6371.08 -256,32,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440111,0,13.6521,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB2_NTC2_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,137.64,4356.39 -256,32,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440174,0,24.2667,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,154.87,4880.08 -256,32,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440860,0,76.3662,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,196.85,6182.33 -256,64,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441407,0,16.7778,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,223.99,3589.72 -256,64,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439593,0,27.857,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB3_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,269.81,4286.41 -256,64,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440860,0,82.9978,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,362.24,5716.79 -256,128,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439919,0,20.4535,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB3_NTC0_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,367.48,3018.31 -256,128,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440108,0,33.0489,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA2_NTB1_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,454.85,3672.52 -256,128,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440459,0,93.8224,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA5_NTB3_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,640.89,5107.51 -256,256,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440341,0,26.5094,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB1_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,567.06,2442.51 -256,256,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440341,0,43.8206,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB1_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,686.09,2859.5 -256,256,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439559,0,128.3281,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA7_NTB1_NTC0_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,937.12,3807.71 -256,512,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440209,0,35.1572,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA7_NTB2_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,855.15,2013.21 -256,512,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440459,0,66.7104,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA5_NTB3_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,901.35,1996.23 -256,512,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440453,0,208.9587,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1151.03,2428.76 -256,1024,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439453,0,56.7205,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB1_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,1060.1,1460.45 -256,1024,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440483,0,113.4358,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1060.15,1312.62 -256,1024,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440440,0,353.5274,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1360.68,1542.34 -256,2048,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439467,0,92.7091,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC3_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1297.17,1153.66 -256,2048,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,172.3823,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1395.26,1046.25 -256,2048,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440505,0,618.4741,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC4_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1555.56,1003.69 -256,4096,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,177.0024,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1358.84,876.76 -256,4096,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,324.9569,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1480.31,748.62 -256,4096,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439452,0,1216.919,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1581.16,634.19 -256,8192,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,333.1274,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1444.0,755.44 -256,8192,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441281,0,627.6619,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1532.79,588.05 -256,8192,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,2437.242,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1578.95,440.56 -256,16384,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440588,0,648.9517,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1482.5,685.1 -256,16384,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440334,0,1222.429,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC4_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1574.03,507.81 -256,16384,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439464,0,4914.4576,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1566.11,341.38 -256,32768,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439467,0,1264.7815,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC3_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1521.33,656.61 -256,32768,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440588,0,2428.4901,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1584.64,462.87 -256,32768,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439464,0,10067.9005,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1528.93,286.62 -256,1,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440628,0,38.458,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,6.11,6108.64 -256,16,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440628,0,39.1379,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,96.02,6019.79 -256,32,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440891,0,40.6542,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC3_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,184.88,5813.0 -256,64,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440552,0,45.351,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA5_NTB3_NTC5_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,331.47,5242.76 -256,128,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440456,0,52.2152,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB1_NTC2_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,575.79,4608.78 -256,256,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440799,0,75.2632,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,798.92,3274.05 -256,512,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440799,0,119.4186,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1007.04,2160.05 -256,1024,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,181.075,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1328.28,1551.94 -256,2048,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439532,0,345.903,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1390.67,945.8 -256,4096,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440483,0,653.8876,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1471.31,641.44 -256,8192,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440508,0,1221.8425,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1574.79,494.32 -256,16384,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440624,0,2425.2544,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1586.76,401.23 -256,32768,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440588,0,4832.0372,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1592.82,354.15 -256,1,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440894,0,139.7265,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,6.72,6724.96 -256,16,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440894,0,138.785,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,108.31,6784.75 -256,32,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440894,0,150.5846,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,199.65,6267.03 -256,64,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440848,0,154.0302,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,390.38,6154.07 -256,128,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440564,0,170.6136,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA6_NTB2_NTC2_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,704.86,5605.07 -256,256,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439452,0,214.4058,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1121.79,4538.49 -256,512,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440226,0,377.8892,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA4_NTB2_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1272.96,2663.83 -256,1024,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,693.132,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1388.01,1549.12 -256,2048,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439532,0,1246.7409,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1543.34,968.89 -256,4096,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439452,0,2455.0899,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1567.47,601.36 -256,8192,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439464,0,4823.8046,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1595.54,417.36 -256,16384,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440588,0,9740.0474,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1580.4,316.94 -256,32768,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440397,0,20959.1243,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB1_NTC6_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1468.87,249.75 +256,64,192,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,289,8,4.3722,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0126,5.76,125.53 +256,1,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,167,16,7.365,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0305,2.85,2850.03 +256,16,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,165,16,8.9131,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0292,37.65,2386.89 +256,32,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,277,16,10.9667,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0306,61.19,1967.57 +256,64,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.6294,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,98.48,1627.65 +256,128,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,15.0329,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0179,178.57,1556.34 +256,256,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,18.8414,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0141,284.94,1370.45 +256,512,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,29.8384,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0061,359.85,1027.9 +256,1024,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,52.4555,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,409.39,769.61 +256,2048,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,76.0093,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,565.06,786.34 +256,4096,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,75,1,125.2097,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,686.04,787.21 +256,8192,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,170.9303,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1005.08,1030.6 +256,16384,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,319.2235,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1076.35,1037.99 +256,32768,1280,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,539.6049,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1273.51,1189.26 +256,1,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,167,16,9.9782,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.034,4.2,4205.62 +256,16,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,11.7557,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0208,57.09,3597.16 +256,32,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.7726,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0223,97.45,3095.36 +256,64,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,16.2682,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0185,165.01,2662.82 +256,128,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,21.6899,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0107,247.52,2060.66 +256,256,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,31.7649,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0061,338.03,1493.73 +256,512,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,53.2462,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,403.31,994.5 +256,1024,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,78.795,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,545.08,811.77 +256,2048,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,120.1702,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,714.81,715.51 +256,4096,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,163.7211,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1049.34,794.18 +256,8192,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,312.7178,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1098.75,697.45 +256,16384,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,515.5546,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1332.92,764.74 +256,32768,2560,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,965.4849,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1423.52,773.28 +256,1,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,119,8,22.8083,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0232,5.15,5150.37 +256,16,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,194,8,25.8218,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0208,72.77,4567.15 +256,32,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,29.7832,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0139,126.18,3976.19 +256,64,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,34.0572,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0062,220.69,3506.06 +256,128,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,50.1861,auto,0.0,299.53,2418.45 +256,256,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,72.0475,auto,0.0,417.29,1739.2 +256,512,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,109.5379,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,548.94,1215.74 +256,1024,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,145.6016,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,825.95,1022.64 +256,2048,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,180.6066,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1331.72,998.61 +256,4096,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,343.387,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1400.86,708.44 +256,8192,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,670.3004,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1435.29,550.65 +256,16384,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1263.3316,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1523.07,491.37 +256,32768,7168,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2526.7126,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1523.04,444.88 +256,1,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,103,8,6.882,flydsl_gemm2_abf16_wbf16_bf16_t16x256x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0099,2.44,2440.52 +256,1,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,134,8,8.7573,flydsl_gemm2_abf16_wbf16_bf16_t32x128x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0146,3.83,3833.93 +256,1,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,194,8,12.7476,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0219,4.61,4608.22 +256,1,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,119,8,22.184,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0198,5.29,5295.31 +256,1,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,103,8,26.5479,flydsl_gemm2_abf16_wbf16_bf16_t16x256x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0219,5.06,5056.92 +256,1,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,286,4,78.0769,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0173,6.02,6017.6 +256,16,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,7.3003,auto,0.0,36.77,2338.55 +256,16,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,11.2334,auto,0.0,47.79,3016.19 +256,16,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,14.7954,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0115,63.5,3994.29 +256,16,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,23.935,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0131,78.51,4927.18 +256,16,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,203,8,28.5135,flydsl_gemm2_abf16_wbf16_bf16_t48x128x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0217,75.31,4725.55 +256,16,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,79.2288,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0175,94.87,5944.07 +256,32,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,7.079,auto,0.0,75.84,2453.32 +256,32,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,11.4,auto,0.0,94.19,3000.86 +256,32,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,2,17.3213,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0048,108.48,3433.57 +256,32,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,27.5303,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0135,136.51,4301.57 +256,32,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,203,8,33.8365,flydsl_gemm2_abf16_wbf16_bf16_t48x128x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0218,126.93,3997.64 +256,32,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,82.9324,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0168,181.26,5692.85 +256,64,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,12.4356,auto,0.0,172.69,2803.66 +256,64,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,18.6886,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0048,201.09,3222.69 +256,64,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,30.8218,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0061,243.86,3874.1 +256,64,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,36.7821,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0062,233.54,3706.01 +256,64,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,286,4,97.8785,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0172,307.16,4847.65 +256,128,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,9.5917,auto,0.0,223.89,1995.11 +256,128,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,16.0441,auto,0.0,267.7,2254.78 +256,128,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,25.3581,auto,0.0,296.4,2434.52 +256,128,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,46.4994,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,323.28,2610.2 +256,128,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,51.0973,auto,0.0,336.22,2708.79 +256,128,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,9,4,131.5455,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k4_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.017,457.1,3642.84 +256,256,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,280,1,12.7252,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,337.52,1689.23 +256,256,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,21.9273,auto,0.0,391.75,1769.36 +256,256,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,35.5211,auto,0.0,423.2,1822.85 +256,256,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,67.4176,auto,0.0,445.95,1858.64 +256,256,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,75.099,auto,0.0,457.53,1898.91 +256,256,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,26,4,207.8212,flydsl_gemm2_abf16_wbf16_bf16_t128x128x64_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0172,578.67,2351.23 +256,512,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,18.1085,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,474.36,1447.63 +256,512,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,31.4793,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,545.75,1399.02 +256,512,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,53.4897,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,562.07,1323.22 +256,512,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,98.4178,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,610.96,1353.1 +256,512,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,122.3437,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,561.69,1234.19 +256,512,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,366.6769,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,655.94,1384.08 +256,1024,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,20,1,25.3837,flydsl_gemm2_abf16_wbf16_bf16_t128x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,676.81,1404.51 +256,1024,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,46.4383,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,739.9,1174.16 +256,1024,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,72.7344,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,826.7,1138.9 +256,1024,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,132.1089,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,910.3,1127.08 +256,1024,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,149.1713,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,921.35,1124.69 +256,1024,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,482.3198,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,997.34,1130.49 +256,2048,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,32.55,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1055.6,1675.14 +256,2048,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,60.4033,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1137.68,1249.89 +256,2048,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,95.4106,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1260.44,1120.99 +256,2048,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,171.7077,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1400.74,1050.36 +256,2048,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,192.9439,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1424.65,1043.45 +256,2048,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,627.237,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1533.83,989.67 +256,4096,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,65.1781,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1054.33,1415.73 +256,4096,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,113.0718,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1215.5,1038.64 +256,4096,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,177.8346,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1352.48,872.66 +256,4096,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,326.7272,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1472.29,744.56 +256,4096,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,366.8778,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1498.47,731.68 +256,4096,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1246.8325,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1543.23,618.97 +256,8192,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,121.9167,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1127.32,1376.12 +256,8192,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,209.0267,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1315.04,963.16 +256,8192,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,337.4416,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1425.54,745.78 +256,8192,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,630.3348,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1526.29,585.56 +256,8192,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,713.5721,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1540.86,564.28 +256,8192,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2497.3336,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1540.96,429.96 +256,16384,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,236.8495,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1160.56,1345.86 +256,16384,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,402.8317,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1364.73,916.26 +256,16384,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,657.6327,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1462.93,676.06 +256,16384,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1244.7931,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1545.76,498.68 +256,16384,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1410.2403,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1559.33,475.87 +256,16384,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,5002.769,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1538.46,335.36 +256,32768,8192,1024,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,463.0824,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1187.17,1340.49 +256,32768,8192,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,791.1639,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1389.74,890.64 +256,32768,8192,3584,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1300.2138,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1479.87,638.72 +256,32768,8192,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2489.0786,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1546.07,451.6 +256,32768,8192,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2825.562,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1556.52,427.51 +256,32768,8192,28672,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,10090.83,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1525.46,285.97 +256,1,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,134,8,32.691,flydsl_gemm2_abf16_wbf16_bf16_t32x128x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0223,5.13,5133.19 +256,16,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,134,8,36.3724,flydsl_gemm2_abf16_wbf16_bf16_t32x128x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0212,73.8,4628.84 +256,32,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,39.5825,auto,0.0,135.63,4268.35 +256,64,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,221,2,53.4577,flydsl_gemm2_abf16_wbf16_bf16_t64x128x128_split_k2_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0062,200.86,3182.54 +256,128,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,69.083,auto,0.0,310.86,2496.86 +256,256,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,84.2787,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,509.61,2102.66 +256,512,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,142.8453,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,601.35,1306.63 +256,1024,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,156.5017,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1097.74,1313.22 +256,2048,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,314.5911,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1092.2,773.29 +256,4096,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,507.5975,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1353.82,627.99 +256,8192,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,973.6448,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1411.59,482.48 +256,16384,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1927.7017,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1425.94,400.35 +256,32768,10240,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,3846.6163,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1429.19,357.65 +256,1,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,164,8,45.5191,flydsl_gemm2_abf16_wbf16_bf16_t32x256x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0234,5.16,5161.04 +256,16,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,41.3701,auto,0.0,90.84,5694.98 +256,32,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,43.433,auto,0.0,173.05,5441.09 +256,64,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,59.5058,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,252.62,3995.65 +256,128,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,77.6763,auto,0.0,387.05,3098.09 +256,256,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,107.7922,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,557.83,2286.02 +256,512,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,147.9651,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,812.75,1743.31 +256,1024,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,183.6583,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1309.6,1530.12 +256,2048,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,344.2741,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1397.25,950.28 +256,4096,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,672.6336,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1430.31,623.56 +256,8192,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1250.6163,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1538.56,482.95 +256,16384,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2489.7903,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1545.63,390.83 +256,32768,14336,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,4995.7199,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1540.64,342.55 +256,1,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,161.0117,auto,0.0,5.84,5835.94 +256,16,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,155.7742,auto,0.0,96.5,6044.78 +256,32,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,165.8506,auto,0.0,181.28,5690.17 +256,64,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,250,1,174.1356,flydsl_gemm2_abf16_wbf16_bf16_t64x256x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,345.3,5443.53 +256,128,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,20,1,193.2654,flydsl_gemm2_abf16_wbf16_bf16_t128x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,622.25,4948.12 +256,256,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,219.6217,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1095.15,4430.7 +256,512,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,381.9333,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1259.48,2635.63 +256,1024,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,693.4583,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1387.35,1548.39 +256,2048,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1276.0101,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1507.94,946.67 +256,4096,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2510.3525,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1532.97,588.12 +256,8192,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,5008.9325,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1536.57,401.94 +256,16384,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,10204.9796,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1508.4,302.5 +256,32768,57344,8192,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,21168.2097,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1454.37,247.28 diff --git a/aiter/configs/model_configs/qwen32B_bf16_tuned_gemm.csv b/aiter/configs/model_configs/qwen32B_bf16_tuned_gemm.csv index 7e64b7133a..475fdd74eb 100644 --- a/aiter/configs/model_configs/qwen32B_bf16_tuned_gemm.csv +++ b/aiter/configs/model_configs/qwen32B_bf16_tuned_gemm.csv @@ -1,157 +1,157 @@ cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw -256,1,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441420,0,7.8782,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,0.13,131.3 -256,16,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439772,0,7.8731,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,2.08,151.28 -256,32,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441420,0,8.6024,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,3.81,157.87 -256,64,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441420,0,8.5931,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,7.63,196.92 -256,128,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440159,0,8.272,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB7_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,15.85,285.34 -256,256,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440159,0,8.44,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB7_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,31.06,437.99 -256,512,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440159,0,8.4448,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB7_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,62.08,754.23 -256,1024,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439552,0,11.4461,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB6_NTC2_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,91.61,1023.45 -256,2048,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440043,0,17.0309,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC6_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,123.14,1315.56 -256,4096,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440247,0,19.8299,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB6_NTC3_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,211.51,2208.09 -256,8192,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439712,0,26.1299,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC3_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,321.03,3312.24 -256,16384,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440266,0,40.5725,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB1_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,413.51,4241.12 -256,32768,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440677,0,64.2196,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,522.5,5342.95 -256,1,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439633,0,7.9325,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB2_NTC2_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,0.26,259.52 -256,16,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440303,0,8.2209,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB1_NTC3_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,3.99,269.83 -256,32,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441420,0,8.6174,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,7.61,277.17 -256,64,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441420,0,8.5341,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,15.36,319.77 -256,128,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441420,0,8.6099,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,30.45,396.05 -256,256,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440159,0,8.8701,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB7_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,59.11,537.97 -256,512,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440162,0,13.1134,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB7_NTC3_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2,0.0,79.96,571.6 -256,1024,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440174,0,17.8971,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,117.18,723.21 -256,2048,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439385,0,19.9217,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC2_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,210.54,1196.62 -256,4096,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440263,0,23.8764,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,351.33,1911.07 -256,8192,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439924,0,35.2051,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB3_NTC7_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,476.56,2534.03 -256,16384,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440678,0,54.3605,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,617.26,3244.52 -256,32768,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441036,0,89.8285,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,747.08,3904.1 -256,1,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440287,0,8.2747,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC2_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,0.99,991.44 -256,16,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440041,0,8.5973,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC2_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,15.25,974.89 -256,32,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440181,0,9.3413,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB0_NTC3_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,28.06,917.53 -256,64,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441420,0,11.5678,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4,0.0,45.32,773.68 -256,128,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440641,0,16.4653,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,63.68,589.57 -256,256,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440599,0,17.4227,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA1_NTB1_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,120.37,644.16 -256,512,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439593,0,19.6213,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB3_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,213.76,726.46 -256,1024,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440109,0,23.1102,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_2_MO40_NTn1_NTA2_NTB4_NTC2_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,362.98,879.1 -256,2048,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439924,0,31.3993,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB3_NTC7_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,534.32,1033.15 -256,4096,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440199,0,46.6004,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,720.05,1216.48 -256,8192,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441307,0,76.0186,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,882.8,1383.68 -256,16384,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441354,0,130.4167,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT208x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT13_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,1029.15,1550.25 -256,32768,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441289,0,243.9808,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1100.23,1623.76 -256,1,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,asm,2,1,6.9902,_ZN5aiter37bf16gemm_fp32bf16_tn_32x64_pf3_splitkE,0.0,0.94,939.19 -256,1,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440554,0,7.5725,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB1_NTC2_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,1.73,1732.59 -256,1,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,11.6919,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0076,2.8,2804.05 -256,1,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,122,16,12.4189,flydsl_gemm2_abf16_wbf16_bf16_t32x128x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0281,4.22,4223.34 -256,1,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,17.6089,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0096,3.72,3723.06 -256,1,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440307,0,44.5404,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1,0.0,5.89,5886.91 -256,16,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,7.2193,auto,0.0,14.52,933.32 -256,16,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439704,0,7.5339,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,27.84,1766.95 -256,16,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,12.2735,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0078,42.72,2691.51 -256,16,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.905,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0203,60.33,3794.07 -256,16,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439634,0,18.082,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC1_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1,0.0,57.99,3644.76 -256,16,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439912,0,45.7963,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA4_NTB3_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1,0.0,91.59,5745.6 -256,32,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,2,7.4363,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0014,28.2,930.87 -256,32,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439705,0,8.1447,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,51.5,1659.58 -256,32,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,12.8238,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0078,81.77,2596.77 -256,32,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,16.1933,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0091,103.61,3278.16 -256,32,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,19.8551,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.01,105.62,3337.85 -256,32,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439912,0,47.4404,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA4_NTB3_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1,0.0,176.82,5567.2 -256,64,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,asm,2,1,7.9464,_ZN5aiter37bf16gemm_fp32bf16_tn_32x64_pf3_splitkE,0.0,52.78,917.51 -256,64,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,8.95,auto,0.0,93.73,1556.02 -256,64,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,13.6874,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.008,153.22,2471.83 -256,64,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,17.4742,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0093,192.02,3075.36 -256,64,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,20.0034,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.01,209.68,3349.96 -256,64,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440860,0,50.9005,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,329.61,5227.38 -256,128,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,7.3244,auto,0.0,114.53,1096.08 -256,128,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,10.9761,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,152.85,1343.43 -256,128,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439640,0,18.3007,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB3_NTC3_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,229.19,1906.92 -256,128,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440177,0,21.2988,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,315.08,2584.66 -256,128,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440330,0,24.364,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA1_NTB2_NTC3_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,344.3,2810.91 -256,128,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441367,0,60.0469,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,558.8,4496.62 -256,256,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439439,0,8.6844,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC1_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,193.19,1094.23 -256,256,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,12.642,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,265.42,1296.0 -256,256,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440513,0,20.6512,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA6_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,406.2,1793.01 -256,256,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440177,0,27.57,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,486.83,2091.83 -256,256,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440177,0,31.4887,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,532.8,2268.57 -256,256,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441305,0,82.4325,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,814.11,3370.91 -256,512,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,295,1,11.9433,flydsl_gemm2_abf16_wbf16_bf16_t64x64x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,280.95,1042.58 -256,512,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,16.5314,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,405.95,1189.3 -256,512,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439930,0,26.3863,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA3_NTB1_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,635.83,1564.74 -256,512,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440128,0,40.582,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA2_NTB2_NTC1_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,661.46,1550.31 -256,512,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439720,0,48.0074,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB1_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,698.94,1610.84 -256,512,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440587,0,132.0767,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1016.21,2222.96 -256,1024,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,15.2257,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,440.76,1205.2 -256,1024,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440010,0,21.0484,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_2_MO40_NTn1_NTA2_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,637.66,1245.43 -256,1024,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440211,0,39.4204,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB3_NTC7_NTD6_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,851.19,1263.49 -256,1024,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441315,0,64.7422,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,829.24,1133.73 -256,1024,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439562,0,75.3599,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA0_NTB3_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,890.51,1182.71 -256,1024,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440791,0,241.1581,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1113.11,1347.91 -256,2048,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440012,0,19.2509,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,697.2,1565.98 -256,2048,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439563,0,28.7925,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA6_NTB1_NTC2_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,932.31,1365.69 -256,2048,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440570,0,63.4339,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA3_NTB2_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1057.93,1053.8 -256,2048,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441305,0,104.1995,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1030.47,905.68 -256,2048,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440465,0,125.3499,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA3_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1070.74,899.26 -256,2048,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439562,0,430.228,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA0_NTB3_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,1247.88,901.78 -256,4096,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440075,0,33.5611,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA1_NTB1_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,799.84,1601.24 -256,4096,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439667,0,55.7664,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB3_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,962.71,1175.19 -256,4096,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439667,0,126.9173,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB3_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,1057.52,795.21 -256,4096,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440570,0,191.3972,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA3_NTB2_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1122.0,712.21 -256,4096,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439667,0,235.1316,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB3_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,1141.64,680.08 -256,4096,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439562,0,842.7454,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA0_NTB3_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,1274.1,609.68 -256,8192,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439738,0,58.607,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB3_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,916.05,1722.07 -256,8192,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440012,0,100.1325,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1072.32,1178.09 -256,8192,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,214.5098,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1251.39,788.23 -256,8192,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,325.1496,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1320.92,677.23 -256,8192,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,390.312,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1375.49,651.48 -256,8192,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440614,0,1432.3923,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1499.23,534.39 -256,16384,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440622,0,112.4117,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,955.19,1737.34 -256,16384,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440282,0,177.9873,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1206.54,1251.9 -256,16384,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,383.8016,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1398.82,795.72 -256,16384,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440528,0,578.0575,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1486.0,671.17 -256,16384,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441281,0,707.7525,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1517.11,625.96 -256,16384,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440333,0,2737.4956,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC5_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1568.94,463.48 -256,32768,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441460,0,209.1223,Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950,0.0,1026.9,1836.44 -256,32768,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440588,0,340.8883,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1259.93,1268.85 -256,32768,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440587,0,734.3333,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1462.2,787.15 -256,32768,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441281,0,1129.886,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1520.5,640.35 -256,32768,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440282,0,1391.9543,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1542.78,589.47 -256,32768,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440333,0,5550.579,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC5_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1547.57,409.94 -256,1,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,119,8,13.88,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0175,4.72,4723.27 -256,16,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,15.9358,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0123,65.8,4135.63 -256,32,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,18.6325,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.012,112.55,3556.86 -256,64,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439711,0,21.4847,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA1_NTB0_NTC3_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1,0.0,195.22,3118.99 -256,128,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440339,0,24.1627,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB3_NTC1_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,347.17,2834.33 -256,256,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441293,0,33.4677,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,501.3,2134.42 -256,512,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439861,0,47.4796,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x112x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB3_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,706.71,1628.75 -256,1024,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441300,0,71.3668,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,940.34,1248.89 -256,2048,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439452,0,116.1196,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1155.86,970.74 -256,4096,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440507,0,221.2645,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC4_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1213.19,722.7 -256,8192,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440656,0,410.1949,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1308.82,619.9 -256,16384,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440271,0,759.8253,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1413.14,583.06 -256,32768,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440587,0,1433.3354,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1498.24,572.45 -256,1,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439843,0,24.7644,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB2_NTC6_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,5.29,5294.21 -256,16,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440291,0,23.8402,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA6_NTB2_NTC3_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,87.97,5521.99 -256,32,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440651,0,26.2863,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,159.56,5029.95 -256,64,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440263,0,30.706,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,273.19,4343.31 -256,128,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440341,0,38.9554,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB1_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,430.68,3482.43 -256,256,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440199,0,52.1604,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1,0.0,643.29,2688.76 -256,512,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441293,0,80.6816,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,831.77,1852.0 -256,1024,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,117.1051,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1146.13,1432.66 -256,2048,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,220.2568,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1218.74,928.34 -256,4096,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,418.3034,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1283.45,664.28 -256,8192,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,756.7141,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1418.95,561.21 -256,16384,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440587,0,1427.7239,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1504.13,503.09 -256,32768,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,441281,0,2787.5053,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1540.79,468.33 -256,1,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439704,0,80.714,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2,0.0,6.5,6497.02 -256,16,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440849,0,81.1557,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,103.36,6482.48 -256,32,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440849,0,81.4925,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,205.87,6477.8 -256,64,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440848,0,83.8,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,400.41,6342.45 -256,128,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440779,0,92.0317,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,729.19,5853.48 -256,256,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439471,0,127.7778,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA4_NTB1_NTC5_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1050.4,4328.79 -256,512,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,229.3064,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1170.64,2537.91 -256,1024,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,443.2279,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1211.28,1443.12 -256,2048,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439532,0,785.4766,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1366.99,961.17 -256,4096,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440510,0,1438.3188,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1493.05,685.29 -256,8192,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439609,0,2778.2855,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC2_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1545.91,520.84 -256,16384,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,439464,0,5567.7519,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1542.8,425.63 -256,32768,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,hipblaslt,440454,0,11384.5906,Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB1_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1,0.0,1509.05,370.26 +256,1,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,24.2594,auto,0.0,0.04,42.64 +256,16,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,18.9179,auto,0.0,0.87,62.96 +256,32,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,20.665,auto,0.0,1.59,65.72 +256,64,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,21.9084,auto,0.0,2.99,77.24 +256,128,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,31.8414,auto,0.0,4.12,74.13 +256,256,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,49.0384,auto,0.0,5.35,75.38 +256,512,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,50.1618,auto,0.0,10.45,126.97 +256,1024,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,132.7156,auto,0.0,7.9,88.27 +256,2048,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,131.8115,auto,0.0,15.91,169.98 +256,4096,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,101.6103,auto,0.0,41.28,430.92 +256,8192,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,101.715,auto,0.0,82.47,850.89 +256,16384,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,103.0716,auto,0.0,162.77,1669.45 +256,32768,100,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,110.6268,auto,0.0,303.31,3101.62 +256,1,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,24.9311,auto,0.0,0.08,82.57 +256,16,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,19.4155,auto,0.0,1.69,114.25 +256,32,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,21.489,auto,0.0,3.05,111.15 +256,64,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,22.9195,auto,0.0,5.72,119.07 +256,128,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,33.6484,auto,0.0,7.79,101.34 +256,256,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,48.3964,auto,0.0,10.83,98.6 +256,512,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,46.1295,auto,0.0,22.73,162.49 +256,1024,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,153.9655,auto,0.0,13.62,84.07 +256,2048,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,152.8232,auto,0.0,27.45,155.99 +256,4096,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,106.0943,auto,0.0,79.07,430.08 +256,8192,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,107.2911,auto,0.0,156.37,831.48 +256,16384,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,109.2356,auto,0.0,307.17,1614.62 +256,32768,200,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,120.4976,auto,0.0,556.93,2910.43 +256,1,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.9988,auto,0.0,0.34,341.84 +256,16,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,19.3848,auto,0.0,6.76,432.37 +256,32,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,21.7693,auto,0.0,12.04,393.71 +256,64,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,23.8795,auto,0.0,21.96,374.79 +256,128,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,33.711,auto,0.0,31.1,287.96 +256,256,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,47.7622,auto,0.0,43.91,234.98 +256,512,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,45.1338,auto,0.0,92.93,315.82 +256,1024,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,169.7397,auto,0.0,49.42,119.69 +256,2048,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,170.1752,auto,0.0,98.59,190.63 +256,4096,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,111.1677,auto,0.0,301.84,509.94 +256,8192,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,117.7072,auto,0.0,570.13,893.62 +256,16384,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,227.3613,auto,0.0,590.33,889.24 +256,32768,800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,433.4434,auto,0.0,619.31,914.0 +256,1,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,2,7.0164,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0012,0.93,935.68 +256,1,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,129,4,7.8429,flydsl_gemm2_abf16_wbf16_bf16_t32x128x64_split_k4_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.007,1.67,1672.85 +256,1,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,11.5446,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.008,2.84,2839.82 +256,1,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,232,16,12.6295,flydsl_gemm2_abf16_wbf16_bf16_t64x128x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0311,4.15,4152.92 +256,1,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,17.8122,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0098,3.68,3680.57 +256,1,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,45.3626,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0342,5.78,5780.21 +256,16,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,5.9722,auto,0.0,17.56,1128.21 +256,16,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,7.4685,auto,0.0,28.08,1782.42 +256,16,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,12.4765,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.008,42.02,2647.72 +256,16,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,13.6344,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0207,61.53,3869.37 +256,16,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,18.6276,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0098,56.29,3538.01 +256,16,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,49.8337,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0247,84.17,5280.1 +256,32,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,170,1,7.3371,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,28.58,943.46 +256,32,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,8.159,auto,0.0,51.41,1656.67 +256,32,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,12.575,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.008,83.39,2648.15 +256,32,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,16.2461,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0091,103.27,3267.5 +256,32,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,3,18.8341,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.01,111.35,3518.79 +256,32,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,53.5449,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0244,156.66,4932.5 +256,64,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,168,1,7.2458,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,57.89,1006.22 +256,64,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,8.6051,auto,0.0,97.48,1618.39 +256,64,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,13.7745,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0079,152.25,2456.2 +256,64,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,17.7938,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0093,188.57,3020.13 +256,64,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,20.3857,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.01,205.75,3287.14 +256,64,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,62.1629,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0134,269.89,4280.3 +256,128,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,6.8391,auto,0.0,122.66,1173.86 +256,128,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,10.461,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,160.38,1409.58 +256,128,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,20.7104,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,202.52,1685.04 +256,128,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,31.7222,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,211.55,1735.39 +256,128,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,4,1,38.2094,_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,0.0,219.54,1792.36 +256,128,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,84,8,96.9584,flydsl_gemm2_abf16_wbf16_bf16_t128x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0251,346.07,2784.79 +256,256,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,8.4883,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,197.65,1119.51 +256,256,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,12.1986,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,275.07,1343.1 +256,256,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,23.8446,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,351.8,1552.88 +256,256,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,35.5373,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,377.68,1622.85 +256,256,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,9,1,43.5663,_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,0.0,385.1,1639.67 +256,256,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,26,4,153.6203,flydsl_gemm2_abf16_wbf16_bf16_t128x128x64_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0168,436.85,1808.83 +256,512,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,10.3599,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,323.89,1201.93 +256,512,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,16.5475,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,405.55,1188.14 +256,512,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,35.1105,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,477.84,1175.94 +256,512,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,54.7178,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,490.58,1149.8 +256,512,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,67.6879,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,495.72,1142.49 +256,512,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,365,1,245.2234,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,547.33,1197.28 +256,1024,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,15.3486,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,437.23,1195.55 +256,1024,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,26.5392,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,505.73,987.76 +256,1024,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,58.5966,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,572.63,850.0 +256,1024,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,89.5148,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,599.76,819.98 +256,1024,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,107.3896,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,624.91,829.96 +256,1024,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,325,1,389.4198,flydsl_gemm2_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,689.32,834.73 +256,2048,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,22.9328,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,585.27,1314.56 +256,2048,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,35.9027,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,747.67,1095.23 +256,2048,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,70.0071,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,958.6,954.86 +256,2048,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,108.3905,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,990.62,870.67 +256,2048,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,126.7131,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1059.23,889.58 +256,2048,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,456.1972,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1176.84,850.45 +256,4096,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,43.3636,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,619.03,1239.28 +256,4096,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,69.4435,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,773.1,943.73 +256,4096,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,134.616,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,997.04,749.73 +256,4096,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,206.706,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1038.91,659.46 +256,4096,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,244.7736,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1096.67,653.29 +256,4096,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,968.3405,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1108.85,530.6 +256,8192,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,67.9081,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,790.58,1486.21 +256,8192,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,107.7181,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,996.81,1095.13 +256,8192,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,214.2474,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1252.92,789.19 +256,8192,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,326.4603,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1315.62,674.51 +256,8192,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,392.5128,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1367.78,647.83 +256,8192,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1518.2312,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1414.46,504.18 +256,16384,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,114.5126,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,937.66,1705.47 +256,16384,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,181.7006,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1181.88,1226.32 +256,16384,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,385.0646,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1394.24,793.11 +256,16384,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,587.1472,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1463.0,660.78 +256,16384,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,715.1868,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1501.34,619.45 +256,16384,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2808.6562,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1529.19,451.74 +256,32768,5120,640,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,222.8413,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,963.68,1723.38 +256,32768,5120,1280,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,358.6914,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1197.4,1205.88 +256,32768,5120,3200,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,758.3608,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1415.87,762.21 +256,32768,5120,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1171.8545,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1466.04,617.41 +256,32768,5120,6400,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1431.8652,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1499.78,573.04 +256,32768,5120,25600,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,5625.3694,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1527.0,404.49 +256,1,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,119,8,13.997,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0184,4.68,4683.79 +256,16,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,15.9482,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0121,65.75,4132.42 +256,32,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,18.5129,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0122,113.28,3579.84 +256,64,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,22.5849,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0052,185.71,2967.05 +256,128,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,32.7099,auto,0.0,256.45,2093.71 +256,256,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,46.8385,auto,0.0,358.19,1525.12 +256,512,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,68.9628,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,486.56,1121.37 +256,1024,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,20,1,103.7585,flydsl_gemm2_abf16_wbf16_bf16_t128x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,646.78,859.0 +256,2048,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,124.2504,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1080.22,907.22 +256,4096,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,227.3241,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1180.85,703.44 +256,8192,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,437.9952,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1225.75,580.55 +256,16384,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,799.8221,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1342.48,553.9 +256,32768,6400,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1604.0758,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1338.77,511.52 +256,1,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,116,4,25.0392,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0127,5.23,5236.1 +256,16,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,27.453,auto,0.0,76.39,4795.3 +256,32,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,29.9037,auto,0.0,140.26,4421.49 +256,64,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,37.0414,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,226.47,3600.45 +256,128,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,51.7913,auto,0.0,323.94,2619.35 +256,256,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3,1,72.663,flydsl_gemm2_abf16_wbf16_bf16_t128x128x128_split_k1_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,461.78,1930.1 +256,512,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,103.9297,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,645.71,1437.72 +256,1024,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,115.6173,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1160.88,1451.1 +256,2048,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,218.7234,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1227.28,934.84 +256,4096,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,419.7581,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1279.0,661.98 +256,8192,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,757.9903,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1416.56,560.26 +256,16384,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1489.2465,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1441.99,482.31 +256,32768,12800,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2902.235,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1479.88,449.82 +256,1,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,110,1,87.0947,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,6.02,6021.04 +256,16,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,110,1,90.7763,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,92.41,5795.46 +256,32,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,110,1,94.9692,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,176.66,5558.57 +256,64,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,235,1,107.7019,flydsl_gemm2_abf16_wbf16_bf16_t64x128x64_split_k1_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,311.55,4934.89 +256,128,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,123.2362,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,544.55,4371.33 +256,256,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,138.7156,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,967.57,3987.47 +256,512,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,229.1527,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1171.43,2539.62 +256,1024,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,442.0268,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1214.57,1447.04 +256,2048,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,795.0722,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1350.5,949.57 +256,4096,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,1463.0452,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1467.82,673.71 +256,8192,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,2913.1928,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1474.32,496.72 +256,16384,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,5818.0865,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1476.42,407.31 +256,32768,51200,5120,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,11574.4717,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1484.29,364.19