diff --git a/aiter/configs/model_configs/dsv3_bf16_tuned_gemm.csv b/aiter/configs/model_configs/dsv3_bf16_tuned_gemm.csv index da50bbaeda..9cc183aba4 100644 --- a/aiter/configs/model_configs/dsv3_bf16_tuned_gemm.csv +++ b/aiter/configs/model_configs/dsv3_bf16_tuned_gemm.csv @@ -25,17 +25,14 @@ cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,ke 256,16,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,2,11.9934,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0034,39.17,2472.61 256,16,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,65.0337,auto,0.0,57.0,3573.78 256,32,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,14,7.7283,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0288,15.2,536.36 -256,32,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,6,14.294,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0189,67.78,2159.76 256,32,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,4,8.7013,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0075,34.71,1118.46 256,32,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,2,12.3935,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0034,75.81,2416.59 256,32,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,64.518,auto,0.0,114.9,3613.92 256,48,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,14,7.9838,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0282,22.06,548.95 -256,48,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,6,14.8295,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0181,98.0,2101.79 256,48,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,3,9.043,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0058,50.09,1092.51 256,48,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,2,13.0225,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0034,108.22,2322.51 256,48,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,65.2215,auto,0.0,170.5,3586.38 256,64,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,16,7.9784,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0302,29.44,579.1 -256,64,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,15.3187,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.018,126.5,2054.06 256,64,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,9.2849,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0078,65.05,1079.93 256,64,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,13.5831,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0034,138.34,2248.37 256,64,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,65.4327,auto,0.0,226.6,3586.22 @@ -45,7 +42,6 @@ cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,ke 256,80,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,2,14.0021,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0034,167.75,2202.15 256,80,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,84.1977,auto,0.0,220.12,2795.83 256,96,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,16,8.6162,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0321,40.89,591.38 -256,96,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,6,16.4247,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.018,176.97,1951.9 256,96,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,9.668,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0076,93.71,1067.64 256,96,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,14.2839,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0034,197.33,2179.35 256,96,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,86.0249,auto,0.0,258.53,2745.12 diff --git a/aiter/configs/model_configs/dsv3_bf16_untuned_gemm.csv b/aiter/configs/model_configs/dsv3_bf16_untuned_gemm.csv new file mode 100644 index 0000000000..f2d7ca683d --- /dev/null +++ b/aiter/configs/model_configs/dsv3_bf16_untuned_gemm.csv @@ -0,0 +1,70 @@ +M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle +1,256,7168,False,torch.bfloat16,torch.bfloat16,False,False +1,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False +1,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False +1,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False +1,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False +2,256,7168,False,torch.bfloat16,torch.bfloat16,False,False +2,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False +2,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False +2,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False +2,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False +4,256,7168,False,torch.bfloat16,torch.bfloat16,False,False +4,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False +4,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False +4,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False +4,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False +8,256,7168,False,torch.bfloat16,torch.bfloat16,False,False +8,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False +8,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False +8,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False +8,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False +16,256,7168,False,torch.bfloat16,torch.bfloat16,False,False +16,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False +16,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False +16,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False +16,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False +32,256,7168,False,torch.bfloat16,torch.bfloat16,False,False +32,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False +32,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False +32,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False +32,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False +48,256,7168,False,torch.bfloat16,torch.bfloat16,False,False +48,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False +48,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False +48,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False +48,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False +64,256,7168,False,torch.bfloat16,torch.bfloat16,False,False +64,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False +64,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False +64,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False +64,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False +80,256,7168,False,torch.bfloat16,torch.bfloat16,False,False +80,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False +80,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False +80,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False +80,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False +96,256,7168,False,torch.bfloat16,torch.bfloat16,False,False +96,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False +96,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False +96,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False +96,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False +112,256,7168,False,torch.bfloat16,torch.bfloat16,False,False +112,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False +112,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False +112,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False +112,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False +128,256,7168,False,torch.bfloat16,torch.bfloat16,False,False +128,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False +128,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False +128,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False +128,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False +256,256,7168,False,torch.bfloat16,torch.bfloat16,False,False +256,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False +256,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False +256,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False +256,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False + + + + diff --git a/aiter/configs/model_configs/dsv3_fp4_untuned_fmoe.csv b/aiter/configs/model_configs/dsv3_fp4_untuned_fmoe.csv new file mode 100644 index 0000000000..a9426183e9 --- /dev/null +++ b/aiter/configs/model_configs/dsv3_fp4_untuned_fmoe.csv @@ -0,0 +1,12 @@ +token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1 +32,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +64,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +128,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +256,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +512,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +1024,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +2048,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +4096,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +8192,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +16384,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +32768,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 diff --git a/aiter/configs/model_configs/gptoss_bf16_untuned_gemm.csv b/aiter/configs/model_configs/gptoss_bf16_untuned_gemm.csv new file mode 100644 index 0000000000..2d1708e498 --- /dev/null +++ b/aiter/configs/model_configs/gptoss_bf16_untuned_gemm.csv @@ -0,0 +1,58 @@ +M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle +1,128,2880,True,torch.bfloat16,torch.bfloat16,False,False +1,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False +1,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False +1,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False +1,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False +2,128,2880,True,torch.bfloat16,torch.bfloat16,False,False +2,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False +2,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False +2,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False +2,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False +4,128,2880,True,torch.bfloat16,torch.bfloat16,False,False +4,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False +4,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False +4,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False +4,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False +8,128,2880,True,torch.bfloat16,torch.bfloat16,False,False +8,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False +8,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False +8,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False +8,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False +16,128,2880,True,torch.bfloat16,torch.bfloat16,False,False +16,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False +16,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False +16,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False +16,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False +32,128,2880,True,torch.bfloat16,torch.bfloat16,False,False +32,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False +32,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False +32,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False +32,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False +48,128,2880,True,torch.bfloat16,torch.bfloat16,False,False +48,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False +48,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False +48,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False +48,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False +64,128,2880,True,torch.bfloat16,torch.bfloat16,False,False +64,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False +64,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False +64,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False +64,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False +80,128,2880,True,torch.bfloat16,torch.bfloat16,False,False +80,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False +80,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False +96,128,2880,True,torch.bfloat16,torch.bfloat16,False,False +96,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False +96,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False +112,128,2880,True,torch.bfloat16,torch.bfloat16,False,False +112,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False +112,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False +128,128,2880,True,torch.bfloat16,torch.bfloat16,False,False +128,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False +128,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False +128,2880,4096,True,torch.bfloat16,torch.bfloat16,False,False +128,5120,2880,True,torch.bfloat16,torch.bfloat16,False,False +256,128,2880,True,torch.bfloat16,torch.bfloat16,False,False +256,2560,2880,True,torch.bfloat16,torch.bfloat16,False,False +256,2880,2048,True,torch.bfloat16,torch.bfloat16,False,False diff --git a/aiter/configs/model_configs/kimik2_bf16_tuned_gemm.csv b/aiter/configs/model_configs/kimik2_bf16_tuned_gemm.csv index 3046d4fc94..111fdf39c8 100644 --- a/aiter/configs/model_configs/kimik2_bf16_tuned_gemm.csv +++ b/aiter/configs/model_configs/kimik2_bf16_tuned_gemm.csv @@ -106,7 +106,6 @@ cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,ke 256,56,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,15.0293,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0176,112.82,2083.73 256,64,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,6,14.6227,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.018,132.52,2151.82 256,72,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,7,15.0844,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0198,144.52,2095.8 -256,80,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,7,16.0087,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0195,151.31,1984.07 256,88,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,7,16.1244,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.02,165.24,1979.04 256,96,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,6,16.2338,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0182,179.05,1974.85 256,176,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,10.6186,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0028,156.42,1041.49 diff --git a/aiter/configs/model_configs/kimik2_bf16_untuned_gemm.csv b/aiter/configs/model_configs/kimik2_bf16_untuned_gemm.csv new file mode 100644 index 0000000000..9a8a1e649f --- /dev/null +++ b/aiter/configs/model_configs/kimik2_bf16_untuned_gemm.csv @@ -0,0 +1,126 @@ +M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle +1,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +2,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +4,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +8,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +16,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +24,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +32,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +40,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +48,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +56,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +64,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +72,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +80,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +88,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +96,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +104,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +112,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +168,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +176,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +184,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +192,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +200,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +208,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +216,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +224,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +232,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +240,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +248,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +256,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +264,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +272,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +280,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +288,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +296,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +304,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +312,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +320,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +328,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +336,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +344,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +352,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +360,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +368,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +376,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +384,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +392,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +400,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +408,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +416,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +424,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +432,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +440,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +448,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +456,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +464,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +472,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +480,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +488,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +496,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +504,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +512,384,7168,False,torch.bfloat16,torch.bfloat16,False,False +72,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +80,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +88,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +96,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +104,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +112,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +120,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +128,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +136,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +144,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +152,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +160,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +168,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +176,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +184,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +192,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +200,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +208,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +216,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +224,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +232,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +240,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +248,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +256,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +264,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +272,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +280,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +288,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +296,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +304,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +312,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +320,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +328,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +336,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +344,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +352,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +360,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +368,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +376,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +384,1024,7168,False,torch.bfloat16,torch.bfloat16,False,False +32,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False +40,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False +48,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False +56,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False +64,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False +72,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False +88,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False +96,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False +176,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False +184,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False +192,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False +80,4096,512,False,torch.bfloat16,torch.bfloat16,False,False +320,4096,512,False,torch.bfloat16,torch.bfloat16,False,False +40,7168,512,False,torch.bfloat16,torch.bfloat16,False,False +56,7168,512,False,torch.bfloat16,torch.bfloat16,False,False +72,7168,512,False,torch.bfloat16,torch.bfloat16,False,False +80,7168,512,False,torch.bfloat16,torch.bfloat16,False,False +96,7168,512,False,torch.bfloat16,torch.bfloat16,False,False +104,7168,512,False,torch.bfloat16,torch.bfloat16,False,False +112,7168,512,False,torch.bfloat16,torch.bfloat16,False,False +128,7168,512,False,torch.bfloat16,torch.bfloat16,False,False +168,7168,512,False,torch.bfloat16,torch.bfloat16,False,False +192,7168,512,False,torch.bfloat16,torch.bfloat16,False,False +80,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False diff --git a/aiter/configs/model_configs/kimik2_fp4_untuned_fmoe.csv b/aiter/configs/model_configs/kimik2_fp4_untuned_fmoe.csv new file mode 100644 index 0000000000..0073b7a109 --- /dev/null +++ b/aiter/configs/model_configs/kimik2_fp4_untuned_fmoe.csv @@ -0,0 +1,147 @@ +token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1 +1,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +1,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +2,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +2,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +4,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +4,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +8,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +8,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +16,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +16,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +24,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +24,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +32,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +32,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +40,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +40,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +48,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +48,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +56,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +56,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +64,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +64,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +72,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +72,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +80,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +80,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +88,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +88,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +96,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +96,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +104,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +104,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +112,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +112,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +120,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +120,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +128,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +128,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +136,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +136,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +144,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +144,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +152,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +152,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +160,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +160,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +168,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +168,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +176,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +176,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +184,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +184,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +192,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +192,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +200,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +200,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +208,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +208,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +216,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +216,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +224,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +224,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +232,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +232,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +240,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +240,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +248,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +248,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +256,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +256,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +264,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +264,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +272,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +272,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +280,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +280,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +288,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +288,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +296,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +296,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +304,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +304,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +312,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +312,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +320,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +320,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +328,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +328,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +336,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +336,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +344,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +344,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +352,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +352,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +360,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +360,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +368,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +368,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +376,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +376,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +384,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +384,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +392,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +392,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +400,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +400,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +408,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +408,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +416,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +416,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +424,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +424,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +432,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +432,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +440,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +440,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +448,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +448,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +456,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +456,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +464,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +464,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +472,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +472,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +480,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +480,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +488,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +488,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +496,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +496,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +504,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +504,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +512,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +512,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +1024,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +1024,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +2048,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +2048,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +4096,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +4096,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +8192,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +8192,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +16384,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +16384,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +32768,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +32768,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 diff --git a/aiter/jit/core.py b/aiter/jit/core.py index 3b053e0d3a..8e0bc9cb2f 100644 --- a/aiter/jit/core.py +++ b/aiter/jit/core.py @@ -183,29 +183,35 @@ def update_config_files(self, file_path: str, merge_name: str): path_list = file_path.split(os.pathsep) if file_path else [] if len(path_list) <= 1: return file_path - df_list = [] + source_pairs = [] ## merge config files ##example: AITER_CONFIG_GEMM_A4W4="/path1:/path2" import pandas as pd - df_list.append(pd.read_csv(path_list[0])) - for i, path in enumerate(path_list[1:]): - if os.path.exists(path): - df = pd.read_csv(path) - base_cols = [c for c in df_list[0].columns if c != "_tag"] + for i, path in enumerate(path_list): + if not os.path.exists(path): + logger.info(f"path {i+1}: {path} (not exist)") + continue + + df = pd.read_csv(path) + if source_pairs: + base_path, base_df = source_pairs[0] + base_cols = [c for c in base_df.columns if c != "_tag"] new_cols = [c for c in df.columns if c != "_tag"] - assert ( - base_cols == new_cols - ), f"Column mismatch between {path_list[0]} and {path}, {base_cols}, {new_cols}" + if base_cols != new_cols: + raise ValueError( + f"Column mismatch between {base_path} and {path}, " + f"{base_cols}, {new_cols}" + ) - df_list.append(df) - else: - logger.info(f"path {i+1}: {path} (not exist)") - merge_df = ( - pd.concat([df for df in df_list if not df.empty], ignore_index=True) - if df_list - else pd.DataFrame() - ) + source_pairs.append((path, df)) + + if not source_pairs: + raise FileNotFoundError( + f"No existing config files found in '{file_path}' when merging '{merge_name}'." + ) + + merge_df = pd.concat([df for _, df in source_pairs], ignore_index=True) has_tag = "_tag" in merge_df.columns if has_tag: merge_df["_tag"] = merge_df["_tag"].fillna("") @@ -223,17 +229,49 @@ def update_config_files(self, file_path: str, merge_name: str): if "cu_num" not in keys: keys.append("cu_num") dedup_keys = keys + ["_tag"] if has_tag else keys - sorted_df = merge_df.sort_values("us") - duplicated_mask = sorted_df.duplicated(subset=dedup_keys, keep="first") + duplicated_mask = merge_df.duplicated(subset=dedup_keys, keep=False) if duplicated_mask.any(): - dup_rows = sorted_df[duplicated_mask] - logger.warning( - f"Dropping {len(dup_rows)} duplicate rows during merge of '{merge_name}':\n" - f"{dup_rows.to_string(index=False)}" + dup_count = int(duplicated_mask.sum()) + dup_rows = merge_df[duplicated_mask].sort_values(dedup_keys) + if "us" not in merge_df.columns: + raise RuntimeError( + f"Found {dup_count} duplicate shape entries during merge of '{merge_name}'. " + f"No 'us' column to determine best performing entry. " + f"Please remove duplicates manually.\n" + f"Duplicate rows:\n{dup_rows.to_string(index=False)}" + ) + + # Auto-dedup: globally determine best row (lowest 'us') per shape + best_row_index = set( + merge_df.sort_values("us", kind="stable") + .drop_duplicates(subset=dedup_keys, keep="first") + .index + ) + + saved_files = [] + offset = 0 + for src_path, src_df in source_pairs: + start, end = offset, offset + len(src_df) + offset = end + file_rows = merge_df.iloc[start:end] + new_src_df = file_rows[ + file_rows.index.isin(best_row_index) + ].reset_index(drop=True) + if len(new_src_df) < len(src_df): + new_src_df.to_csv(src_path, index=False) + saved_files.append( + f" {src_path}: {len(src_df)} -> {len(new_src_df)} rows" + ) + saved_info = ( + "\n".join(saved_files) if saved_files else " (no files updated)" + ) + raise RuntimeError( + f"Found {dup_count} duplicate shape entries during merge of '{merge_name}'. " + f"Auto-resolved by keeping best performing (lowest 'us') for each shape " + f"and saved back to source config files. Please re-run.\n" + f"Duplicate rows:\n{dup_rows.to_string(index=False)}\n" + f"Updated files:\n{saved_info}" ) - merge_df = sorted_df.drop_duplicates( - subset=dedup_keys, keep="first" - ).reset_index(drop=True) else: logger.warning( f"Untuned config file not found: {untuned_path}. Using all columns for deduplication."