diff --git a/aiter/configs/model_configs/a8w8_blockscale_tuned_fmoe_qwen3_235b.csv b/aiter/configs/model_configs/a8w8_blockscale_tuned_fmoe_qwen3_235b.csv new file mode 100644 index 0000000000..70cb54e3b8 --- /dev/null +++ b/aiter/configs/model_configs/a8w8_blockscale_tuned_fmoe_qwen3_235b.csv @@ -0,0 +1,11 @@ +cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,tflops,bw +256,1,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,67.4568,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,67.4568,1,4.48,4476.97 +256,2,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,71.1363,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,71.1363,1,8.49,4245.57 +256,4,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,74.8513,_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,0.0%,0.0,Null,0.0%,74.8513,1,16.14,4035.19 +256,8,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,78.1561,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,78.1561,1,30.91,3865.19 +256,16,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,78.262,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,78.262,1,61.74,3861.22 +256,32,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,84.6249,_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,0.0%,0.0,Null,0.0%,84.6249,1,114.19,3573.22 +256,64,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,96.2795,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,96.2795,1,200.74,3144.76 +256,128,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,145.6013,_ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,145.6013,1,265.48,2084.89 +256,256,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,237.6176,_ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,237.6176,1,325.35,1284.15 +256,512,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,331.0423,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,331.0423,1,467.07,931.24 diff --git a/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_qwen3_235b.csv b/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_qwen3_235b.csv new file mode 100644 index 0000000000..23128eb19a --- /dev/null +++ b/aiter/configs/model_configs/a8w8_blockscale_tuned_gemm_qwen3_235b.csv @@ -0,0 +1,129 @@ +cu_num,M,N,K,kernelId,splitK,us,kernelName,tflops,bw,errRatio +256,1,9216,4096,8,0,15.2874,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.94,2470.74,0.0 +256,2,9216,4096,8,0,15.6182,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.67,2419.86,0.0 +256,4,9216,4096,8,0,15.5108,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,19.47,2439.52,0.0 +256,8,9216,4096,8,0,15.6617,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,38.56,2421.77,0.0 +256,16,9216,4096,8,0,14.3736,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,84.04,2651.33,0.0 +256,32,9216,4096,7,0,15.5721,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,155.14,2470.42,0.0 +256,64,9216,4096,18,0,15.8652,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,304.56,2470.22,0.0 +256,128,9216,4096,18,0,24.2965,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,397.74,1672.35,0.0 +256,256,9216,4096,18,0,31.8855,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,606.15,1364.76,0.0 +256,512,9216,4096,18,0,47.4923,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,813.92,1037.71,0.0 +256,1024,9216,4096,2,0,77.3153,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,999.92,786.62,0.0 +256,2048,9216,4096,0,0,126.3825,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1223.42,663.75,0.0 +256,4096,9216,4096,0,0,220.1706,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1404.54,590.56,0.0 +256,8192,9216,4096,0,0,400.8616,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1542.86,554.55,0.0 +256,16384,9216,4096,0,0,800.8525,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1544.54,508.02,0.0 +256,32768,9216,4096,0,0,1653.9011,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1495.8,469.16,0.0 +256,1,4096,8192,8,0,26.8493,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.5,1250.34,0.0 +256,2,4096,8192,8,0,26.8722,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.99,1249.89,0.0 +256,4,4096,8192,8,0,27.0942,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.91,1240.85,0.0 +256,8,4096,8192,8,0,27.4449,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,19.56,1227.39,0.0 +256,16,4096,8192,8,0,24.3021,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,44.18,1391.51,0.0 +256,32,4096,8192,8,0,23.9347,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,89.72,1423.82,0.0 +256,64,4096,8192,8,0,23.5457,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,182.41,1469.61,0.0 +256,128,4096,8192,7,0,25.4852,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,337.06,1398.91,0.0 +256,256,4096,8192,18,0,26.2984,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,653.27,1435.4,0.0 +256,512,4096,8192,18,0,41.3652,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,830.64,1013.97,0.0 +256,1024,4096,8192,0,0,59.137,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1162.04,851.1,0.0 +256,2048,4096,8192,0,0,100.2863,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1370.47,669.17,0.0 +256,4096,4096,8192,0,0,194.5669,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1412.77,517.37,0.0 +256,8192,4096,8192,0,0,356.4042,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1542.51,470.74,0.0 +256,16384,4096,8192,0,0,691.0916,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1590.98,436.98,0.0 +256,32768,4096,8192,0,0,1413.9724,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1555.21,403.42,0.0 +256,1,4608,4096,8,0,14.53,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.6,1299.91,0.0 +256,2,4608,4096,8,0,14.4825,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.21,1305.09,0.0 +256,4,4608,4096,8,0,14.6915,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.28,1288.34,0.0 +256,8,4608,4096,8,0,14.8617,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,20.32,1277.17,0.0 +256,16,4608,4096,8,0,13.3147,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,45.36,1433.56,0.0 +256,32,4608,4096,8,0,13.2381,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,91.25,1457.94,0.0 +256,64,4608,4096,7,0,14.4288,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,167.44,1367.15,0.0 +256,128,4608,4096,18,0,14.8087,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,326.28,1389.61,0.0 +256,256,4608,4096,18,0,22.0975,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,437.32,1008.36,0.0 +256,512,4608,4096,18,0,30.1645,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,640.73,851.67,0.0 +256,1024,4608,4096,18,0,46.7637,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,826.6,695.11,0.0 +256,2048,4608,4096,0,0,75.7426,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1020.69,609.13,0.0 +256,4096,4608,4096,0,0,121.8461,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1268.97,602.4,0.0 +256,8192,4608,4096,0,0,213.5746,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1447.91,598.98,0.0 +256,16384,4608,4096,0,0,401.978,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1538.58,589.53,0.0 +256,32768,4608,4096,0,0,812.555,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1522.3,560.06,0.0 +256,1,4096,4096,8,0,14.2119,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.36,1181.37,0.0 +256,2,4096,4096,8,0,14.3028,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,4.69,1174.72,0.0 +256,4,4096,4096,8,0,14.4725,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,9.27,1162.64,0.0 +256,8,4096,4096,8,0,14.6406,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,18.34,1152.65,0.0 +256,16,4096,4096,8,0,13.071,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,41.07,1298.59,0.0 +256,32,4096,4096,8,0,13.0801,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,82.09,1312.71,0.0 +256,64,4096,4096,8,0,13.035,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,164.75,1347.42,0.0 +256,128,4096,4096,7,0,14.1356,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,303.84,1298.15,0.0 +256,256,4096,4096,18,0,14.7464,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,582.51,1351.04,0.0 +256,512,4096,4096,18,0,22.456,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,765.05,1027.28,0.0 +256,1024,4096,4096,0,0,33.1894,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1035.26,884.62,0.0 +256,2048,4096,4096,0,0,57.2207,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1200.95,733.0,0.0 +256,4096,4096,4096,0,0,106.3084,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1292.83,631.27,0.0 +256,8192,4096,4096,0,0,194.5345,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1413.0,603.7,0.0 +256,16384,4096,4096,0,0,364.2854,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1509.13,598.72,0.0 +256,32768,4096,4096,0,0,719.6458,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1527.85,582.83,0.0 +256,1,2304,4096,8,0,14.6639,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.29,644.16,0.0 +256,2,2304,4096,8,0,14.7415,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.56,641.36,0.0 +256,4,2304,4096,8,0,14.7919,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.1,640.35,0.0 +256,8,2304,4096,8,0,15.0027,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.06,633.67,0.0 +256,16,2304,4096,8,0,13.4139,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,22.51,713.92,0.0 +256,32,2304,4096,8,0,13.4303,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,44.97,723.42,0.0 +256,64,2304,4096,8,0,13.1631,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,91.77,759.26,0.0 +256,128,2304,4096,7,0,14.3391,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,168.48,735.84,0.0 +256,256,2304,4096,18,0,14.5232,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,332.7,803.23,0.0 +256,512,2304,4096,18,0,21.4714,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,450.07,647.08,0.0 +256,1024,2304,4096,18,0,29.5816,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,653.36,620.32,0.0 +256,2048,2304,4096,18,0,46.8727,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,824.67,581.64,0.0 +256,4096,2304,4096,0,0,76.6798,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1008.21,588.01,0.0 +256,8192,2304,4096,0,0,121.6456,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1271.06,663.73,0.0 +256,16384,2304,4096,0,0,216.8886,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1425.79,701.02,0.0 +256,32768,2304,4096,0,0,420.2109,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1471.82,701.2,0.0 +256,1,4096,2048,8,0,9.3063,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.8,902.49,0.0 +256,2,4096,2048,8,0,9.5109,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.53,884.15,0.0 +256,4,4096,2048,8,0,8.735,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,7.68,965.03,0.0 +256,8,4096,2048,8,0,8.2411,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,16.29,1027.84,0.0 +256,16,4096,2048,7,0,8.6091,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,31.18,993.42,0.0 +256,32,4096,2048,8,0,8.7267,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,61.52,998.81,0.0 +256,64,4096,2048,8,0,8.1594,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,131.6,1108.41,0.0 +256,128,4096,2048,8,0,8.863,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,242.3,1094.36,0.0 +256,256,4096,2048,12,0,9.5238,a8w8_blockscale_1x128x128_256x32x128x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,450.97,1156.06,0.0 +256,512,4096,2048,18,0,13.288,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,646.44,1025.85,0.0 +256,1024,4096,2048,0,0,19.7473,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,869.99,955.79,0.0 +256,2048,4096,2048,0,0,32.2941,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1063.96,909.15,0.0 +256,4096,4096,2048,0,0,58.8068,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1168.56,855.88,0.0 +256,8192,4096,2048,0,0,109.497,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1255.18,842.71,0.0 +256,16384,4096,2048,0,0,202.4945,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1357.46,869.95,0.0 +256,32768,4096,2048,0,0,388.4541,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1415.24,885.39,0.0 +256,1,1280,4096,8,0,14.2811,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,0.73,367.59,0.0 +256,2,1280,4096,8,0,14.3901,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.46,365.26,0.0 +256,4,1280,4096,8,0,14.546,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.88,362.26,0.0 +256,8,1280,4096,8,0,14.7363,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.69,359.39,0.0 +256,16,1280,4096,8,0,13.0574,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,12.85,409.68,0.0 +256,32,1280,4096,8,0,13.1831,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,25.45,413.85,0.0 +256,64,1280,4096,8,0,12.7852,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,52.49,443.39,0.0 +256,128,1280,4096,8,0,12.3771,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,108.44,492.43,0.0 +256,256,1280,4096,7,0,14.1187,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,190.13,492.03,0.0 +256,512,1280,4096,18,0,14.437,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,371.87,599.21,0.0 +256,1024,1280,4096,18,0,21.1129,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,508.57,571.15,0.0 +256,2048,1280,4096,18,0,29.9846,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,716.2,629.47,0.0 +256,4096,1280,4096,2,0,48.2704,a8w8_blockscale_1x128x128_256x64x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,889.77,673.41,0.0 +256,8192,1280,4096,0,0,76.6487,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1120.69,779.78,0.0 +256,16384,1280,4096,0,0,136.4601,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1258.97,837.57,0.0 +256,32768,1280,4096,0,0,242.3387,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1417.84,921.63,0.0 +256,1,4096,1024,8,0,4.9627,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,1.69,847.02,0.0 +256,2,4096,1024,8,0,5.1334,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,3.27,820.65,0.0 +256,4,4096,1024,8,0,4.8484,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,6.92,872.69,0.0 +256,8,4096,1024,8,0,5.5786,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,12.03,765.07,0.0 +256,16,4096,1024,8,0,4.5013,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,29.82,964.56,0.0 +256,32,4096,1024,8,0,4.6213,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,58.09,971.42,0.0 +256,64,4096,1024,7,0,5.4239,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,98.98,882.05,0.0 +256,128,4096,1024,7,0,5.2693,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,203.77,1019.86,0.0 +256,256,4096,1024,18,0,5.954,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,360.68,1100.71,0.0 +256,512,4096,1024,18,0,9.0362,a8w8_blockscale_1x128x128_256x64x64x256_16x16_32x32_16x16x1_16x16x1_1x32x1x8_8_1x1_intrawave_v1,475.31,986.35,0.0 +256,1024,4096,1024,0,0,12.8939,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,666.2,1057.2,0.0 +256,2048,4096,1024,0,0,21.0543,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,815.98,1095.68,0.0 +256,4096,4096,1024,0,0,38.2567,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,898.14,1096.36,0.0 +256,8192,4096,1024,0,0,67.679,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1015.37,1177.5,0.0 +256,16384,4096,1024,0,0,124.7445,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1101.76,1244.06,0.0 +256,32768,4096,1024,0,0,237.0386,a8w8_blockscale_1x128x128_256x128x128x128_16x16_32x32_8x32x1_8x32x1_1x32x1x8_8_1x1_intrawave_v3,1159.63,1291.71,0.0 diff --git a/aiter/configs/model_configs/a8w8_blockscale_untuned_fmoe_qwen3_235b.csv b/aiter/configs/model_configs/a8w8_blockscale_untuned_fmoe_qwen3_235b.csv new file mode 100644 index 0000000000..d140b03d09 --- /dev/null +++ b/aiter/configs/model_configs/a8w8_blockscale_untuned_fmoe_qwen3_235b.csv @@ -0,0 +1,11 @@ +token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1 +1,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +2,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +4,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +8,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +16,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +32,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +64,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +128,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +256,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +512,4096,1536,16,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 diff --git a/aiter/configs/model_configs/a8w8_blockscale_untuned_gemm_qwen3_235b.csv b/aiter/configs/model_configs/a8w8_blockscale_untuned_gemm_qwen3_235b.csv new file mode 100644 index 0000000000..bbeb058b84 --- /dev/null +++ b/aiter/configs/model_configs/a8w8_blockscale_untuned_gemm_qwen3_235b.csv @@ -0,0 +1,129 @@ +M,N,K +1, 9216, 4096 +2, 9216, 4096 +4, 9216, 4096 +8, 9216, 4096 +16, 9216, 4096 +32, 9216, 4096 +64, 9216, 4096 +128, 9216, 4096 +256, 9216, 4096 +512, 9216, 4096 +1024, 9216, 4096 +2048, 9216, 4096 +4096, 9216, 4096 +8192, 9216, 4096 +16384, 9216, 4096 +32768, 9216, 4096 +1, 4096, 8192 +2, 4096, 8192 +4, 4096, 8192 +8, 4096, 8192 +16, 4096, 8192 +32, 4096, 8192 +64, 4096, 8192 +128, 4096, 8192 +256, 4096, 8192 +512, 4096, 8192 +1024, 4096, 8192 +2048, 4096, 8192 +4096, 4096, 8192 +8192, 4096, 8192 +16384, 4096, 8192 +32768, 4096, 8192 +1, 4608, 4096 +2, 4608, 4096 +4, 4608, 4096 +8, 4608, 4096 +16, 4608, 4096 +32, 4608, 4096 +64, 4608, 4096 +128, 4608, 4096 +256, 4608, 4096 +512, 4608, 4096 +1024, 4608, 4096 +2048, 4608, 4096 +4096, 4608, 4096 +8192, 4608, 4096 +16384, 4608, 4096 +32768, 4608, 4096 +1, 4096, 4096 +2, 4096, 4096 +4, 4096, 4096 +8, 4096, 4096 +16, 4096, 4096 +32, 4096, 4096 +64, 4096, 4096 +128, 4096, 4096 +256, 4096, 4096 +512, 4096, 4096 +1024, 4096, 4096 +2048, 4096, 4096 +4096, 4096, 4096 +8192, 4096, 4096 +16384, 4096, 4096 +32768, 4096, 4096 +1, 2304, 4096 +2, 2304, 4096 +4, 2304, 4096 +8, 2304, 4096 +16, 2304, 4096 +32, 2304, 4096 +64, 2304, 4096 +128, 2304, 4096 +256, 2304, 4096 +512, 2304, 4096 +1024, 2304, 4096 +2048, 2304, 4096 +4096, 2304, 4096 +8192, 2304, 4096 +16384, 2304, 4096 +32768, 2304, 4096 +1, 4096, 2048 +2, 4096, 2048 +4, 4096, 2048 +8, 4096, 2048 +16, 4096, 2048 +32, 4096, 2048 +64, 4096, 2048 +128, 4096, 2048 +256, 4096, 2048 +512, 4096, 2048 +1024, 4096, 2048 +2048, 4096, 2048 +4096, 4096, 2048 +8192, 4096, 2048 +16384, 4096, 2048 +32768, 4096, 2048 +1, 1280, 4096 +2, 1280, 4096 +4, 1280, 4096 +8, 1280, 4096 +16, 1280, 4096 +32, 1280, 4096 +64, 1280, 4096 +128, 1280, 4096 +256, 1280, 4096 +512, 1280, 4096 +1024, 1280, 4096 +2048, 1280, 4096 +4096, 1280, 4096 +8192, 1280, 4096 +16384, 1280, 4096 +32768, 1280, 4096 +1, 4096, 1024 +2, 4096, 1024 +4, 4096, 1024 +8, 4096, 1024 +16, 4096, 1024 +32, 4096, 1024 +64, 4096, 1024 +128, 4096, 1024 +256, 4096, 1024 +512, 4096, 1024 +1024, 4096, 1024 +2048, 4096, 1024 +4096, 4096, 1024 +8192, 4096, 1024 +16384, 4096, 1024 +32768, 4096, 1024