diff --git a/aiter/configs/model_configs/kimik2_fp4_tuned_fmoe.csv b/aiter/configs/model_configs/kimik2_fp4_tuned_fmoe.csv index bb16823549..37393d10f6 100644 --- a/aiter/configs/model_configs/kimik2_fp4_tuned_fmoe.csv +++ b/aiter/configs/model_configs/kimik2_fp4_tuned_fmoe.csv @@ -1,129 +1,129 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,tflops,bw,_tag -256,1,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,13.2452,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w3_kb7_go_fp4,16.9%,6.6844,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,1.6%,19.9296,0,4.42,106070.91, -256,2,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,15.8944,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w3_kb4_go_fp4,17.2%,8.4755,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,1.3%,24.3699,0,7.23,86745.22, -256,4,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,21.3039,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_kb4_fp4,16.1%,12.4074,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic,1.3%,33.7113,0,10.45,62709.4, -256,8,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,29.8852,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w4,0.0%,17.8471,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2,1.2%,47.7323,0,14.76,44290.79, -256,16,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,44.7944,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w4,0.0%,27.4087,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2,1.2%,72.2031,0,19.52,29282.31, -256,32,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,81.51140000000001,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3,0.0%,44.797,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2,1.2%,126.3084,0,22.32,16741.7, -256,64,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,118.8141,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,67.0329,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,1.1%,185.847,0,30.33,11381.97, -256,128,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,118.5136,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w4,0.0%,69.4259,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2,1.2%,187.9395,0,59.99,11262.57, -256,256,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,115.8638,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w4_fp4,17.6%,74.2967,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,1.2%,190.1605,0,118.58,11145.5, -256,512,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,117.5612,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_fp4,17.0%,80.6277,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2,1.2%,198.1889,0,227.55,10721.79, -256,1024,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,119.968,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,17.3%,102.033,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2_sbm64,1.1%,222.001,0,406.28,9621.35, -256,2048,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,132.5067,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4,0.0%,157.7576,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_bnt2,0.0%,290.2643,0,621.46,7434.5, -256,4096,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,156.74679999999998,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2,0.0%,267.2572,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_persist_sbm128,0.0%,424.004,0,850.88,5193.37, -256,8192,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,250.0777,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0_xcd4,0.0%,463.8763,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_sbm128,0.0%,713.954,0,1010.65,3207.62, -256,16384,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,366.0242,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_bnt0_xcd4_fp4,17.3%,926.6298,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce,0.0%,1292.654,0,1116.39,1907.9, -256,32768,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,631.8818,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_bnt0_xcd4_fp4,17.3%,1751.4067,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_persist,0.0%,2383.2885,0,1211.02,1182.64, -256,1,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,16.354,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w3_kb4_go_fp4,12.7%,7.2729,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,2.6%,23.6269,0,7.46,178943.49, -256,2,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,21.5812,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_kb4_fp4,15.4%,12.8273,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2,3.0%,34.4085,0,10.24,122873.75, -256,4,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,28.0247,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3,0.0%,17.2081,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2,2.9%,45.2328,0,15.58,93470.77, -256,8,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,44.514500000000005,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_xcd4,0.0%,27.0211,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2,2.8%,71.5356,0,19.7,59103.87, -256,16,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,81.495,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w4_xcd4,0.0%,44.4102,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2_xcd4,2.7%,125.9052,0,22.39,33582.43, -256,32,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,154.3846,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3,0.0%,80.4195,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_xcd4_persist,2.7%,234.8041,0,24.01,18008.83, -256,64,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,228.1004,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,17.2%,118.3423,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2,2.8%,346.4427,0,32.54,12207.6, -256,128,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,228.1496,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3,0.0%,121.3876,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2,2.7%,349.5372,0,64.51,12103.46, -256,256,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,228.5579,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,17.5%,124.7,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2,2.7%,353.2579,0,127.66,11983.78, -256,512,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,232.2845,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,17.3%,136.0064,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist_sbm64,2.7%,368.2909,0,244.9,11509.57, -256,1024,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,233.5376,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w4_fp4,17.3%,146.6693,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,2.7%,380.2069,0,474.45,11177.8, -256,2048,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,238.474,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,17.3%,206.1031,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_bnt2_persist,2.7%,444.5771,0,811.51,9608.9, -256,4096,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,292.4502,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w4,0.0%,377.921,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4_persist_sbm128,0.3%,670.3712,0,1076.35,6438.13, -256,8192,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,459.2863,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_fp4,17.3%,626.7223,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_persist,0.3%,1086.0086,0,1328.82,4055.23, -256,16384,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,691.0762,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0%,1171.9833,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4_sbm128,0.3%,1863.0595,0,1549.18,2458.42, -256,32768,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,1208.457,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_fp4,17.3%,2227.9499,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.3%,3436.4069,0,1679.79,1435.37, -256,1,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,17.8501,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_kb7_fp4,16.7%,7.6621,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,2.4%,25.5122,0,7.77,166151.49, -256,2,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,22.8551,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_kb7_fp4,18.9%,13.1521,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2,2.7%,36.0072,0,11.01,117724.0, -256,4,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,29.4262,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3,0.0%,19.3881,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2,2.8%,48.8143,0,16.24,86838.38, -256,8,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,50.5137,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w4_fp4,18.7%,29.363,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2,2.7%,79.8767,0,19.85,53069.8, -256,16,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,95.7203,flydsl_moe1_afp4_wfp4_bf16_t32x32x256_w3,0.0%,49.2937,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2,2.8%,145.014,0,21.87,29233.13, -256,32,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,171.3749,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w3,0.0%,89.5678,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic,2.8%,260.9427,0,24.3,16247.08, -256,64,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,236.2525,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w4,0.0%,120.7332,flydsl_moe2_afp4_wfp4_bf16_t16x256x256_atomic_sbm32,2.8%,356.9857,0,35.53,11877.91, -256,128,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,234.1601,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w4_xcd4_fp4,17.1%,122.0701,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_sbm64,2.9%,356.2302,0,71.21,11906.97, -256,256,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,236.6952,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_xcd4_fp4,17.2%,127.4914,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist_sbm64,2.9%,364.1866,0,139.31,11654.39, -256,512,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,242.1832,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,17.2%,134.7446,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,2.8%,376.9278,0,269.2,11275.05, -256,1024,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,246.3426,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,17.3%,153.0524,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,2.9%,399.395,0,508.11,10668.36, -256,2048,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,251.0842,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,17.2%,233.9762,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_atomic_bnt2_persist,2.8%,485.0604,0,836.75,8829.64, -256,4096,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,316.9738,flydsl_moe1_afp4_wfp4_bf16_t128x128x256,0.0%,400.1827,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_persist_sbm128,0.2%,717.1565,0,1131.9,6033.48, -256,8192,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,473.8717,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_bnt0_fp4,17.2%,670.9802,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_persist,0.2%,1144.8519,0,1418.09,3856.42, -256,16384,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,731.1314,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_bnt0_fp4,17.3%,1313.4823,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4_persist,0.2%,2044.6137,0,1588.07,2245.5, -256,32768,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,1306.3543,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_fp4,17.3%,2514.8871,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4_persist,0.2%,3821.2414,0,1699.45,1293.69, -256,1,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,13.3809,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_kb14_go_fp4,20.1%,6.8161,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,1.2%,20.197,0,4.91,104939.14, -256,2,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,16.4835,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w3_kb4_go_fp4,17.6%,8.4904,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,1.0%,24.9739,0,7.94,84867.69, -256,4,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,22.3365,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w3_kb2_go_fp4,17.3%,13.1164,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic,1.2%,35.4529,0,11.18,59784.12, -256,8,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,30.8842,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w4,0.0%,19.279,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic,1.3%,50.1632,0,15.8,42254.21, -256,16,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,54.5127,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w3,0.0%,29.3089,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2,1.3%,83.8216,0,18.91,25289.17, -256,32,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,93.3433,flydsl_moe1_afp4_wfp4_bf16_t32x32x256_w3,0.0%,53.0394,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2,1.3%,146.3827,0,21.66,14483.42, -256,64,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,129.68970000000002,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w4_xcd4,0.0%,68.2803,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,1.2%,197.97,0,32.03,10712.79, -256,128,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,126.7838,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w4_xcd4_fp4,16.6%,70.9723,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,1.2%,197.7561,0,64.14,10731.33, -256,256,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,127.3984,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_xcd4_fp4,17.0%,73.9814,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2,1.2%,201.3798,0,125.97,10551.9, -256,512,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,130.7355,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,17.1%,82.6416,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2,1.2%,213.3771,0,237.77,9984.41, -256,1024,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,133.9164,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,17.4%,115.6845,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2,1.2%,249.6009,0,406.52,8579.51, -256,2048,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,139.5789,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,17.2%,169.6701,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_bnt2_persist,0.0%,309.249,0,656.23,6995.9, -256,4096,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,177.8189,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2,0.0%,282.0472,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_persist_sbm128,0.0%,459.8661,0,882.59,4800.34, -256,8192,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,284.6439,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_xcd4,0.0%,486.7326,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_persist_sbm128,0.0%,771.3765,0,1052.34,2975.97, -256,16384,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,440.0206,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_bnt0_xcd4,0.0%,979.1867,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_persist_sbm128,0.0%,1419.2073,0,1143.95,1741.65, -256,32768,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,700.9508,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0_xcd4_fp4,17.3%,1957.461,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce,0.0%,2658.4118,0,1221.4,1062.32, -256,1,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,24.6248,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,6.6844,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0156,31.3092,0,0.0,0.0,flydsl_fallback -256,2,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,24.6277,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,8.4755,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0133,33.1032,0,0.0,0.0,flydsl_fallback -256,4,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,26.1829,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,14.3766,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0138,40.5595,0,0.0,0.0,flydsl_fallback -256,8,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,30.2074,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,18.9994,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0122,49.2068,0,0.0,0.0,flydsl_fallback -256,16,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,50.1851,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,28.8427,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0119,79.0278,0,0.0,0.0,flydsl_fallback -256,32,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,91.9274,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,56.4536,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0119,148.381,0,0.0,0.0,flydsl_fallback -256,64,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,133.52890000000002,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,72.2503,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0114,205.7792,0,0.0,0.0,flydsl_fallback -256,128,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,134.37,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,75.4038,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0113,209.7738,0,0.0,0.0,flydsl_fallback -256,256,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,135.7072,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,80.5166,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0117,216.2238,0,0.0,0.0,flydsl_fallback -256,512,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,136.8202,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,88.9883,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0117,225.8085,0,0.0,0.0,flydsl_fallback -256,1024,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,141.0647,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,117.1939,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0117,258.2586,0,0.0,0.0,flydsl_fallback -256,2048,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,150.5248,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,214.4916,moe_ck2stages_gemm2_64x64x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0117,365.0164,0,0.0,0.0,flydsl_fallback -256,4096,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,176.7558,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,403.0595,moe_ck2stages_gemm2_64x128x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0117,579.8153,0,0.0,0.0,flydsl_fallback -256,8192,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,267.11560000000003,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,759.2218,moe_ck2stages_gemm2_64x128x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0117,1026.3374,0,0.0,0.0,flydsl_fallback -256,16384,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,437.7072,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,1455.2831,moe_ck2stages_gemm2_64x128x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0116,1892.9903,0,0.0,0.0,flydsl_fallback -256,32768,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,663.3669,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,2875.3131,moe_ck2stages_gemm2_64x128x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0116,3538.68,0,0.0,0.0,flydsl_fallback -256,1,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,24.539,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,7.2729,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0259,31.8119,0,0.0,0.0,flydsl_fallback -256,2,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,26.6509,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,14.035,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0296,40.6859,0,0.0,0.0,flydsl_fallback -256,4,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,31.2822,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,19.072,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0281,50.3542,0,0.0,0.0,flydsl_fallback -256,8,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,49.7396,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,29.5203,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0279,79.2599,0,0.0,0.0,flydsl_fallback -256,16,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,92.3538,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,48.1233,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0279,140.4771,0,0.0,0.0,flydsl_fallback -256,32,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,174.5219,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,85.8354,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0276,260.3573,0,0.0,0.0,flydsl_fallback -256,64,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,257.9019,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,125.3541,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0273,383.256,0,0.0,0.0,flydsl_fallback -256,128,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,256.3908,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,127.7442,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0273,384.135,0,0.0,0.0,flydsl_fallback -256,256,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,260.6848,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,133.543,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0273,394.2278,0,0.0,0.0,flydsl_fallback -256,512,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,263.3216,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,149.5248,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0273,412.8464,0,0.0,0.0,flydsl_fallback -256,1024,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,273.1676,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,224.9774,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0274,498.145,0,0.0,0.0,flydsl_fallback -256,2048,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,289.69550000000004,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,339.3983,moe_ck2stages_gemm2_64x64x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0272,629.0938,0,0.0,0.0,flydsl_fallback -256,4096,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,452.9873,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,569.9896,moe_ck2stages_gemm2_64x64x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0273,1022.9769,0,0.0,0.0,flydsl_fallback -256,8192,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,516.1552,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,938.8917,moe_ck2stages_gemm2_64x64x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0273,1455.0469,0,0.0,0.0,flydsl_fallback -256,16384,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,839.7145,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,1859.6769,moe_ck2stages_gemm2_64x64x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0273,2699.3914,0,0.0,0.0,flydsl_fallback -256,32768,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,1469.5306,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,3517.7145,moe_ck2stages_gemm2_64x64x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0273,4987.2451,0,0.0,0.0,flydsl_fallback -256,1,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,25.0451,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,7.6621,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0241,32.7072,0,0.0,0.0,flydsl_fallback -256,2,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,27.1551,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,14.1879,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0284,41.343,0,0.0,0.0,flydsl_fallback -256,4,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,32.0801,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,19.8641,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0281,51.9442,0,0.0,0.0,flydsl_fallback -256,8,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,55.6951,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,31.6528,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0265,87.3479,0,0.0,0.0,flydsl_fallback -256,16,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,105.1921,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,53.563,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.028,158.7551,0,0.0,0.0,flydsl_fallback -256,32,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,199.5216,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,97.0299,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0288,296.5515,0,0.0,0.0,flydsl_fallback -256,64,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,260.8307,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,125.6864,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0284,386.5171,0,0.0,0.0,flydsl_fallback -256,128,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,266.3821,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,128.3034,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0285,394.6855,0,0.0,0.0,flydsl_fallback -256,256,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,265.6527,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,135.5647,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0286,401.2174,0,0.0,0.0,flydsl_fallback -256,512,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,269.6034,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,162.5341,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0283,432.1375,0,0.0,0.0,flydsl_fallback -256,1024,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,280.10200000000003,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,275.7737,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0284,555.8757,0,0.0,0.0,flydsl_fallback -256,2048,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,311.42830000000004,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,349.9696,moe_ck2stages_gemm2_64x64x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0284,661.3979,0,0.0,0.0,flydsl_fallback -256,4096,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,460.8264,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,617.8159,moe_ck2stages_gemm2_64x64x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0284,1078.6423,0,0.0,0.0,flydsl_fallback -256,8192,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,539.202,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,995.9833,moe_ck2stages_gemm2_64x64x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0284,1535.1853,0,0.0,0.0,flydsl_fallback -256,16384,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,876.0268,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,1986.4763,moe_ck2stages_gemm2_64x64x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0285,2862.5031,0,0.0,0.0,flydsl_fallback -256,32768,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,1611.5569,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,3894.0732,moe_ck2stages_gemm2_64x64x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0285,5505.6301,0,0.0,0.0,flydsl_fallback -256,1,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,23.9613,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,6.8161,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0124,30.7774,0,0.0,0.0,flydsl_fallback -256,2,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,24.9598,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,8.4904,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0102,33.4502,0,0.0,0.0,flydsl_fallback -256,4,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,28.0817,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,14.2802,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0133,42.3619,0,0.0,0.0,flydsl_fallback -256,8,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,31.893,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,19.8686,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0125,51.7616,0,0.0,0.0,flydsl_fallback -256,16,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,55.979400000000005,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,31.3644,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0123,87.3438,0,0.0,0.0,flydsl_fallback -256,32,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,105.7492,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,54.6225,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.013,160.3717,0,0.0,0.0,flydsl_fallback -256,64,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,138.129,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,70.2826,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0122,208.4116,0,0.0,0.0,flydsl_fallback -256,128,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,139.39800000000002,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,76.9762,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.012,216.3742,0,0.0,0.0,flydsl_fallback -256,256,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,140.2029,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,78.7722,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0124,218.9751,0,0.0,0.0,flydsl_fallback -256,512,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,142.1878,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,92.4496,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0123,234.6374,0,0.0,0.0,flydsl_fallback -256,1024,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,148.3272,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,133.2883,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0122,281.6155,0,0.0,0.0,flydsl_fallback -256,2048,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,163.71689999999998,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,237.3456,moe_ck2stages_gemm2_64x64x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0123,401.0625,0,0.0,0.0,flydsl_fallback -256,4096,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,194.9509,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,451.9496,moe_ck2stages_gemm2_64x128x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0123,646.9005,0,0.0,0.0,flydsl_fallback -256,8192,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,288.8715,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,857.3649,moe_ck2stages_gemm2_64x128x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0123,1146.2364,0,0.0,0.0,flydsl_fallback -256,16384,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,463.6276,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,1723.7202,moe_ck2stages_gemm2_64x128x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0123,2187.3478,0,0.0,0.0,flydsl_fallback -256,32768,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,702.7786,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,3373.9364,moe_ck2stages_gemm2_64x128x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0123,4076.715,0,0.0,0.0,flydsl_fallback +256,1,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,13.9799,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w3_kb14_xcd4_fp4,22.5%,8.1319,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,1.5%,22.1118,0,3.98,95602.83, +256,2,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,17.2713,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_kb14_bnt0_fp4,20.6%,10.1607,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,1.4%,27.432,0,6.42,77062.27, +256,4,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,22.6984,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w3_kb4_bnt0_fp4,20.4%,13.2528,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic,1.3%,35.9512,0,9.8,58802.36, +256,8,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,30.0262,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_bnt0,0.0%,17.6643,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic,1.3%,47.6905,0,14.78,44329.61, +256,16,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,39.0069,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w4,0.0%,25.6226,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2,1.3%,64.6295,0,21.81,32713.75, +256,32,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,63.889300000000006,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w4,0.0%,37.0933,flydsl_moe2_afp4_wfp4_bf16_t16x256x256_atomic_bnt2_persist_sbm32,1.3%,100.9826,0,27.91,20940.41, +256,64,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,93.5036,flydsl_moe1_afp4_wfp4_bf16_t32x32x256_w3,0.0%,53.9899,flydsl_moe2_afp4_wfp4_bf16_t16x256x256_atomic_bnt2_sbm32,1.2%,147.4935,0,38.22,14341.69, +256,128,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,108.3724,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_fp4,17.2%,66.6968,flydsl_moe2_afp4_wfp4_bf16_t16x256x256_atomic_bnt2_sbm32,1.3%,175.0692,0,64.4,12090.54, +256,256,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,116.6103,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_fp4,17.1%,74.0789,flydsl_moe2_afp4_wfp4_bf16_t16x256x256_atomic_bnt2_persist_sbm32,1.2%,190.6892,0,118.25,11114.6, +256,512,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,117.5849,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,17.0%,80.9526,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2,1.2%,198.5375,0,227.15,10702.96, +256,1024,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,121.0563,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,17.4%,106.3806,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2_sbm64,1.3%,227.4369,0,396.57,9391.39, +256,2048,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,144.0412,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w4,0.0%,163.2069,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_sbm128,0.1%,307.2481,0,587.11,7023.54, +256,4096,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,158.9084,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3,0.0%,270.351,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_persist_sbm128,0.1%,429.2594,0,840.46,5129.79, +256,8192,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,252.1561,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w4_bnt0_xcd4,0.0%,463.9421,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_sbm128,0.1%,716.0982,0,1007.62,3198.01, +256,16384,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,397.2011,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2_bnt0_xcd4,0.0%,924.8895,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_persist_sbm128,0.1%,1322.0906,0,1091.54,1865.42, +256,32768,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,673.6421,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0_xcd4,0.0%,1783.0066,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_persist_sbm128,0.1%,2456.6487,0,1174.86,1147.32, +256,1,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,17.9168,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_kb14_fp4,17.7%,7.5463,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,2.4%,25.4631,0,6.92,166039.48, +256,2,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,22.5887,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w3_kb2_go_fp4,18.7%,12.9583,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2,2.9%,35.547,0,9.91,118938.35, +256,4,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,28.125300000000003,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3,0.0%,17.5122,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2,2.5%,45.6375,0,15.44,92641.89, +256,8,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,40.4268,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w3_fp4,16.7%,27.7099,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_xcd4,2.7%,68.1367,0,20.68,62052.18, +256,16,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,67.7149,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_fp4,17.4%,40.7525,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2,2.7%,108.4674,0,25.99,38981.32, +256,32,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,131.41219999999998,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w4,0.0%,64.6228,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2,2.8%,196.035,0,28.76,21570.37, +256,64,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,174.3146,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w3,0.0%,92.6896,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2,2.8%,267.0042,0,42.23,15839.58, +256,128,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,211.3136,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,17.5%,115.4686,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2,2.8%,326.7822,0,69.0,12946.27, +256,256,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,227.1786,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_fp4,17.5%,128.1656,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,2.8%,355.3442,0,126.91,11913.42, +256,512,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,230.7154,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,17.2%,134.2382,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2,2.8%,364.9536,0,247.14,11614.81, +256,1024,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,237.3056,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,17.3%,152.7537,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_sbm64,2.7%,390.0593,0,462.46,10895.47, +256,2048,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,248.2288,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,17.3%,215.1947,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_bnt2_persist,2.8%,463.4235,0,778.5,9218.13, +256,4096,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,309.7988,flydsl_moe1_afp4_wfp4_bf16_t128x128x256,0.0%,382.4107,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4_persist_sbm128,0.4%,692.2095,0,1042.39,6235.02, +256,8192,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,492.2694,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w4_xcd4,0.0%,633.2934,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_persist_sbm128,0.4%,1125.5628,0,1282.12,3912.73, +256,16384,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,795.5577000000001,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,1192.7816,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4_persist,0.4%,1988.3393,0,1451.57,2303.52, +256,32768,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1276.6282,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0%,2259.8079,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4_persist_sbm128,0.4%,3536.4361,0,1632.27,1394.77, +256,1,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,17.8886,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w3_kb4_go_fp4,20.8%,7.5925,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,2.9%,25.4811,0,7.78,166354.28, +256,2,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,22.8149,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_kb7_xcd4_fp4,21.0%,13.1788,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,2.9%,35.9937,0,11.01,117768.15, +256,4,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,27.7235,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_fp4,19.7%,19.328,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic,2.9%,47.0515,0,16.85,90091.8, +256,8,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,51.6825,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w4_fp4,0.0%,29.1534,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2,2.7%,80.8359,0,19.61,52440.07, +256,16,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,78.1272,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3,0.0%,44.6234,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2,2.9%,122.7506,0,25.83,34535.17, +256,32,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,132.1537,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w4_fp4,17.8%,67.5444,flydsl_moe2_afp4_wfp4_bf16_t16x256x256_atomic_bnt2_persist_sbm32,3.0%,199.6981,0,31.76,21229.83, +256,64,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,184.367,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,17.2%,101.1521,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,2.9%,285.5191,0,44.42,14851.0, +256,128,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,219.1026,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w2_fp4,17.3%,120.1837,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2,3.0%,339.2863,0,74.77,12501.6, +256,256,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,228.2939,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,17.3%,132.8644,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_sbm64,2.9%,361.1583,0,140.48,11752.11, +256,512,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,248.5047,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w4,0.0%,135.3295,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2,3.0%,383.8342,0,264.36,11072.17, +256,1024,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,250.3871,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_fp4,17.2%,157.9713,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_sbm64,2.9%,408.3584,0,496.96,10434.19, +256,2048,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,265.6454,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_fp4,17.3%,229.7372,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_persist,0.3%,495.3826,0,819.32,8645.66, +256,4096,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,335.2709,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w4,0.0%,397.6695,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_persist_sbm128,0.3%,732.9404,0,1107.52,5903.55, +256,8192,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,544.8151,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2_xcd4,0.0%,707.078,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_sbm128,0.3%,1251.8931,0,1296.83,3526.68, +256,16384,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,862.1587,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0_xcd4,0.0%,1305.102,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.3%,2167.2607,0,1498.2,2118.43, +256,32768,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1391.8997,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0%,2564.68,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4_persist_sbm128,0.3%,3956.5797,0,1641.31,1249.44, +256,1,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,13.9802,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_kb14_fp4,13.9%,7.9113,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,1.4%,21.8915,0,4.53,96816.38, +256,2,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,17.2185,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w3_kb4_bnt0_go_fp4,15.8%,8.5243,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,1.2%,25.7428,0,7.7,82332.82, +256,4,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,23.2988,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_kb7_bnt0_fp4,17.3%,13.6377,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic,1.3%,36.9365,0,10.73,57382.81, +256,8,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,29.0089,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_fp4,18.4%,21.204,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,1.1%,50.2129,0,15.79,42212.39, +256,16,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,43.3489,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w2,0.0%,26.9591,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2,1.1%,70.308,0,22.55,30149.89, +256,32,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,66.8905,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,38.8442,flydsl_moe2_afp4_wfp4_bf16_t16x256x256_atomic_bnt2_persist_sbm32,1.3%,105.7347,0,29.99,20051.34, +256,64,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,101.211,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w4,0.0%,56.3305,flydsl_moe2_afp4_wfp4_bf16_t16x256x256_atomic_bnt2_sbm32,1.3%,157.5415,0,40.25,13461.92, +256,128,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,109.2599,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_fp4,17.2%,68.2535,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,1.3%,177.5134,0,71.45,11955.08, +256,256,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,115.5656,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_fp4,17.2%,74.744,flydsl_moe2_afp4_wfp4_bf16_t16x256x256_atomic_bnt2_persist_sbm32,1.4%,190.3096,0,133.29,11165.7, +256,512,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,130.7075,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_fp4,17.4%,82.6095,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2_persist,1.3%,213.317,0,237.84,9987.22, +256,1024,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,135.4983,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_fp4,17.4%,119.8506,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_persist_sbm64,1.3%,255.3489,0,397.37,8386.39, +256,2048,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,144.584,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_fp4,17.1%,171.2946,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce,0.1%,315.8786,0,642.45,6849.07, +256,4096,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,179.2193,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2,0.0%,279.7423,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_persist_sbm128,0.1%,458.9616,0,884.33,4809.8, +256,8192,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,284.782,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w4_bnt0_xcd4,0.0%,519.6888,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_persist_sbm128,0.1%,804.4708,0,1009.05,2853.55, +256,16384,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,436.7742,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2_bnt0_xcd4,0.0%,1024.2602,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_sbm128,0.1%,1461.0344,0,1111.2,1691.78, +256,32768,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,737.0636,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_bnt0_xcd4,0.0%,2046.4027,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_sbm128,0.1%,2783.4663,0,1166.53,1014.59, +256,1,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,25.1008,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,8.1319,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0146,33.2327,0,0.0,0.0,flydsl_fallback +256,2,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,25.9981,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,10.1607,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0142,36.1588,0,0.0,0.0,flydsl_fallback +256,4,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,26.9679,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,16.0298,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0121,42.9977,0,0.0,0.0,flydsl_fallback +256,8,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,31.3996,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,19.5128,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0127,50.9124,0,0.0,0.0,flydsl_fallback +256,16,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,44.2564,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,27.9592,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0125,72.2156,0,0.0,0.0,flydsl_fallback +256,32,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,71.8968,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,41.4555,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0131,113.3523,0,0.0,0.0,flydsl_fallback +256,64,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,104.9668,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,58.3475,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0126,163.3143,0,0.0,0.0,flydsl_fallback +256,128,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,126.3709,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,72.5589,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0128,198.9298,0,0.0,0.0,flydsl_fallback +256,256,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,135.34449999999998,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,78.3594,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0124,213.7039,0,0.0,0.0,flydsl_fallback +256,512,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,136.5402,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,89.3539,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0126,225.8941,0,0.0,0.0,flydsl_fallback +256,1024,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,144.8236,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,115.9855,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0126,260.8091,0,0.0,0.0,flydsl_fallback +256,2048,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,162.96370000000002,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,213.5719,moe_ck2stages_gemm2_64x64x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0125,376.5356,0,0.0,0.0,flydsl_fallback +256,4096,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,169.8057,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,403.5172,moe_ck2stages_gemm2_64x128x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0126,573.3229,0,0.0,0.0,flydsl_fallback +256,8192,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,304.2362,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,764.3293,moe_ck2stages_gemm2_64x128x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0125,1068.5655,0,0.0,0.0,flydsl_fallback +256,16384,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,498.0106,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,1476.716,moe_ck2stages_gemm2_64x128x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0125,1974.7266,0,0.0,0.0,flydsl_fallback +256,32768,7168,256,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,769.9171,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,3004.7704,moe_ck2stages_gemm2_64x128x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0125,3774.6875,0,0.0,0.0,flydsl_fallback +256,1,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,25.6052,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,7.5463,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0239,33.1515,0,0.0,0.0,flydsl_fallback +256,2,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,26.6499,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,15.3309,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0269,41.9808,0,0.0,0.0,flydsl_fallback +256,4,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,29.7861,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,19.304,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0256,49.0901,0,0.0,0.0,flydsl_fallback +256,8,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,47.689800000000005,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,28.816,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0267,76.5058,0,0.0,0.0,flydsl_fallback +256,16,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,78.2038,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,42.647,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0277,120.8508,0,0.0,0.0,flydsl_fallback +256,32,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,141.0361,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,70.653,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0281,211.6891,0,0.0,0.0,flydsl_fallback +256,64,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,199.124,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,99.0852,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0281,298.2092,0,0.0,0.0,flydsl_fallback +256,128,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,244.4832,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,123.0286,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0278,367.5118,0,0.0,0.0,flydsl_fallback +256,256,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,258.7976,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,143.752,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0276,402.5496,0,0.0,0.0,flydsl_fallback +256,512,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,267.3028,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,149.5863,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0274,416.8891,0,0.0,0.0,flydsl_fallback +256,1024,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,280.9852,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,182.9953,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0274,463.9805,0,0.0,0.0,flydsl_fallback +256,2048,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,347.36620000000005,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,270.1472,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0276,617.5134,0,0.0,0.0,flydsl_fallback +256,4096,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,481.9772,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,493.9473,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0273,975.9245,0,0.0,0.0,flydsl_fallback +256,8192,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,582.6193,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,973.3819,moe_ck2stages_gemm2_64x64x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0274,1556.0012,0,0.0,0.0,flydsl_fallback +256,16384,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,857.4551,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,1922.9129,moe_ck2stages_gemm2_64x64x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0275,2780.368,0,0.0,0.0,flydsl_fallback +256,32768,7168,512,384,8,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,1498.8449,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,3782.1474,moe_ck2stages_gemm2_64x64x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0274,5280.9923,0,0.0,0.0,flydsl_fallback +256,1,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,25.8558,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,7.5925,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0294,33.4483,0,0.0,0.0,flydsl_fallback +256,2,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,27.7627,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,13.1788,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0292,40.9415,0,0.0,0.0,flydsl_fallback +256,4,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,32.6406,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,19.9788,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0302,52.6194,0,0.0,0.0,flydsl_fallback +256,8,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,53.4236,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,29.9109,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0288,83.3345,0,0.0,0.0,flydsl_fallback +256,16,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,88.1749,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,47.8359,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0295,136.0108,0,0.0,0.0,flydsl_fallback +256,32,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,151.64630000000002,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,73.5877,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0298,225.234,0,0.0,0.0,flydsl_fallback +256,64,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,210.0676,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,105.734,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0295,315.8016,0,0.0,0.0,flydsl_fallback +256,128,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,249.557,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,127.0834,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0298,376.6404,0,0.0,0.0,flydsl_fallback +256,256,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,258.6365,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,138.3303,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0294,396.9668,0,0.0,0.0,flydsl_fallback +256,512,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,276.2796,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,152.9675,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0296,429.2471,0,0.0,0.0,flydsl_fallback +256,1024,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,279.30060000000003,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,193.528,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0291,472.8286,0,0.0,0.0,flydsl_fallback +256,2048,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,300.9174,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,334.4923,moe_ck2stages_gemm2_64x64x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0291,635.4097,0,0.0,0.0,flydsl_fallback +256,4096,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,515.4662,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,553.9443,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0291,1069.4105,0,0.0,0.0,flydsl_fallback +256,8192,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,609.8722,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,1091.6879,moe_ck2stages_gemm2_64x64x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0291,1701.5601,0,0.0,0.0,flydsl_fallback +256,16384,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,931.2307,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,2118.819,moe_ck2stages_gemm2_64x64x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0291,3050.0497,0,0.0,0.0,flydsl_fallback +256,32768,7168,512,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,1656.0413,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,4234.4266,moe_ck2stages_gemm2_64x64x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0291,5890.4679,0,0.0,0.0,flydsl_fallback +256,1,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,24.4312,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,7.9113,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0138,32.3425,0,0.0,0.0,flydsl_fallback +256,2,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,25.2559,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,8.5243,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0121,33.7802,0,0.0,0.0,flydsl_fallback +256,4,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,28.5367,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,14.3445,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0108,42.8812,0,0.0,0.0,flydsl_fallback +256,8,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,31.2631,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,19.6856,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0114,50.9487,0,0.0,0.0,flydsl_fallback +256,16,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,47.7488,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,30.0117,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0113,77.7605,0,0.0,0.0,flydsl_fallback +256,32,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,75.0572,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,42.507,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0126,117.5642,0,0.0,0.0,flydsl_fallback +256,64,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,109.6539,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,61.03,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0133,170.6839,0,0.0,0.0,flydsl_fallback +256,128,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,127.9358,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,70.973,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.013,198.9088,0,0.0,0.0,flydsl_fallback +256,256,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,134.8579,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,81.4717,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0134,216.3296,0,0.0,0.0,flydsl_fallback +256,512,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,143.2304,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,91.2423,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0132,234.4727,0,0.0,0.0,flydsl_fallback +256,1024,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,155.5109,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,127.797,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0135,283.3079,0,0.0,0.0,flydsl_fallback +256,2048,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,167.2837,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,233.5581,moe_ck2stages_gemm2_64x64x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0131,400.8418,0,0.0,0.0,flydsl_fallback +256,4096,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,189.3904,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,439.3833,moe_ck2stages_gemm2_64x128x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0132,628.7737,0,0.0,0.0,flydsl_fallback +256,8192,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,314.9939,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,845.8057,moe_ck2stages_gemm2_64x128x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0133,1160.7996,0,0.0,0.0,flydsl_fallback +256,16384,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,568.1025,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,1664.4928,moe_ck2stages_gemm2_64x128x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0133,2232.5953,0,0.0,0.0,flydsl_fallback +256,32768,7168,256,385,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,812.3858,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,3354.1041,moe_ck2stages_gemm2_64x128x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0132,4166.4899,0,0.0,0.0,flydsl_fallback