@@ -2446,11 +2446,12 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
         [
             (8, 1, 8, True, True, True, "CUTLASS", False),
             (8, 1, 8, True, True, True, "TRTLLM", False),
-            (8, 1, 8, False, False, False, "TRTLLM", True),
+            (8, 1, 8, True, True, True, "TRTLLM", True),
         ],
         ids=[
-            "latency_moe_cutlass", "latency_moe_trtllm",
-            "latency_moe_trtllm_eagle3"
+            "latency_moe_cutlass",
+            "latency_moe_trtllm",
+            "latency_moe_trtllm_eagle3",
         ],
     )
     def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
@@ -2485,6 +2486,51 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)
 
+    @skip_pre_blackwell
+    @pytest.mark.skip_less_mpi_world_size(4)
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3",
+        [
+            (4, 1, 4, False, False, False, "TRTLLM",
+             True),  # TP8 hits a bug when the TRTLLM MoE backend is combined with Eagle3
+        ],
+        ids=[
+            "latency_moe_trtllm_eagle3",
+        ],
+    )
+    def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
+                         cuda_graph, overlap_scheduler, moe_backend, eagle3):
+
+        pytorch_config = dict(
+            disable_overlap_scheduler=not overlap_scheduler,
+            cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
+            moe_config=MoeConfig(backend=moe_backend))
+
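+        # KV cache block reuse is turned off whenever Eagle3 is enabled.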
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+                                        enable_block_reuse=not eagle3)
+        spec_config = None
+        if eagle3:
+            spec_config = EagleDecodingConfig(
+                max_draft_len=2,
+                speculative_model_dir=
+                f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3/",
+                eagle3_one_model=True)
+        with LLM(
+                f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
+                tensor_parallel_size=tp_size,
+                pipeline_parallel_size=pp_size,
+                moe_expert_parallel_size=ep_size,
+                **pytorch_config,
+                enable_attention_dp=attention_dp,
+                kv_cache_config=kv_cache_config,
+                speculative_config=spec_config) as llm:
+
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
 
 class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-4-mini-instruct"
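
For anyone reproducing the new 4-GPU case locally, a hedged invocation sketch (the test file path and class name below are assumptions based on the usual TensorRT-LLM accuracy-suite layout; this diff does not confirm them):

    pytest "tests/integration/defs/accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]"

The case is skipped on fewer than 4 MPI ranks (skip_less_mpi_world_size(4)) and on pre-Blackwell hardware (skip_pre_blackwell).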