Skip to content

Commit ba0a86e

Browse files
authored
[https://nvbugs/5437405][fix] qwen3 235b eagle3 ci (#7000)
Signed-off-by: bhsueh <[email protected]>
1 parent 647a526 commit ba0a86e

File tree

6 files changed

+51
-7
lines changed

6 files changed

+51
-7
lines changed

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2446,11 +2446,12 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
24462446
[
24472447
(8, 1, 8, True, True, True, "CUTLASS", False),
24482448
(8, 1, 8, True, True, True, "TRTLLM", False),
2449-
(8, 1, 8, False, False, False, "TRTLLM", True),
2449+
(8, 1, 8, True, True, True, "TRTLLM", True),
24502450
],
24512451
ids=[
2452-
"latency_moe_cutlass", "latency_moe_trtllm",
2453-
"latency_moe_trtllm_eagle3"
2452+
"latency_moe_cutlass",
2453+
"latency_moe_trtllm",
2454+
"latency_moe_trtllm_eagle3",
24542455
],
24552456
)
24562457
def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
@@ -2485,6 +2486,50 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
24852486
task = GSM8K(self.MODEL_NAME)
24862487
task.evaluate(llm)
24872488

2489+
@skip_pre_blackwell
2490+
@pytest.mark.skip_less_mpi_world_size(4)
2491+
@pytest.mark.parametrize(
2492+
"tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3",
2493+
[
2494+
(4, 1, 4, False, False, False, "TRTLLM",
2495+
True), # TP8 has a bug when using the TRTLLM MoE backend with eagle3
2496+
],
2497+
ids=[
2498+
"latency_moe_trtllm_eagle3",
2499+
],
2500+
)
2501+
def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
2502+
cuda_graph, overlap_scheduler, moe_backend, eagle3):
2503+
2504+
pytorch_config = dict(
2505+
disable_overlap_scheduler=not overlap_scheduler,
2506+
cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
2507+
moe_config=MoeConfig(backend=moe_backend))
2508+
2509+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
2510+
enable_block_reuse=not eagle3)
2511+
spec_config = None
2512+
if eagle3:
2513+
spec_config = EagleDecodingConfig(
2514+
max_draft_len=2,
2515+
speculative_model_dir=
2516+
f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3/",
2517+
eagle3_one_model=True)
2518+
with LLM(
2519+
f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
2520+
tensor_parallel_size=tp_size,
2521+
pipeline_parallel_size=pp_size,
2522+
moe_expert_parallel_size=ep_size,
2523+
**pytorch_config,
2524+
enable_attention_dp=attention_dp,
2525+
kv_cache_config=kv_cache_config,
2526+
speculative_config=spec_config) as llm:
2527+
2528+
task = MMLU(self.MODEL_NAME)
2529+
task.evaluate(llm)
2530+
task = GSM8K(self.MODEL_NAME)
2531+
task.evaluate(llm)
2532+
24882533

24892534
class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness):
24902535
MODEL_NAME = "microsoft/Phi-4-mini-instruct"

tests/integration/test_lists/qa/llm_function_full.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -579,7 +579,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_laten
579579
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
580580
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
581581
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
582-
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3]
582+
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
583583
accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
584584
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
585585
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8

tests/integration/test_lists/qa/llm_function_sanity.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutl
116116
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True]
117117
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False]
118118
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True]
119-
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3]
119+
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
120120
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-CUTLASS]
121121
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRITON]
122122
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRTLLM]

tests/integration/test_lists/test-db/l0_gb200.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,3 +69,4 @@ l0_gb200:
6969
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
7070
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True]
7171
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2]
72+
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] TIMEOUT (90)

tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,3 @@ l0_gb200_multi_nodes:
1919
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (180)
2020
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (90)
2121
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (90)
22-
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] TIMEOUT (90)

tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,6 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen
263263
examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SKIP (https://nvbugs/5434451)
264264
examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-27b-it] SKIP (https://nvbugs/5434451)
265265
examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-3-1b-it] SKIP (https://nvbugs/5434451)
266-
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5437405,https://nvbugs/5437384)
267266
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4 SKIP (https://nvbugs/5440241)
268267
test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] SKIP (https://nvbugs/5444060,https://nvbugs/5444095)
269268
test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] SKIP (https://nvbugs/5444060,https://nvbugs/5444095)

0 commit comments

Comments (0)