diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml
index e2bca37c511..f8b03c1fc83 100644
--- a/tests/integration/defs/accuracy/references/gsm8k.yaml
+++ b/tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -138,6 +138,11 @@ Qwen3/Qwen3-235B-A22B:
     accuracy: 85.78
 Qwen3/Qwen3-Next-80B-A3B-Thinking:
   - accuracy: 81.577
+Qwen3/Qwen3-Next-80B-A3B-Instruct:
+  - accuracy: 92.72
+  - quant_algo: NVFP4
+    kv_cache_quant_algo: FP8
+    accuracy: 90.86
 moonshotai/Kimi-K2-Instruct:
   - quant_algo: FP8_BLOCK_SCALES
     accuracy: 94.84
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index cec56c28c47..eef6b9ffa2d 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -242,6 +242,11 @@ Qwen3/Qwen3-235B-A22B:
     accuracy: 86
 Qwen3/Qwen3-Next-80B-A3B-Thinking:
   - accuracy: 86
+Qwen3/Qwen3-Next-80B-A3B-Instruct:
+  - accuracy: 86.03
+  - quant_algo: NVFP4
+    kv_cache_quant_algo: FP8
+    accuracy: 85.08
 moonshotai/Kimi-K2-Instruct:
   - quant_algo: FP8_BLOCK_SCALES
     accuracy: 87.65
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index af38a021c74..661107c282a 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -4136,6 +4136,86 @@ def test_auto_dtype(self, tp_size, pp_size, ep_size):
         task.evaluate(llm)
 
 
+@skip_pre_hopper
+@pytest.mark.skip_less_device_memory(80000)
+class TestQwen3NextInstruct(LlmapiAccuracyTestHarness):
+    MODEL_PATH = f"{llm_models_root()}/Qwen3-Next"
+    MODEL_NAME = "Qwen3/Qwen3-Next-80B-A3B-Instruct"
+
+    # Default setting of `256` is too small
+    GSM8K_MAX_OUTPUT_LEN = 512
+
+    @pytest.mark.skip_less_device(4)
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size,cuda_graph,overlap_scheduler",
+        [
+            (4, 1, 4, True, True),
+        ],
+        ids=[
+            "tp4ep4_cudagraph_overlap",
+        ],
+    )
+    def test_bf16_4gpu(self, tp_size, pp_size, ep_size, cuda_graph,
+                       overlap_scheduler, mocker):
+        model_path = f"{self.MODEL_PATH}/Qwen3-Next-80B-A3B-Instruct"
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
+                                        enable_block_reuse=False)
+        pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler,
+                              cuda_graph_config=CudaGraphConfig(
+                                  max_batch_size=512) if cuda_graph else None)
+
+        with LLM(
+                model_path,
+                tensor_parallel_size=tp_size,
+                max_num_tokens=16384,
+                pipeline_parallel_size=pp_size,
+                moe_expert_parallel_size=ep_size,
+                kv_cache_config=kv_cache_config,
+                **pytorch_config,
+        ) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN",
+                                self.GSM8K_MAX_OUTPUT_LEN)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_blackwell
+    @pytest.mark.skip_less_device(4)
+    @pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRTLLM"],
+                             ids=["cutlass", "trtllm"])
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size,cuda_graph,overlap_scheduler",
+        [(1, 1, 1, True, True), (4, 1, 1, True, True), (4, 1, 4, True, True),
+         (4, 1, 4, False, False)],
+        ids=["tp1", "tp4ep1", "tp4ep4", "no_cuda_graph_overlap"])
+    def test_nvfp4(self, moe_backend, tp_size, pp_size, ep_size, cuda_graph,
+                   overlap_scheduler, mocker):
+        model_path = f"{self.MODEL_PATH}/qwen3-next-80b-instruct-nvfp4-ptq-fp8kv"
+
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
+                                        enable_block_reuse=False)
+        pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler,
+                              cuda_graph_config=CudaGraphConfig(
+                                  max_batch_size=512) if cuda_graph else None)
+        moe_config = MoeConfig(backend=moe_backend)
+
+        with LLM(model_path,
+                 tensor_parallel_size=tp_size,
+                 max_num_tokens=16384,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 **pytorch_config,
+                 moe_config=moe_config) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN",
+                                self.GSM8K_MAX_OUTPUT_LEN)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestSeedOss_36B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "ByteDance-Seed/Seed-OSS-36B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/Seed-OSS/Seed-OSS-36B-Instruct"
diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml
index e51321d5b01..79527ee6f31 100644
--- a/tests/integration/test_lists/test-db/l0_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_b200.yml
@@ -54,6 +54,11 @@ l0_b200:
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep1-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[no_cuda_graph_overlap-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm]
   - disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0] # nvbugs 5300551
   - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]
   - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
index c44ce8bbfc1..11f30e681a1 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -42,6 +42,12 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True] ISOLATION
   - accuracy/test_llm_api_pytorch.py::TestQwen3NextThinking::test_auto_dtype[tp4ep4]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_bf16_4gpu[tp4ep4_cudagraph_overlap]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep1-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[no_cuda_graph_overlap-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto]
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b300.yml b/tests/integration/test_lists/test-db/l0_dgx_b300.yml
index 749f032fedf..ecb0ec976f0 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_b300.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_b300.yml
@@ -58,6 +58,11 @@ l0_dgx_b300:
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_trtllm-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True]
   - accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep1-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[no_cuda_graph_overlap-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-fp8]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto]
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
index 9a875ff99d8..7e50c6ebf88 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -93,6 +93,7 @@ l0_dgx_h100:
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=2]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_multi_instance[GSM8K]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_multi_instance[MMLU]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_bf16_4gpu[tp4ep4_cudagraph_overlap]
   - disaggregated/test_auto_scaling.py::test_service_discovery[etcd-round_robin]
   - disaggregated/test_auto_scaling.py::test_service_discovery[etcd-load_balancing]
   - disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin]
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h200.yml b/tests/integration/test_lists/test-db/l0_dgx_h200.yml
index efacea5814d..d0e45202d9f 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_h200.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_h200.yml
@@ -33,6 +33,7 @@ l0_dgx_h200:
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=4]
   - accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False]
   - accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_bf16_4gpu[tp4ep4_cudagraph_overlap]
   - disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[TinyLlama-1.1B-Chat-v1.0]
   - unittest/llmapi/test_llm_pytorch.py::test_nemotron_nas_lora
diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml
index 3898b101ea3..9c8d442401a 100644
--- a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml
+++ b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml
@@ -43,6 +43,11 @@ l0_gb200_multi_gpus:
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus_online_eplb[fp8]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep1-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[no_cuda_graph_overlap-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm]
 - condition:
     ranges:
       system_gpu_count:
diff --git a/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml b/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml
index 786b03e0e38..ececeab4ae8 100644
--- a/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml
+++ b/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml
@@ -43,6 +43,8 @@ l0_rtx_pro_6000:
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto]
   - accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp4
   - accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp8
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm]
 - condition:
     ranges:
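
For anyone reproducing the new NVFP4 coverage outside the pytest harness, the sketch below distills the LLM configuration that test_nvfp4 builds for its tp4ep4/CUTLASS variant (CUDA graphs on, overlap scheduler left at its default-enabled state). It is a minimal sketch, not part of this change: the import locations mirror the tensorrt_llm.llmapi imports already used by test_llm_api_pytorch.py, and the checkpoint path is a placeholder for a locally prepared qwen3-next-80b-instruct-nvfp4-ptq-fp8kv directory rather than a published artifact.

# Standalone sketch (assumptions: 4 visible GPUs, a local NVFP4 + FP8-KV
# checkpoint at CHECKPOINT, and the llmapi import layout used by the test file).
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig, MoeConfig

CHECKPOINT = "/path/to/qwen3-next-80b-instruct-nvfp4-ptq-fp8kv"  # placeholder

def main() -> None:
    llm = LLM(
        CHECKPOINT,
        tensor_parallel_size=4,       # "tp4" in the test ids
        moe_expert_parallel_size=4,   # "ep4" in the test ids
        max_num_tokens=16384,
        # Mirror the test: modest KV pool, no cross-request block reuse.
        kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.6,
                                      enable_block_reuse=False),
        # cuda_graph=True variant of the test matrix.
        cuda_graph_config=CudaGraphConfig(max_batch_size=512),
        moe_config=MoeConfig(backend="CUTLASS"),
    )
    with llm:
        for out in llm.generate(["Question: 2 + 2 = ? Answer:"]):
            print(out.outputs[0].text)

if __name__ == "__main__":
    main()

Note that enable_block_reuse=False appears in both tests; plausibly this keeps the accuracy runs independent of KV-cache reuse behavior, so each MMLU/GSM8K request is computed from scratch.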