From d4acfd07cc56177044af170955cd8d3aea96f48f Mon Sep 17 00:00:00 2001
From: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
Date: Wed, 1 Oct 2025 10:49:18 +0000
Subject: [PATCH] [None][test] Add accuracy test for Qwen3Next model

- Updated L0_Test.groovy to include auto_trigger for Qwen3Next.
- Added TestQwen3NextThinking class in test_llm_api_pytorch.py for accuracy testing.
- Updated l0_dgx_h100.yml to include new test case for Qwen3Next under specific conditions.

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
---
 jenkins/L0_Test.groovy                          |  3 ++
 .../defs/accuracy/references/gsm8k.yaml         |  2 ++
 .../defs/accuracy/references/mmlu.yaml          |  2 ++
 .../defs/accuracy/test_llm_api_pytorch.py       | 31 +++++++++++++++++++
 .../test_lists/test-db/l0_dgx_h100.yml          | 15 +++++++++
 5 files changed, 53 insertions(+)

diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index c2be29794ae..154cfdc2b12 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -1276,6 +1276,8 @@ def getMakoArgsFromStageName(stageName, parseSysinfo=false) {
         makoArgs += ["auto_trigger=deepseek"]
     } else if (stageName.contains("-GptOss-")) {
         makoArgs += ["auto_trigger=gpt_oss"]
+    } else if (stageName.contains("-Qwen3Next-")) {
+        makoArgs += ["auto_trigger=qwen3_next"]
     } else {
         makoArgs += ["auto_trigger=others"]
     }
@@ -2048,6 +2050,7 @@ def launchTestJobs(pipeline, testFilter)
         "DGX_H100-4_GPUs-PyTorch-DeepSeek-2": ["dgx-h100-x4", "l0_dgx_h100", 2, 2, 4],
         "DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2", "l0_dgx_h100", 1, 1, 2],
         "DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
+        "DGX_H100-4_GPUs-PyTorch-Qwen3Next-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-CPP-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "A10-PyTorch-1": ["a10", "l0_a10", 1, 1],
diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml
index 0b82ceff1dd..098cfb276b9 100644
--- a/tests/integration/defs/accuracy/references/gsm8k.yaml
+++ b/tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -123,6 +123,8 @@ Qwen3/Qwen3-235B-A22B:
     quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 85.78
+Qwen3/Qwen3-Next-80B-A3B-Thinking:
+  - accuracy: 81.577
 moonshotai/Kimi-K2-Instruct:
   - quant_algo: FP8_BLOCK_SCALES
     accuracy: 94.84
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index 430c5a18213..f70baa59313 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -229,6 +229,8 @@ Qwen3/Qwen3-235B-A22B:
     quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 86
+Qwen3/Qwen3-Next-80B-A3B-Thinking:
+  - accuracy: 86
 moonshotai/Kimi-K2-Instruct:
   - quant_algo: FP8_BLOCK_SCALES
     accuracy: 87.65
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index a75af54d272..3b7a3ee2316 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -3559,6 +3559,37 @@ def test_auto_dtype_tp4(self):
         task.evaluate(llm)
 
 
+@pytest.mark.skip_less_device_memory(80000)
+class TestQwen3NextThinking(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "Qwen3/Qwen3-Next-80B-A3B-Thinking"
+    MODEL_PATH = f"{llm_models_root()}/{MODEL_NAME}"
+
+    @skip_pre_hopper
+    @pytest.mark.skip_less_device(4)
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 4)],
+                             ids=["tp4ep4"])
+    def test_auto_dtype(self, tp_size, pp_size, ep_size):
+        if get_device_count() != tp_size * pp_size:
+            pytest.skip("Device count mismatch with world size")
+
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
+                                        enable_block_reuse=False)
+        cuda_graph_config = CudaGraphConfig(enable_padding=True,
+                                            max_batch_size=720)
+
+        with LLM(self.MODEL_PATH,
+                 max_num_tokens=4096,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 cuda_graph_config=cuda_graph_config) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestNano_V2_VLM(LlmapiAccuracyTestHarness):
     MODEL_NAME = "nvidia/Nano-v2-VLM"
     MODEL_PATH = f"{llm_models_root()}/Nano-v2-VLM"
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
index a07955a3c17..0b8fbfe976b 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -164,6 +164,21 @@ l0_dgx_h100:
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*h100*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: pre_merge
+      backend: pytorch
+      auto_trigger: qwen3_next
+  tests:
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextThinking::test_auto_dtype[tp4ep4]
 - condition:
     ranges:
       system_gpu_count:
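
For local runs outside the Jenkins stage and test-db machinery, the following is a minimal standalone sketch of the configuration that test_auto_dtype exercises. It assumes the TensorRT-LLM LLM API names the test file itself imports (LLM, KvCacheConfig, CudaGraphConfig); the checkpoint path and prompt are hypothetical placeholders, and a plain generate() call stands in for the MMLU/GSM8K accuracy-harness evaluation:

    # Sketch only: mirrors the tp4ep4 config of TestQwen3NextThinking.test_auto_dtype.
    from tensorrt_llm import LLM
    from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig


    def main():
        # Same KV cache settings as the test: cap the cache at 60% of free
        # GPU memory and disable block reuse.
        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
                                        enable_block_reuse=False)
        # Same CUDA graph settings: pad batches so captured graphs can be
        # reused across batch sizes up to 720.
        cuda_graph_config = CudaGraphConfig(enable_padding=True,
                                            max_batch_size=720)

        # tp4ep4: tensor parallel 4, pipeline parallel 1, MoE expert parallel 4,
        # i.e. a world size of 4 GPUs. The path is a placeholder; the test
        # resolves the real checkpoint via llm_models_root().
        with LLM("/path/to/Qwen3-Next-80B-A3B-Thinking",
                 max_num_tokens=4096,
                 tensor_parallel_size=4,
                 pipeline_parallel_size=1,
                 moe_expert_parallel_size=4,
                 kv_cache_config=kv_cache_config,
                 cuda_graph_config=cuda_graph_config) as llm:
            # Single smoke prompt in place of the accuracy tasks.
            for output in llm.generate(["Question: What is 7 * 6? Answer:"]):
                print(output.outputs[0].text)


    if __name__ == "__main__":
        main()

In CI, the equivalent invocation is selected by the l0_dgx_h100.yml condition block above (pre_merge stage, pytorch backend, auto_trigger: qwen3_next), which the new -Qwen3Next- Jenkins stage activates via getMakoArgsFromStageName.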