From d4acfd07cc56177044af170955cd8d3aea96f48f Mon Sep 17 00:00:00 2001
From: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
Date: Wed, 1 Oct 2025 10:49:18 +0000
Subject: [PATCH] [None][test] Add accuracy test for Qwen3Next model

- Updated L0_Test.groovy to include auto_trigger for Qwen3Next.
- Added TestQwen3NextThinking class in test_llm_api_pytorch.py for accuracy testing.
- Updated l0_dgx_h100.yml to include new test case for Qwen3Next under specific conditions.

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
---
 jenkins/L0_Test.groovy                          |  3 ++
 .../defs/accuracy/references/gsm8k.yaml         |  2 ++
 .../defs/accuracy/references/mmlu.yaml          |  2 ++
 .../defs/accuracy/test_llm_api_pytorch.py       | 31 +++++++++++++++++++
 .../test_lists/test-db/l0_dgx_h100.yml          | 15 +++++++++
 5 files changed, 53 insertions(+)

diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index c2be29794ae..154cfdc2b12 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -1276,6 +1276,8 @@ def getMakoArgsFromStageName(stageName, parseSysinfo=false) {
         makoArgs += ["auto_trigger=deepseek"]
     } else if (stageName.contains("-GptOss-")) {
         makoArgs += ["auto_trigger=gpt_oss"]
+    } else if (stageName.contains("-Qwen3Next-")) {
+        makoArgs += ["auto_trigger=qwen3_next"]
     } else {
         makoArgs += ["auto_trigger=others"]
     }
@@ -2048,6 +2050,7 @@ def launchTestJobs(pipeline, testFilter)
         "DGX_H100-4_GPUs-PyTorch-DeepSeek-2": ["dgx-h100-x4", "l0_dgx_h100", 2, 2, 4],
         "DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2", "l0_dgx_h100", 1, 1, 2],
         "DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
+        "DGX_H100-4_GPUs-PyTorch-Qwen3Next-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-CPP-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "A10-PyTorch-1": ["a10", "l0_a10", 1, 1],
diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml
index 0b82ceff1dd..098cfb276b9 100644
--- a/tests/integration/defs/accuracy/references/gsm8k.yaml
+++ b/tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -123,6 +123,8 @@ Qwen3/Qwen3-235B-A22B:
     quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 85.78
+Qwen3/Qwen3-Next-80B-A3B-Thinking:
+  - accuracy: 81.577
 moonshotai/Kimi-K2-Instruct:
   - quant_algo: FP8_BLOCK_SCALES
     accuracy: 94.84
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index 430c5a18213..f70baa59313 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -229,6 +229,8 @@ Qwen3/Qwen3-235B-A22B:
     quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 86
+Qwen3/Qwen3-Next-80B-A3B-Thinking:
+  - accuracy: 86
 moonshotai/Kimi-K2-Instruct:
   - quant_algo: FP8_BLOCK_SCALES
     accuracy: 87.65
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index a75af54d272..3b7a3ee2316 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -3559,6 +3559,37 @@ def test_auto_dtype_tp4(self):
         task.evaluate(llm)
 
 
+@pytest.mark.skip_less_device_memory(80000)
+class TestQwen3NextThinking(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "Qwen3/Qwen3-Next-80B-A3B-Thinking"
+    MODEL_PATH = f"{llm_models_root()}/{MODEL_NAME}"
+
+    @skip_pre_hopper
+    @pytest.mark.skip_less_device(4)
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 4)],
+                             ids=["tp4ep4"])
+    def test_auto_dtype(self, tp_size, pp_size, ep_size):
+        if get_device_count() != tp_size * pp_size:
+            pytest.skip("Device count mismatch with world size")
+
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
+                                        enable_block_reuse=False)
+        cuda_graph_config = CudaGraphConfig(enable_padding=True,
+                                            max_batch_size=720)
+
+        with LLM(self.MODEL_PATH,
+                 max_num_tokens=4096,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 cuda_graph_config=cuda_graph_config) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestNano_V2_VLM(LlmapiAccuracyTestHarness):
     MODEL_NAME = "nvidia/Nano-v2-VLM"
     MODEL_PATH = f"{llm_models_root()}/Nano-v2-VLM"
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
index a07955a3c17..0b8fbfe976b 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -164,6 +164,21 @@ l0_dgx_h100:
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*h100*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: pre_merge
+      backend: pytorch
+      auto_trigger: qwen3_next
+  tests:
+  - accuracy/test_llm_api_pytorch.py::TestQwen3NextThinking::test_auto_dtype[tp4ep4]
 - condition:
     ranges:
       system_gpu_count:
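
For local runs outside the Jenkins stage and test-db machinery, the following is a minimal standalone sketch of the configuration that test_auto_dtype exercises. It assumes the TensorRT-LLM LLM API names the test file itself imports (LLM, KvCacheConfig, CudaGraphConfig); the checkpoint path and prompt are hypothetical placeholders, and a plain generate() call stands in for the MMLU/GSM8K accuracy-harness evaluation:

    # Sketch only: mirrors the tp4ep4 config of TestQwen3NextThinking.test_auto_dtype.
    from tensorrt_llm import LLM
    from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig


    def main():
        # Same KV cache settings as the test: cap the cache at 60% of free
        # GPU memory and disable block reuse.
        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
                                        enable_block_reuse=False)
        # Same CUDA graph settings: pad batches so captured graphs can be
        # reused across batch sizes up to 720.
        cuda_graph_config = CudaGraphConfig(enable_padding=True,
                                            max_batch_size=720)

        # tp4ep4: tensor parallel 4, pipeline parallel 1, MoE expert parallel 4,
        # i.e. a world size of 4 GPUs. The path is a placeholder; the test
        # resolves the real checkpoint via llm_models_root().
        with LLM("/path/to/Qwen3-Next-80B-A3B-Thinking",
                 max_num_tokens=4096,
                 tensor_parallel_size=4,
                 pipeline_parallel_size=1,
                 moe_expert_parallel_size=4,
                 kv_cache_config=kv_cache_config,
                 cuda_graph_config=cuda_graph_config) as llm:
            # Single smoke prompt in place of the accuracy tasks.
            for output in llm.generate(["Question: What is 7 * 6? Answer:"]):
                print(output.outputs[0].text)


    if __name__ == "__main__":
        main()

In CI, the equivalent invocation is selected by the l0_dgx_h100.yml condition block above (pre_merge stage, pytorch backend, auto_trigger: qwen3_next), which the new -Qwen3Next- Jenkins stage activates via getMakoArgsFromStageName.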