@@ -985,7 +985,7 @@ def test_chunked_prefill(self):


@skip_pre_blackwell
-@pytest.mark.timeout(DEFAULT_TEST_TIMEOUT)
+@pytest.mark.timeout(3600)
class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness):
FP4_MODEL = f"{llm_models_root()}/Qwen3/saved_models_Qwen3-30B-A3B_nvfp4_hf"
FP8_MODEL = f"{llm_models_root()}/Qwen3/saved_models_Qwen3-30B-A3B_fp8_hf"
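For context, a minimal sketch of what the changed decorator does (assuming the pytest-timeout plugin, which provides pytest.mark.timeout, is installed and the suite runs under pytest). The test name and body below are hypothetical, for illustration only; in the diff the marker is applied to the whole TestQwen3_30B_A3B class, capping each test in it at a fixed 3600-second wall-clock limit instead of the shared DEFAULT_TEST_TIMEOUT constant.

import time

import pytest


@pytest.mark.timeout(3600)  # fail the test if it runs longer than 3600 seconds
def test_long_running_job():  # hypothetical test, for illustration only
    time.sleep(1)  # stands in for the real workload
    assert True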
13 changes: 4 additions & 9 deletions tests/integration/defs/disaggregated/test_disaggregated.py
@@ -1289,8 +1289,7 @@ def run_disaggregated_benchmark(example_dir,
random_input_len=16,
random_output_len=64,
num_prompts=100,
-max_concurrency=32,
-skip_warmup=False):
+max_concurrency=32):
"""Run disaggregated test with given configuration."""
run_env = env.copy()
run_env["UCX_TLS"] = "^ib"
@@ -1320,7 +1319,7 @@ def run_disaggregated_benchmark(example_dir,
stderr=subprocess.STDOUT,
env=run_env,
cwd=cwd) as server_proc):
-# Ensure the sever has started
+
client_dir = f"{example_dir}/clients"
client_cmd = [
'python3', f'{client_dir}/disagg_client.py', '-c',
Expand All @@ -1329,7 +1328,7 @@ def run_disaggregated_benchmark(example_dir,
'--server-start-timeout',
str(server_start_timeout)
]
-# Warm up
+# Ensure the server has started and workers are ready
check_call(client_cmd,
env=env,
poll_procs=[workers_proc, server_proc])
@@ -1366,9 +1365,6 @@ def run_disaggregated_benchmark(example_dir,
'--percentile-metrics',
'e2el,ttft',
]
-# warm up
-if not skip_warmup:
-    check_call(benchmark_cmd, env=env)
output = check_output(benchmark_cmd, env=env)
e2el_pattern = r"Median E2EL \(ms\):\s*(\d+\.?\d*)"
ttft_pattern = r"Median TTFT \(ms\):\s*(\d+\.?\d*)"
@@ -1513,8 +1509,7 @@ def test_disaggregated_deepseek_v3_lite_bf16_empty_batch(
num_prompts=10,
max_concurrency=10,
random_input_len=384,
-random_output_len=1536,
-skip_warmup=True)
+random_output_len=1536)
print(f"E2EL: {e2el} ms, TTFT: {ttft} ms")

assert e2el > 0 and ttft > 0
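For reference, a minimal standalone sketch of the metric extraction that run_disaggregated_benchmark performs with the regex patterns shown above. The sample_output string here is fabricated purely for illustration; in the real helper the text comes from the benchmark command's stdout, and the final assertion mirrors the check in the test above.

import re

# Hypothetical benchmark output, for illustration only.
sample_output = """
Median TTFT (ms): 123.4
Median E2EL (ms): 567.8
"""

e2el_pattern = r"Median E2EL \(ms\):\s*(\d+\.?\d*)"
ttft_pattern = r"Median TTFT \(ms\):\s*(\d+\.?\d*)"

e2el = float(re.search(e2el_pattern, sample_output).group(1))
ttft = float(re.search(ttft_pattern, sample_output).group(1))
assert e2el > 0 and ttft > 0  # mirrors the assertion in the test above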
1 change: 0 additions & 1 deletion tests/integration/test_lists/waives.txt
@@ -338,7 +338,6 @@ triton_server/test_triton_llm.py::test_llmapi_backend[4-0-disableDecoupleMode-te
triton_server/test_triton_llm.py::test_llmapi_backend[1-0-disableDecoupleMode-tensorrt_llm] SKIP (https://nvbugs/5461874)
triton_server/test_triton_llm.py::test_llmapi_backend[1-0-enableDecoupleMode-tensorrt_llm] SKIP (https://nvbugs/5461874)
cpp/test_e2e.py::test_benchmarks[gpt-80] SKIP (https://nvbugs/5601670)
-disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_empty_batch[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5601682)
disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5587574)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm-fp8] SKIP (https://nvbugs/5608790)
full:H20-3e/accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_auto_dtype[tp8ep4-cuda_graph=True] SKIP (slow I/O)