From 172823c4203972770a9ffb1c5f8f0a7647b24201 Mon Sep 17 00:00:00 2001
From: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>
Date: Thu, 17 Apr 2025 20:24:37 -0700
Subject: [PATCH] Update ds v3 parameters in stress test.

Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>
---
 .../defs/stress_test/stress_test.py           | 48 +++++++++++++++++--
 .../integration/test_lists/test-db/l0_a10.yml |  4 +-
 2 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/tests/integration/defs/stress_test/stress_test.py b/tests/integration/defs/stress_test/stress_test.py
index e9b4cc1f2a1..9c9482fe492 100644
--- a/tests/integration/defs/stress_test/stress_test.py
+++ b/tests/integration/defs/stress_test/stress_test.py
@@ -56,7 +56,7 @@
 #     [sys.executable, "-m", "pip", "install", "-r", requirements_file])
 
 # Define a constant for process termination timeouts
-GRACEFUL_TERMINATION_TIMEOUT = 10  # seconds - set longer when stress large model
+GRACEFUL_TERMINATION_TIMEOUT = 300  # seconds - set longer when stress large model
 
 
 @dataclass(frozen=True)
@@ -384,7 +384,34 @@ def stress_test(config, test_mode, server_config=None):
     )
 
     # Define test configurations
-    performance_config = PerformanceParams() if run_performance else None
+    performance_config = None
+    if run_performance:
+        performance_config = PerformanceParams()
+
+        # For ds v3 specific parameters
+        if "DeepSeek-V3" in config.model_dir:
+            performance_config = PerformanceParams(
+                test_timeout=
+                36000  # 10 hours for ds v3, change this value if needed
+            )
+
+    # For ds v3 specific server parameters
+    if "DeepSeek-V3" in config.model_dir:
+        test_server_config = ServerConfig(
+            port=test_server_config.port,
+            host=test_server_config.host,
+            pp_size=test_server_config.pp_size,
+            ep_size=8,  # DeepSeek-V3 specific ep_size
+            max_batch_size=161,  # DeepSeek-V3 specific max_batch_size
+            max_num_tokens=1160,  # DeepSeek-V3 specific max_num_tokens
+            kv_cache_free_gpu_memory_fraction=
+            0.7,  # DeepSeek-V3 specific kv_cache fraction
+            capacity_scheduler_policy=test_server_config.
+            capacity_scheduler_policy,
+            wait_interval=test_server_config.wait_interval,
+            max_wait_seconds=7200,  # DeepSeek-V3 specific wait time (2 hours)
+            health_check_timeout=test_server_config.health_check_timeout)
+
     stress_config = StressTestConfig(
         model_config=config,
         server_config=test_server_config) if run_stress else None
@@ -405,7 +432,7 @@ def stress_test(config, test_mode, server_config=None):
     if not os.path.exists(model_path):
         raise RuntimeError(f"Model path does not exist: {model_path}")
 
-    # Create a temporary YAML file for 'capacity_scheduler_policy'
+    # Create a temporary YAML file for extra_llm_options
     extra_llm_options = {
         "scheduler_config": {
             "capacity_scheduler_policy":
@@ -413,6 +440,21 @@ def stress_test(config, test_mode, server_config=None):
         }
     }
+    # Add DeepSeek-V3 specific configuration
+    if "DeepSeek-V3" in config.model_dir:
+
+        extra_llm_options["enable_attention_dp"] = True
+
+        if config.backend == "pytorch":
+            extra_llm_options["pytorch_backend_config"] = {
+                "use_cuda_graph": True,
+                "cuda_graph_padding_enabled": True,
+                "cuda_graph_batch_sizes":
+                [1, 2, 4, 8, 16, 32, 64, 128, 256, 384],
+                "print_iter_log": True,
+                "enable_overlap_scheduler": True
+            }
+
     with tempfile.NamedTemporaryFile(mode='w',
                                      suffix='.yaml',
                                      delete=False) as temp_file:
         yaml.dump(extra_llm_options, temp_file)
diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml
index e867c4a6cc7..15f410406c3 100644
--- a/tests/integration/test_lists/test-db/l0_a10.yml
+++ b/tests/integration/test_lists/test-db/l0_a10.yml
@@ -19,7 +19,7 @@ l0_a10:
       - disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0]
       - disaggregated/test_disaggregated.py::test_disaggregated_overlap[TinyLlama-1.1B-Chat-v1.0]
       - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-MAX_UTILIZATION-pytorch-stress-test]
-      - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-GUARANTEED_NO_EVICT-pytorch-stress-stage-alone]
+      - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-GUARANTEED_NO_EVICT-pytorch-stress-test]
 - condition:
     ranges:
       system_gpu_count:
@@ -111,7 +111,7 @@ l0_a10:
       - examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-130m-float16-enable_gemm_plugin]
       - examples/test_mamba.py::test_llm_mamba_1gpu[mamba-codestral-7B-v0.1-float16-enable_gemm_plugin] # 3 mins
       - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-MAX_UTILIZATION-trt-stress-test]
-      - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-GUARANTEED_NO_EVICT-trt-stress-stage-alone]
+      - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-GUARANTEED_NO_EVICT-trt-stress-test]
 - condition:
     ranges:
       system_gpu_count:
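
Reviewer note, a minimal sketch that is not part of the patch: the hunks above
assemble an extra_llm_options dict and dump it to a temporary YAML file for the
server to consume. The standalone snippet below reproduces what that file would
contain for a DeepSeek-V3 run on the pytorch backend; the GUARANTEED_NO_EVICT
policy value is an assumed example (the test parametrizes this), and printing
the temp-file path is for illustration only.

import tempfile

import yaml

# Mirror the dict built in stress_test.py for a DeepSeek-V3 + pytorch run.
extra_llm_options = {
    "scheduler_config": {
        # Assumed example value; the stress test parametrizes this policy.
        "capacity_scheduler_policy": "GUARANTEED_NO_EVICT",
    },
    # DeepSeek-V3 specific: enable attention data parallelism.
    "enable_attention_dp": True,
    # pytorch backend only: CUDA-graph and scheduler settings from the patch.
    "pytorch_backend_config": {
        "use_cuda_graph": True,
        "cuda_graph_padding_enabled": True,
        "cuda_graph_batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256, 384],
        "print_iter_log": True,
        "enable_overlap_scheduler": True,
    },
}

# Same mechanism as the patch: persist the options to a temporary YAML file
# whose path is then handed to the server process.
with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml',
                                 delete=False) as temp_file:
    yaml.dump(extra_llm_options, temp_file)
    print(temp_file.name)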