From 172823c4203972770a9ffb1c5f8f0a7647b24201 Mon Sep 17 00:00:00 2001
From: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>
Date: Thu, 17 Apr 2025 20:24:37 -0700
Subject: [PATCH] Update ds v3 parameters in stress test.

Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>
---
 .../defs/stress_test/stress_test.py           | 48 +++++++++++++++++--
 .../integration/test_lists/test-db/l0_a10.yml |  4 +-
 2 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/tests/integration/defs/stress_test/stress_test.py b/tests/integration/defs/stress_test/stress_test.py
index e9b4cc1f2a1..9c9482fe492 100644
--- a/tests/integration/defs/stress_test/stress_test.py
+++ b/tests/integration/defs/stress_test/stress_test.py
@@ -56,7 +56,7 @@
 #     [sys.executable, "-m", "pip", "install", "-r", requirements_file])
 
 # Define a constant for process termination timeouts
-GRACEFUL_TERMINATION_TIMEOUT = 10  # seconds - set longer when stress large model
+GRACEFUL_TERMINATION_TIMEOUT = 300  # seconds - set longer when stress large model
 
 
 @dataclass(frozen=True)
@@ -384,7 +384,34 @@ def stress_test(config, test_mode, server_config=None):
     )
 
     # Define test configurations
-    performance_config = PerformanceParams() if run_performance else None
+    performance_config = None
+    if run_performance:
+        performance_config = PerformanceParams()
+
+        # For ds v3 specific parameters
+        if "DeepSeek-V3" in config.model_dir:
+            performance_config = PerformanceParams(
+                test_timeout=
+                36000  # 10 hours for ds v3, change this value if needed
+            )
+
+    # For ds v3 specific server parameters
+    if "DeepSeek-V3" in config.model_dir:
+        test_server_config = ServerConfig(
+            port=test_server_config.port,
+            host=test_server_config.host,
+            pp_size=test_server_config.pp_size,
+            ep_size=8,  # DeepSeek-V3 specific ep_size
+            max_batch_size=161,  # DeepSeek-V3 specific max_batch_size
+            max_num_tokens=1160,  # DeepSeek-V3 specific max_num_tokens
+            kv_cache_free_gpu_memory_fraction=
+            0.7,  # DeepSeek-V3 specific kv_cache fraction
+            capacity_scheduler_policy=test_server_config.
+            capacity_scheduler_policy,
+            wait_interval=test_server_config.wait_interval,
+            max_wait_seconds=7200,  # DeepSeek-V3 specific wait time (2 hours)
+            health_check_timeout=test_server_config.health_check_timeout)
+
     stress_config = StressTestConfig(
         model_config=config,
         server_config=test_server_config) if run_stress else None
@@ -405,7 +432,7 @@ def stress_test(config, test_mode, server_config=None):
     if not os.path.exists(model_path):
         raise RuntimeError(f"Model path does not exist: {model_path}")
 
-    # Create a temporary YAML file for 'capacity_scheduler_policy'
+    # Create a temporary YAML file for extra_llm_options
     extra_llm_options = {
         "scheduler_config": {
             "capacity_scheduler_policy":
@@ -413,6 +440,21 @@ def stress_test(config, test_mode, server_config=None):
         }
     }
+    # Add DeepSeek-V3 specific configuration
+    if "DeepSeek-V3" in config.model_dir:
+
+        extra_llm_options["enable_attention_dp"] = True
+
+        if config.backend == "pytorch":
+            extra_llm_options["pytorch_backend_config"] = {
+                "use_cuda_graph": True,
+                "cuda_graph_padding_enabled": True,
+                "cuda_graph_batch_sizes":
+                [1, 2, 4, 8, 16, 32, 64, 128, 256, 384],
+                "print_iter_log": True,
+                "enable_overlap_scheduler": True
+            }
+
     with tempfile.NamedTemporaryFile(mode='w',
                                      suffix='.yaml',
                                      delete=False) as temp_file:
         yaml.dump(extra_llm_options, temp_file)
diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml
index e867c4a6cc7..15f410406c3 100644
--- a/tests/integration/test_lists/test-db/l0_a10.yml
+++ b/tests/integration/test_lists/test-db/l0_a10.yml
@@ -19,7 +19,7 @@ l0_a10:
       - disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0]
       - disaggregated/test_disaggregated.py::test_disaggregated_overlap[TinyLlama-1.1B-Chat-v1.0]
       - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-MAX_UTILIZATION-pytorch-stress-test]
-      - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-GUARANTEED_NO_EVICT-pytorch-stress-stage-alone]
+      - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-GUARANTEED_NO_EVICT-pytorch-stress-test]
 - condition:
     ranges:
       system_gpu_count:
@@ -111,7 +111,7 @@ l0_a10:
       - examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-130m-float16-enable_gemm_plugin]
       - examples/test_mamba.py::test_llm_mamba_1gpu[mamba-codestral-7B-v0.1-float16-enable_gemm_plugin] # 3 mins
       - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-MAX_UTILIZATION-trt-stress-test]
-      - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-GUARANTEED_NO_EVICT-trt-stress-stage-alone]
+      - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-GUARANTEED_NO_EVICT-trt-stress-test]
 - condition:
     ranges:
       system_gpu_count:
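
Reviewer note, a minimal sketch that is not part of the patch: the hunks above
assemble an extra_llm_options dict and dump it to a temporary YAML file for the
server to consume. The standalone snippet below reproduces what that file would
contain for a DeepSeek-V3 run on the pytorch backend; the GUARANTEED_NO_EVICT
policy value is an assumed example (the test parametrizes this), and printing
the temp-file path is for illustration only.

import tempfile

import yaml

# Mirror the dict built in stress_test.py for a DeepSeek-V3 + pytorch run.
extra_llm_options = {
    "scheduler_config": {
        # Assumed example value; the stress test parametrizes this policy.
        "capacity_scheduler_policy": "GUARANTEED_NO_EVICT",
    },
    # DeepSeek-V3 specific: enable attention data parallelism.
    "enable_attention_dp": True,
    # pytorch backend only: CUDA-graph and scheduler settings from the patch.
    "pytorch_backend_config": {
        "use_cuda_graph": True,
        "cuda_graph_padding_enabled": True,
        "cuda_graph_batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256, 384],
        "print_iter_log": True,
        "enable_overlap_scheduler": True,
    },
}

# Same mechanism as the patch: persist the options to a temporary YAML file
# whose path is then handed to the server process.
with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml',
                                 delete=False) as temp_file:
    yaml.dump(extra_llm_options, temp_file)
    print(temp_file.name)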