From d5b0b4584236a4895ec54dd7a5a4c0345dbfe42f Mon Sep 17 00:00:00 2001
From: "Xin He (SW-GPU)" <200704525+xinhe-nv@users.noreply.github.com>
Date: Mon, 11 Aug 2025 15:21:14 +0800
Subject: [PATCH] improve hang tests

Signed-off-by: Xin He (SW-GPU) <200704525+xinhe-nv@users.noreply.github.com>
---
 .../defs/accuracy/test_disaggregated_serving.py         | 8 +++++++-
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 5 +++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py
index e0801302eba..3044c6e07dc 100644
--- a/tests/integration/defs/accuracy/test_disaggregated_serving.py
+++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -505,11 +505,12 @@ def test_guided_decoding_with_eagle3(self, backend: str, mocker):
         task = JsonModeEval(self.MODEL_NAME)
         task.evaluate(llm)
 
-    @pytest.mark.skip_less_device(2)
     @pytest.mark.parametrize("tp,pp", [(1, 2), (2, 1), (2, 2)],
                              ids=["tp1pp2", "tp2pp1", "tp2pp2"])
     @pytest.mark.parametrize("testset", ["GSM8K", "MMLU"])
     def test_tp_pp_symmetric(self, tp, pp, testset):
+        if tp * pp * 2 > get_device_count():
+            pytest.skip(f"Not enough devices for tp={tp}*pp={pp} test")
         return run_parallel_test(self.MODEL_NAME, self.MODEL_PATH, pp, tp, pp,
                                  tp, get_accuracy_task(testset))
 
@@ -517,6 +518,9 @@ def test_tp_pp_symmetric(self, tp, pp, testset):
     @parametrize_with_ids("gen_tp", [1, 2])
     @pytest.mark.parametrize("testset", ["GSM8K", "MMLU"])
     def test_ctx_pp_gen_tp_asymmetric(self, ctx_pp, gen_tp, testset):
+        if ctx_pp * gen_tp * 2 > get_device_count():
+            pytest.skip(
+                f"Not enough devices for ctx_pp={ctx_pp}*gen_tp={gen_tp} test")
         return run_parallel_test(self.MODEL_NAME, self.MODEL_PATH, ctx_pp, 1,
                                  1, gen_tp, get_accuracy_task(testset))
 
@@ -527,6 +531,7 @@ class TestLlama4ScoutInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
     MODEL_PATH = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct"
 
+    @pytest.mark.skip_less_device(8)
     @pytest.mark.parametrize("overlap_scheduler", [False, True])
     def test_auto_dtype(self, overlap_scheduler):
         ctx_server_config = {"disable_overlap_scheduler": True}
@@ -565,6 +570,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     MODEL_NAME = "deepseek-ai/DeepSeek-V3-Lite"
     MODEL_PATH = f"{llm_models_root()}/DeepSeek-V3-Lite/bf16"
 
+    @pytest.mark.skip_less_device(8)
     @parametrize_with_ids("overlap_scheduler", [True, False])
     @parametrize_with_ids("mtp_nextn",
                           [0, pytest.param(2, marks=skip_pre_hopper)])
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 256b31654e1..ab329f11800 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -250,7 +250,7 @@ def test_eagle3(self, overlap_scheduler, eagle3_one_model):
                                  enable_padding=True),
         )
         kv_cache_config = KvCacheConfig(
-            enable_block_reuse=True
+            enable_block_reuse=True, free_gpu_memory_fraction=0.8
         )  # both one-model and two-model supports this feature
         eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.1-Instruct-8B"
@@ -280,7 +280,8 @@ def test_ngram(self):
             cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
         )
 
-        kv_cache_config = KvCacheConfig(enable_block_reuse=False)
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.8)
 
         spec_config = NGramDecodingConfig(
             max_draft_len=4,
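
Note on the skip pattern (a sketch for review, not part of the patch): the change replaces the static @pytest.mark.skip_less_device(2) marker with a runtime guard, so the device requirement scales with each parametrization (tp=1,pp=2 still runs on 2 GPUs, while tp=2,pp=2 is skipped on anything smaller than 8). The factor of 2 appears to account for run_parallel_test launching both a context and a generation instance, each needing tp * pp GPUs, as the ctx/gen arguments suggest. A minimal self-contained sketch of the pattern, assuming get_device_count() simply wraps torch.cuda.device_count() (the real tests import their own helper from the test utilities):

    import pytest
    import torch


    def get_device_count() -> int:
        # Assumed stand-in for the test suite's helper; the actual
        # implementation in the repo may differ.
        return torch.cuda.device_count()


    @pytest.mark.parametrize("tp,pp", [(1, 2), (2, 1), (2, 2)],
                             ids=["tp1pp2", "tp2pp1", "tp2pp2"])
    def test_tp_pp_symmetric_sketch(tp, pp):
        # Context and generation servers each need tp * pp GPUs, hence 2x.
        if tp * pp * 2 > get_device_count():
            pytest.skip(f"Not enough devices for tp={tp}*pp={pp} test")
        # ... run the actual parallel test here ...

Compared with a fixed marker, the runtime check lets the cheap configurations keep running on small CI machines instead of skipping every parametrization at the level required by the largest one.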