From 3ac29fbc06cb8462a33c1ac600750bd47ff042ba Mon Sep 17 00:00:00 2001 From: guozr Date: Tue, 3 Feb 2026 16:08:15 +0800 Subject: [PATCH] [CI] Add long-context prompt test for DeepSeek-V3.2 W8A8 Signed-off-by: guozr --- .../2-cards/test_offline_inference_distributed.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py b/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py index 77e28ece331..8599002530f 100644 --- a/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py +++ b/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py @@ -245,14 +245,19 @@ def test_qwen3_dense_prefetch_mlp_weight_tp2(model): @patch.dict(os.environ, {"ASCEND_AGGREGATE_ENABLE": "1"}) @patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"}) def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep(): - example_prompts = [ - "Hello, my name is", + short_example_prompts = [ + "Hello ", ] - max_tokens = 5 + # "max_position_embeddings": 163840, + long_example_prompts = [ + "Hello " * (163839 - 500) + "Hello" + ] + max_tokens = 500 with VllmRunner("vllm-ascend/DeepSeek-V3.2-W8A8-Pruning", tensor_parallel_size=2, quantization="ascend", enable_expert_parallel=True, + max_model_len=163840, compilation_config={ "cudagraph_capture_sizes": [3, 6, 9, 12], "cudagraph_mode": "FULL_DECODE_ONLY" @@ -266,7 +271,8 @@ def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep(): }, reasoning_parser="deepseek_v3", tokenizer_mode="deepseek_v32") as vllm_model: - vllm_model.generate_greedy(example_prompts, max_tokens) + vllm_model.generate_greedy(short_example_prompts, max_tokens) + vllm_model.generate_greedy(long_example_prompts, max_tokens) @pytest.mark.parametrize("model", QWEN_W4A4_MODELS)