2 changes: 2 additions & 0 deletions .github/workflows/scripts/config.yaml
@@ -166,3 +166,5 @@ e2e-multicard-4-cards:
estimated_time: 1391
- name: tests/e2e/multicard/4-cards/test_pipeline_parallel.py
estimated_time: 679
- name: tests/e2e/multicard/4-cards/test_profiling_chunk_performance.py
estimated_time: 1300
82 changes: 82 additions & 0 deletions tests/e2e/multicard/4-cards/test_profiling_chunk_performance.py
@@ -0,0 +1,82 @@
"""Performance guard for profiling-based dynamic chunk sizing (PP scenario).

Measures Time-To-First-Token (TTFT) on 64k-token prefill requests with
profiling_chunk_config enabled. The test runs against
Qwen3-30B-A3B served with PP=2, TP=2 (4 NPU cards total).

Test flow:
1. Create an LLM engine with profiling_chunk_config enabled.
2. Run NUM_WARMUP sequential requests (64k tokens, max_tokens=1) to warm
up both the NPU and the profiling predictor.
3. Run NUM_TEST sequential requests, recording TTFT for each.
4. Assert that the median TTFT does not exceed BASELINE_TTFT_S seconds.
"""

import os
import statistics
import time

from tests.e2e.conftest import VllmRunner

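# Spawn worker processes and allow a max_model_len above the model's default
# context length; FLASHCOMM1 enables the Ascend FlashComm communication path.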
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
os.environ["VLLM_ASCEND_ENABLE_FLASHCOMM1"] = "1"

MODEL = "Qwen/Qwen3-30B-A3B"

# ~64k-token prompt: 384_000 chars // 6 chars per word = 64,000 repetitions
# of "hello ", each roughly one token.
_WORD = "hello "
INPUT_64K_TOKENS = _WORD * (384_000 // len(_WORD))

NUM_WARMUP = 5
NUM_TEST = 5

# NOTE: Any changes to this baseline must be approved by team members.
# Measured on Qwen3-30B-A3B, PP=2, TP=2, 64k prefill, profiling_chunk enabled.
BASELINE_TTFT_S = 5.2


def test_profiling_chunk_ttft_performance() -> None:
with VllmRunner(
MODEL,
max_model_len=70000,
tensor_parallel_size=2,
pipeline_parallel_size=2,
block_size=128,
enable_expert_parallel=True,
enable_prefix_caching=False,
gpu_memory_utilization=0.9,
max_num_batched_tokens=12288,
distributed_executor_backend="mp",
enforce_eager=True,
async_scheduling=False,
additional_config={
    "profiling_chunk_config": {"enabled": True, "smooth_factor": 0.9},
    "enable_cpu_binding": False,
},
hf_overrides={
    "rope_parameters": {
        "rope_type": "yarn",
        "rope_theta": 1000,
        "factor": 5,
        "original_max_position_embeddings": 262144,
    }
},
) as vllm_model:
# With max_tokens=1, total latency ≈ prefill time ≈ TTFT
prompts = [INPUT_64K_TOKENS]

# ── Warmup ──────────────────────────────────────────────────────────
for _ in range(NUM_WARMUP):
vllm_model.generate_greedy(prompts, max_tokens=1)

# ── Measurement ─────────────────────────────────────────────────────
ttfts: list[float] = []
for _ in range(NUM_TEST):
start = time.perf_counter()
vllm_model.generate_greedy(prompts, max_tokens=1)
ttfts.append(time.perf_counter() - start)

median_ttft = statistics.median(ttfts)
ttft_str = ", ".join(f"{t:.2f}s" for t in ttfts)
print(
f"\n[profiling_chunk perf] TTFT per request: [{ttft_str}]"
f"\n[profiling_chunk perf] Median TTFT: {median_ttft:.2f}s "
f"(baseline: {BASELINE_TTFT_S}s)"
)

assert median_ttft <= BASELINE_TTFT_S, (
f"TTFT performance regression: median TTFT {median_ttft:.2f}s "
f"exceeds baseline {BASELINE_TTFT_S}s. "
f"Individual TTFTs: [{ttft_str}]"
)