[benchmark] Port benchmark request sent optimization to benchmark_serving (vllm-project#21209)

Jialin · npanpaliya · commit 11aeb0eaef7a · 2025-08-06T10:15:39.000-04:00
Signed-off-by: Jialin Ouyang &lt;Jialin.Ouyang@gmail.com&gt;
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
@@ -30,7 +30,7 @@
 import random
 import time
 import warnings
-from collections.abc import AsyncGenerator, Iterable
+from collections.abc import Iterable
 from dataclasses import dataclass
 from datetime import datetime
 from typing import Any, Literal, Optional
@@ -73,6 +73,7 @@
     VisionArenaDataset,
 )
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
+from vllm.benchmarks.serve import get_request
 
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
 
@@ -107,101 +108,6 @@ class BenchmarkMetrics:
     percentiles_e2el_ms: list[tuple[float, float]]
 
 
-def _get_current_request_rate(
-    ramp_up_strategy: Optional[Literal["linear", "exponential"]],
-    ramp_up_start_rps: Optional[int],
-    ramp_up_end_rps: Optional[int],
-    request_index: int,
-    total_requests: int,
-    request_rate: float,
-) -> float:
-    if (
-        ramp_up_strategy
-        and ramp_up_start_rps is not None
-        and ramp_up_end_rps is not None
-    ):
-        progress = request_index / max(total_requests - 1, 1)
-        if ramp_up_strategy == "linear":
-            increase = (ramp_up_end_rps - ramp_up_start_rps) * progress
-            return ramp_up_start_rps + increase
-        elif ramp_up_strategy == "exponential":
-            ratio = ramp_up_end_rps / ramp_up_start_rps
-            return ramp_up_start_rps * (ratio**progress)
-        else:
-            raise ValueError(f"Unknown ramp-up strategy: {ramp_up_strategy}")
-    return request_rate
-
-
-async def get_request(
-    input_requests: list[SampleRequest],
-    request_rate: float,
-    burstiness: float = 1.0,
-    ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
-    ramp_up_start_rps: Optional[int] = None,
-    ramp_up_end_rps: Optional[int] = None,
-) -> AsyncGenerator[tuple[SampleRequest, float], None]:
-    """
-    Asynchronously generates requests at a specified rate
-    with OPTIONAL burstiness and OPTIONAL ramp-up strategy.
-
-    Args:
-        input_requests:
-            A list of input requests, each represented as a SampleRequest.
-        request_rate:
-            The rate at which requests are generated (requests/s).
-        burstiness (optional):
-            The burstiness factor of the request generation.
-            Only takes effect when request_rate is not inf.
-            Default value is 1, which follows a Poisson process.
-            Otherwise, the request intervals follow a gamma distribution.
-            A lower burstiness value (0 < burstiness < 1) results
-            in more bursty requests, while a higher burstiness value
-            (burstiness > 1) results in a more uniform arrival of requests.
-         ramp_up_strategy (optional):
-            The ramp-up strategy. Can be "linear" or "exponential".
-            If None, uses constant request rate (specified by request_rate).
-        ramp_up_start_rps (optional):
-            The starting request rate for ramp-up.
-        ramp_up_end_rps (optional):
-            The ending request rate for ramp-up.
-    """
-    assert burstiness > 0, (
-        f"A positive burstiness factor is expected, but given {burstiness}."
-    )
-    # Convert to list to get length for ramp-up calculations
-    if isinstance(input_requests, Iterable) and not isinstance(input_requests, list):
-        input_requests = list(input_requests)
-
-    total_requests = len(input_requests)
-    request_index = 0
-
-    for request in input_requests:
-        current_request_rate = _get_current_request_rate(
-            ramp_up_strategy,
-            ramp_up_start_rps,
-            ramp_up_end_rps,
-            request_index,
-            total_requests,
-            request_rate,
-        )
-
-        yield request, current_request_rate
-
-        request_index += 1
-
-        if current_request_rate == float("inf"):
-            # If the request rate is infinity, then we don't need to wait.
-            continue
-
-        theta = 1.0 / (current_request_rate * burstiness)
-
-        # Sample the request interval from the gamma distribution.
-        # If burstiness is 1, it follows exponential distribution.
-        interval = np.random.gamma(shape=burstiness, scale=theta)
-        # The next request will be sent after the interval.
-        await asyncio.sleep(interval)
-
-
 def calculate_metrics(
     input_requests: list[SampleRequest],
     outputs: list[RequestFuncOutput],
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
@@ -179,12 +179,12 @@ async def get_request(
         delay_ts = [delay * normalize_factor for delay in delay_ts]
 
     start_ts = time.time()
-    request_index = 0
     for request_index, request in enumerate(input_requests):
-        current_ts = time.time()
-        sleep_interval_s = start_ts + delay_ts[request_index] - current_ts
-        if sleep_interval_s > 0:
-            await asyncio.sleep(sleep_interval_s)
+        if delay_ts[request_index] > 0:
+            current_ts = time.time()
+            sleep_interval_s = start_ts + delay_ts[request_index] - current_ts
+            if sleep_interval_s > 0:
+                await asyncio.sleep(sleep_interval_s)
         yield request, request_rates[request_index]