|
30 | 30 | import random |
31 | 31 | import time |
32 | 32 | import warnings |
33 | | -from collections.abc import AsyncGenerator, Iterable |
| 33 | +from collections.abc import Iterable |
34 | 34 | from dataclasses import dataclass |
35 | 35 | from datetime import datetime |
36 | 36 | from typing import Any, Literal, Optional |
|
73 | 73 | VisionArenaDataset, |
74 | 74 | ) |
75 | 75 | from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json |
| 76 | +from vllm.benchmarks.serve import get_request |
76 | 77 |
|
77 | 78 | MILLISECONDS_TO_SECONDS_CONVERSION = 1000 |
78 | 79 |
|
@@ -107,101 +108,6 @@ class BenchmarkMetrics: |
107 | 108 | percentiles_e2el_ms: list[tuple[float, float]] |
108 | 109 |
|
109 | 110 |
|
110 | | -def _get_current_request_rate( |
111 | | - ramp_up_strategy: Optional[Literal["linear", "exponential"]], |
112 | | - ramp_up_start_rps: Optional[int], |
113 | | - ramp_up_end_rps: Optional[int], |
114 | | - request_index: int, |
115 | | - total_requests: int, |
116 | | - request_rate: float, |
117 | | -) -> float: |
118 | | - if ( |
119 | | - ramp_up_strategy |
120 | | - and ramp_up_start_rps is not None |
121 | | - and ramp_up_end_rps is not None |
122 | | - ): |
123 | | - progress = request_index / max(total_requests - 1, 1) |
124 | | - if ramp_up_strategy == "linear": |
125 | | - increase = (ramp_up_end_rps - ramp_up_start_rps) * progress |
126 | | - return ramp_up_start_rps + increase |
127 | | - elif ramp_up_strategy == "exponential": |
128 | | - ratio = ramp_up_end_rps / ramp_up_start_rps |
129 | | - return ramp_up_start_rps * (ratio**progress) |
130 | | - else: |
131 | | - raise ValueError(f"Unknown ramp-up strategy: {ramp_up_strategy}") |
132 | | - return request_rate |
133 | | - |
134 | | - |
async def get_request(
    input_requests: list[SampleRequest],
    request_rate: float,
    burstiness: float = 1.0,
    ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
    ramp_up_start_rps: Optional[int] = None,
    ramp_up_end_rps: Optional[int] = None,
) -> AsyncGenerator[tuple[SampleRequest, float], None]:
    """
    Yield requests one at a time, paced to a target arrival rate.

    Each item is a ``(request, rate)`` pair, where ``rate`` is the request
    rate in effect for that request. After yielding, the generator sleeps
    for a randomly sampled interval before producing the next request,
    unless the rate is infinite, in which case it does not wait.

    Args:
        input_requests:
            The requests to emit. A non-list iterable is converted to a
            list first so that its length is known.
        request_rate:
            Constant request rate (requests/s) used when no ramp-up
            applies. ``float("inf")`` emits requests back to back.
        burstiness (optional):
            Shape parameter of the gamma distribution the intervals are
            drawn from; must be positive. The scale is chosen as
            ``1 / (rate * burstiness)``, so the mean interval is
            ``1 / rate``. The default of 1 gives exponentially distributed
            intervals (a Poisson process). Values in (0, 1) make arrivals
            more bursty; values above 1 make them more uniform.
        ramp_up_strategy (optional):
            "linear" or "exponential". If None, the constant
            ``request_rate`` is used.
        ramp_up_start_rps (optional):
            Request rate at the start of the ramp-up.
        ramp_up_end_rps (optional):
            Request rate at the end of the ramp-up.
    """
    assert burstiness > 0, (
        f"A positive burstiness factor is expected, but given {burstiness}."
    )

    # The ramp-up schedule depends on the total request count, so any
    # non-list iterable is materialized up front.
    if not isinstance(input_requests, list) and isinstance(input_requests, Iterable):
        input_requests = list(input_requests)

    num_requests = len(input_requests)

    for position, sample in enumerate(input_requests):
        rate = _get_current_request_rate(
            ramp_up_strategy=ramp_up_strategy,
            ramp_up_start_rps=ramp_up_start_rps,
            ramp_up_end_rps=ramp_up_end_rps,
            request_index=position,
            total_requests=num_requests,
            request_rate=request_rate,
        )

        yield sample, rate

        # An infinite rate means "send everything immediately": no pause.
        if rate != float("inf"):
            # Gamma(shape=burstiness, scale=1/(rate*burstiness)) has mean
            # 1/rate; with burstiness == 1 it is the exponential distribution.
            scale = 1.0 / (rate * burstiness)
            delay = np.random.gamma(shape=burstiness, scale=scale)
            # Hold off the next request for the sampled interval.
            await asyncio.sleep(delay)
203 | | - |
204 | | - |
205 | 111 | def calculate_metrics( |
206 | 112 | input_requests: list[SampleRequest], |
207 | 113 | outputs: list[RequestFuncOutput], |
|
0 commit comments