Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions benchmarks/profiler/profile_sla.py
Original file line number Diff line number Diff line change
Expand Up @@ -815,10 +815,10 @@ async def run_profile(args):
"--osl", type=int, default=500, help="target output sequence length"
)
parser.add_argument(
"--ttft", type=int, default=50, help="target Time To First Token in ms"
"--ttft", type=float, default=50.0, help="target Time To First Token in ms"
)
parser.add_argument(
"--itl", type=int, default=10, help="target Inter Token Latency in ms"
"--itl", type=float, default=10.0, help="target Inter Token Latency in ms"
)

# arguments used for interpolating TTFT and ITL under different ISL/OSL
Expand Down
4 changes: 2 additions & 2 deletions components/src/dynamo/planner/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,8 @@ class SLAPlannerDefaults(BasePlannerDefaults):
profile_results_dir = "profiling_results"
isl = 3000 # in number of tokens
osl = 150 # in number of tokens
ttft = 0.5 # in seconds
itl = 0.05 # in seconds
ttft = 500.0 # in milliseconds
itl = 50.0 # in milliseconds
load_predictor = "arima" # ["constant", "arima", "prophet"]
load_prediction_window_size = 50 # predict load using how many recent load samples
no_correction = False # disable correction factor, might be useful under some conditions like long cold start time
Expand Down
22 changes: 10 additions & 12 deletions components/src/dynamo/planner/utils/perf_interpolation.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,7 @@ def __init__(
try:
with np.load(prefill_npz_fn) as raw_data:
self.prefill_isl = raw_data["prefill_isl"]
self.prefill_ttft = (
raw_data["prefill_ttft"] / 1000
) # convert ms to s
self.prefill_ttft = raw_data["prefill_ttft"] # in milliseconds
self.prefill_thpt_per_gpu = raw_data["prefill_thpt_per_gpu"]
except FileNotFoundError:
logger.error(
Expand All @@ -64,7 +62,7 @@ def __init__(

elif raw_data:
self.prefill_isl = raw_data["prefill_isl"]
self.prefill_ttft = raw_data["prefill_ttft"] / 1000 # convert ms to s
self.prefill_ttft = raw_data["prefill_ttft"] # in milliseconds
self.prefill_thpt_per_gpu = raw_data["prefill_thpt_per_gpu"]
else:
raise ValueError("Either profile_results_dir or raw_data must be provided")
Expand Down Expand Up @@ -150,7 +148,7 @@ def __init__(
method="nearest",
)
self.itl_interpolator[nan_mask] = itl_nearest[nan_mask]
self.itl_interpolator /= 1000 # convert ms to s
# ITL values are in milliseconds

self.thpt_interpolator = scipy.interpolate.griddata(
(self.x_kv_usage, self.y_context_length),
Expand Down Expand Up @@ -230,12 +228,12 @@ def find_best_throughput_per_gpu(
parser.add_argument("--profile_results_dir", type=str, required=True)
parser.add_argument("--isl", type=int, default=3000)
parser.add_argument("--osl", type=int, default=150)
parser.add_argument("--ttft", type=float, default=0.1, help="in s")
parser.add_argument("--itl", type=float, default=0.01, help="in s")
parser.add_argument("--ttft", type=float, default=100.0, help="in milliseconds")
parser.add_argument("--itl", type=float, default=10.0, help="in milliseconds")
args = parser.parse_args()

print(f"ISL={args.isl}, OSL={args.osl}")
print(f"TTFT={args.ttft}s, ITL={args.itl}s")
print(f"TTFT={args.ttft}ms, ITL={args.itl}ms")
print(f"Using profile results from {args.profile_results_dir}")
print("")

Expand All @@ -248,11 +246,11 @@ def find_best_throughput_per_gpu(

if est_ttft <= args.ttft:
print(
f"\tEstimated TTFT={est_ttft:.3f}s <= target TTFT={args.ttft:.3f}s. Requests can queue {args.ttft - est_ttft:.3f}s maximally while meeting TTFT SLA."
f"\tEstimated TTFT={est_ttft:.2f}ms <= target TTFT={args.ttft:.2f}ms. Requests can queue {args.ttft - est_ttft:.2f}ms maximally while meeting TTFT SLA."
)
else:
print(
f"\tEstimated TTFT={est_ttft:.3f}s > target TTFT={args.ttft:.3f}s. Cannot meet TTFT SLA."
f"\tEstimated TTFT={est_ttft:.2f}ms > target TTFT={args.ttft:.2f}ms. Cannot meet TTFT SLA."
)

print(
Expand All @@ -274,12 +272,12 @@ def find_best_throughput_per_gpu(
) = decode_interpolator.find_best_throughput_per_gpu(args.itl, context_length)
if est_itl <= args.itl:
print(
f"\tEstimated ITL={est_itl:.4f}s <= target ITL={args.itl:.4f}s at {est_kv_usage*100:.2f}% active kv usage."
f"\tEstimated ITL={est_itl:.2f}ms <= target ITL={args.itl:.2f}ms at {est_kv_usage*100:.2f}% active kv usage."
)
print(
f"\tEstimated throughput: {est_thpt_per_gpu:.2f} token/s/gpu. Request rate at {est_thpt_per_gpu / args.osl:.2f} requests/s will saturate one GPU."
)
else:
print(
f"\tEstimated ITL={est_itl:.4f}s > target ITL={args.itl:.4f}s. Cannot meet ITL SLA."
f"\tEstimated ITL={est_itl:.2f}ms > target ITL={args.itl:.2f}ms. Cannot meet ITL SLA."
)
7 changes: 5 additions & 2 deletions components/src/dynamo/planner/utils/planner_argparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,13 @@ def create_sla_planner_parser() -> argparse.ArgumentParser:
"--ttft",
type=float,
default=SLAPlannerDefaults.ttft,
help="Time to first token",
help="Time to first token (float, in milliseconds)",
)
parser.add_argument(
"--itl", type=float, default=SLAPlannerDefaults.itl, help="Inter-token latency"
"--itl",
type=float,
default=SLAPlannerDefaults.itl,
help="Inter-token latency (float, in milliseconds)",
)
parser.add_argument(
"--load-predictor",
Expand Down
21 changes: 14 additions & 7 deletions components/src/dynamo/planner/utils/planner_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,13 +249,20 @@ async def observe_metrics(self):
self.num_p_workers_gauge.set(len(self.p_endpoints))
self.num_d_workers_gauge.set(len(self.d_endpoints))

self.last_metrics.ttft = self.prometheus_api_client.get_avg_time_to_first_token(
f"{self.args.adjustment_interval}s",
self.model_name,
# Prometheus returns seconds, convert to milliseconds
self.last_metrics.ttft = (
self.prometheus_api_client.get_avg_time_to_first_token(
f"{self.args.adjustment_interval}s",
self.model_name,
)
* 1000
)
self.last_metrics.itl = self.prometheus_api_client.get_avg_inter_token_latency(
f"{self.args.adjustment_interval}s",
self.model_name,
self.last_metrics.itl = (
self.prometheus_api_client.get_avg_inter_token_latency(
f"{self.args.adjustment_interval}s",
self.model_name,
)
* 1000
)
self.last_metrics.num_req = self.prometheus_api_client.get_avg_request_count(
f"{self.args.adjustment_interval}s",
Expand Down Expand Up @@ -284,7 +291,7 @@ async def observe_metrics(self):
f"Observed num_req: {self.last_metrics.num_req:.2f} isl: {self.last_metrics.isl:.2f} osl: {self.last_metrics.osl:.2f}"
)
logger.info(
f"Observed ttft: {self.last_metrics.ttft:.3f}s itl: {self.last_metrics.itl:.3f}s"
f"Observed ttft: {self.last_metrics.ttft:.2f}ms itl: {self.last_metrics.itl:.2f}ms"
)

self.num_req_predictor.add_data_point(self.last_metrics.num_req)
Expand Down
8 changes: 4 additions & 4 deletions docs/benchmarks/pre_deployment_profiling.md
Original file line number Diff line number Diff line change
Expand Up @@ -119,9 +119,9 @@ spec:
- --osl
- "150" # average OSL is 150 tokens
- --ttft
- "200" # target TTFT is 200ms
- "200" # target TTFT is 200ms (float, in milliseconds)
- --itl
- "20" # target ITL is 20ms
- "20" # target ITL is 20ms (float, in milliseconds)
- --backend
- <vllm/sglang>
```
Expand Down Expand Up @@ -292,8 +292,8 @@ python3 -m benchmarks.profiler.profile_sla \
--aic-backend-version 0.20.0 \
--isl 3000 \
--osl 150 \
--ttft 0.2 \
--itl 0.02
--ttft 200 \
--itl 20  # --ttft and --itl are target values in milliseconds (float)
```

The output will be written to `./profiling_results/` and can be used directly with SLA planner deployment.
2 changes: 1 addition & 1 deletion docs/kubernetes/sla_planner_quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ kubectl logs -n $NAMESPACE deployment/vllm-disagg-planner-planner --tail=10
```
New adjustment interval started!
Observed num_req: X.XXX isl: X.XXX osl: X.XXX
Observed ttft: X.XXXs itl: X.XXXs
Observed ttft: X.XXms itl: X.XXms
Number of prefill workers: 1, number of decode workers: 1
```

Expand Down
20 changes: 10 additions & 10 deletions tests/planner/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,34 +34,34 @@ python components/planner/src/dynamo/planner/utils/perf_interpolation.py \
--profile_results_dir <path_to_profile_results> \
--isl <ISL> \
--osl <OSL> \
--ttft <TTFT(s)> \
--itl <ITL(s)>
--ttft <TTFT(ms)> \
--itl <ITL(ms)>
```

The script interpolates prefill and decode performance for the given ISL and OSL, checks the estimates against the TTFT and ITL SLAs, and reports the request rate that will saturate one GPU.

For example, to test the interpolator for `nvidia/Llama-3.1-8B-Instruct-FP8` on H200,
For example, to test the interpolator for `nvidia/Llama-3.1-8B-Instruct-FP8` on H200 (target TTFT=200ms, ITL=10ms):

```bash
python components/src/dynamo/planner/utils/perf_interpolation.py \
--profile_results_dir tests/planner/profiling_results/H200_TP1P_TP1D/ \
--isl 3000 \
--osl 300 \
--ttft 0.2 \
--itl 0.01
--ttft 200 \
--itl 10

# output:
ISL=3000, OSL=300
TTFT=0.1s, ITL=0.01s
TTFT=200ms, ITL=10ms
Using profile results from tests/planner/profiling_results/H200_TP1P_TP1D/

Interpolating prefill performance ...
Estimated TTFT=0.060s <= target TTFT=0.200s. Requests can queue 0.140s maximally while meeting TTFT SLA.
Estimated TTFT=60.00ms <= target TTFT=200.00ms. Requests can queue 140.00ms maximally while meeting TTFT SLA.
Estimated throughput: 49481.09 tokens/s/gpu. Request rate at 16.49 requests/s will saturate one GPU.

Interpolating decode performance ...
Average context length: isl + osl/2 = 3150.
Estimated ITL=0.0097s <= target ITL=0.0100s at 16.16% active kv usage.
Estimated ITL=9.70ms <= target ITL=10.00ms at 16.16% active kv usage.
Estimated throughput: 4555.68 token/s/gpu. Request rate at 15.19 requests/s will saturate one GPU.
```

Expand Down Expand Up @@ -111,8 +111,8 @@ For example, to dry run SLA planner for the previous FP8 8B on H200 using the ge

```bash
python components/planner/test/planner_sla_dryrun.py \
--ttft 0.2 \
--itl 0.01 \
--ttft 200 \
--itl 10 \
--adjustment-interval 60 \
--profile-results-dir tests/planner/profiling_results/H200_TP1P_TP1D/ \
--dataset rr-5-45_i3000o300.jsonl \
Expand Down
4 changes: 2 additions & 2 deletions tests/planner/perf_test_configs/disagg_8b_planner.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,8 @@ spec:
python3 -m planner_sla
--environment=kubernetes
--backend=vllm
--ttft 0.2
--itl 0.01
--ttft 200
--itl 10
--profile-results-dir /workspace/tests/planner/profiling_results/H200_TP1P_TP1D/
--adjustment-interval=60
--prometheus-port=9085
Expand Down
4 changes: 2 additions & 2 deletions tests/planner/scaling/disagg_planner.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ spec:
--adjustment-interval=60
--profile-results-dir=/workspace/tests/planner/profiling_results/H200_TP1P_TP1D
--prometheus-port=9085
--ttft=0.1
--itl=0.01
--ttft=100
--itl=10
--load-predictor=constant
--no-correction
VllmDecodeWorker:
Expand Down
4 changes: 2 additions & 2 deletions tests/planner/test_replica_calculation.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ def planner():
args.decode_engine_num_gpu = 1
args.min_endpoint = 1
args.max_gpu_budget = 10
args.ttft = 80 # ms
args.itl = 10 # ms
args.ttft = 80.0 # ms
args.itl = 10.0 # ms
args.backend = "vllm"
args.no_operation = True # Don't actually scale
args.no_correction = False # Allow correction factors
Expand Down
Loading