From 2d9001ac1a1496bf9a42b62a5885d86010001920 Mon Sep 17 00:00:00 2001 From: William Arnold <7565007+Aphoh@users.noreply.github.com> Date: Fri, 17 Oct 2025 11:33:41 -0700 Subject: [PATCH] fix: standardize all planner ttft/itl units to float ms and fix docs (#3673) Signed-off-by: William Arnold <7565007+Aphoh@users.noreply.github.com> --- benchmarks/profiler/profile_sla.py | 4 ++-- components/src/dynamo/planner/defaults.py | 4 ++-- .../planner/utils/perf_interpolation.py | 22 +++++++++---------- .../dynamo/planner/utils/planner_argparse.py | 7 ++++-- .../src/dynamo/planner/utils/planner_core.py | 21 ++++++++++++------ docs/benchmarks/pre_deployment_profiling.md | 8 +++---- docs/kubernetes/sla_planner_quickstart.md | 2 +- tests/planner/README.md | 20 ++++++++--------- .../perf_test_configs/disagg_8b_planner.yaml | 4 ++-- tests/planner/scaling/disagg_planner.yaml | 4 ++-- tests/planner/test_replica_calculation.py | 4 ++-- 11 files changed, 54 insertions(+), 46 deletions(-) diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index 757c073b2851..e9a3cfa40ba3 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -815,10 +815,10 @@ async def run_profile(args): "--osl", type=int, default=500, help="target output sequence length" ) parser.add_argument( - "--ttft", type=int, default=50, help="target Time To First Token in ms" + "--ttft", type=float, default=50.0, help="target Time To First Token in ms" ) parser.add_argument( - "--itl", type=int, default=10, help="target Inter Token Latency in ms" + "--itl", type=float, default=10.0, help="target Inter Token Latency in ms" ) # arguments used for interpolating TTFT and ITL under different ISL/OSL diff --git a/components/src/dynamo/planner/defaults.py b/components/src/dynamo/planner/defaults.py index 56c63fbbd079..e66337533f1f 100644 --- a/components/src/dynamo/planner/defaults.py +++ b/components/src/dynamo/planner/defaults.py @@ -89,8 +89,8 @@ class 
SLAPlannerDefaults(BasePlannerDefaults): profile_results_dir = "profiling_results" isl = 3000 # in number of tokens osl = 150 # in number of tokens - ttft = 0.5 # in seconds - itl = 0.05 # in seconds + ttft = 500.0 # in milliseconds + itl = 50.0 # in milliseconds load_predictor = "arima" # ["constant", "arima", "prophet"] load_prediction_window_size = 50 # predict load using how many recent load samples no_correction = False # disable correction factor, might be useful under some conditions like long cold start time diff --git a/components/src/dynamo/planner/utils/perf_interpolation.py b/components/src/dynamo/planner/utils/perf_interpolation.py index 8c5408214764..eccc079f2b72 100644 --- a/components/src/dynamo/planner/utils/perf_interpolation.py +++ b/components/src/dynamo/planner/utils/perf_interpolation.py @@ -51,9 +51,7 @@ def __init__( try: with np.load(prefill_npz_fn) as raw_data: self.prefill_isl = raw_data["prefill_isl"] - self.prefill_ttft = ( - raw_data["prefill_ttft"] / 1000 - ) # convert ms to s + self.prefill_ttft = raw_data["prefill_ttft"] # in milliseconds self.prefill_thpt_per_gpu = raw_data["prefill_thpt_per_gpu"] except FileNotFoundError: logger.error( @@ -64,7 +62,7 @@ def __init__( elif raw_data: self.prefill_isl = raw_data["prefill_isl"] - self.prefill_ttft = raw_data["prefill_ttft"] / 1000 # convert ms to s + self.prefill_ttft = raw_data["prefill_ttft"] # in milliseconds self.prefill_thpt_per_gpu = raw_data["prefill_thpt_per_gpu"] else: raise ValueError("Either profile_results_dir or raw_data must be provided") @@ -150,7 +148,7 @@ def __init__( method="nearest", ) self.itl_interpolator[nan_mask] = itl_nearest[nan_mask] - self.itl_interpolator /= 1000 # convert ms to s + # ITL values are in milliseconds self.thpt_interpolator = scipy.interpolate.griddata( (self.x_kv_usage, self.y_context_length), @@ -230,12 +228,12 @@ def find_best_throughput_per_gpu( parser.add_argument("--profile_results_dir", type=str, required=True) 
parser.add_argument("--isl", type=int, default=3000) parser.add_argument("--osl", type=int, default=150) - parser.add_argument("--ttft", type=float, default=0.1, help="in s") - parser.add_argument("--itl", type=float, default=0.01, help="in s") + parser.add_argument("--ttft", type=float, default=100.0, help="in milliseconds") + parser.add_argument("--itl", type=float, default=10.0, help="in milliseconds") args = parser.parse_args() print(f"ISL={args.isl}, OSL={args.osl}") - print(f"TTFT={args.ttft}s, ITL={args.itl}s") + print(f"TTFT={args.ttft}ms, ITL={args.itl}ms") print(f"Using profile results from {args.profile_results_dir}") print("") @@ -248,11 +246,11 @@ def find_best_throughput_per_gpu( if est_ttft <= args.ttft: print( - f"\tEstimated TTFT={est_ttft:.3f}s <= target TTFT={args.ttft:.3f}s. Requests can queue {args.ttft - est_ttft:.3f}s maximally while meeting TTFT SLA." + f"\tEstimated TTFT={est_ttft:.2f}ms <= target TTFT={args.ttft:.2f}ms. Requests can queue {args.ttft - est_ttft:.2f}ms maximally while meeting TTFT SLA." ) else: print( - f"\tEstimated TTFT={est_ttft:.3f}s > target TTFT={args.ttft:.3f}s. Cannot meet TTFT SLA." + f"\tEstimated TTFT={est_ttft:.2f}ms > target TTFT={args.ttft:.2f}ms. Cannot meet TTFT SLA." ) print( @@ -274,12 +272,12 @@ def find_best_throughput_per_gpu( ) = decode_interpolator.find_best_throughput_per_gpu(args.itl, context_length) if est_itl <= args.itl: print( - f"\tEstimated ITL={est_itl:.4f}s <= target ITL={args.itl:.4f}s at {est_kv_usage*100:.2f}% active kv usage." + f"\tEstimated ITL={est_itl:.2f}ms <= target ITL={args.itl:.2f}ms at {est_kv_usage*100:.2f}% active kv usage." ) print( f"\tEstimated throughput: {est_thpt_per_gpu:.2f} token/s/gpu. Request rate at {est_thpt_per_gpu / args.osl:.2f} requests/s will saturate one GPU." ) else: print( - f"\tEstimated ITL={est_itl:.4f}s > target ITL={args.itl:.4f}s. Cannot meet ITL SLA." + f"\tEstimated ITL={est_itl:.2f}ms > target ITL={args.itl:.2f}ms. Cannot meet ITL SLA." 
) diff --git a/components/src/dynamo/planner/utils/planner_argparse.py b/components/src/dynamo/planner/utils/planner_argparse.py index 9832742dc990..4ecda6f07ab4 100644 --- a/components/src/dynamo/planner/utils/planner_argparse.py +++ b/components/src/dynamo/planner/utils/planner_argparse.py @@ -90,10 +90,13 @@ def create_sla_planner_parser() -> argparse.ArgumentParser: "--ttft", type=float, default=SLAPlannerDefaults.ttft, - help="Time to first token", + help="Time to first token (float, in milliseconds)", ) parser.add_argument( - "--itl", type=float, default=SLAPlannerDefaults.itl, help="Inter-token latency" + "--itl", + type=float, + default=SLAPlannerDefaults.itl, + help="Inter-token latency (float, in milliseconds)", ) parser.add_argument( "--load-predictor", diff --git a/components/src/dynamo/planner/utils/planner_core.py b/components/src/dynamo/planner/utils/planner_core.py index 00521b01df07..a87251ba8389 100644 --- a/components/src/dynamo/planner/utils/planner_core.py +++ b/components/src/dynamo/planner/utils/planner_core.py @@ -249,13 +249,20 @@ async def observe_metrics(self): self.num_p_workers_gauge.set(len(self.p_endpoints)) self.num_d_workers_gauge.set(len(self.d_endpoints)) - self.last_metrics.ttft = self.prometheus_api_client.get_avg_time_to_first_token( - f"{self.args.adjustment_interval}s", - self.model_name, + # Prometheus returns seconds, convert to milliseconds + self.last_metrics.ttft = ( + self.prometheus_api_client.get_avg_time_to_first_token( + f"{self.args.adjustment_interval}s", + self.model_name, + ) + * 1000 ) - self.last_metrics.itl = self.prometheus_api_client.get_avg_inter_token_latency( - f"{self.args.adjustment_interval}s", - self.model_name, + self.last_metrics.itl = ( + self.prometheus_api_client.get_avg_inter_token_latency( + f"{self.args.adjustment_interval}s", + self.model_name, + ) + * 1000 ) self.last_metrics.num_req = self.prometheus_api_client.get_avg_request_count( f"{self.args.adjustment_interval}s", @@ -284,7 +291,7 @@ 
async def observe_metrics(self): f"Observed num_req: {self.last_metrics.num_req:.2f} isl: {self.last_metrics.isl:.2f} osl: {self.last_metrics.osl:.2f}" ) logger.info( - f"Observed ttft: {self.last_metrics.ttft:.3f}s itl: {self.last_metrics.itl:.3f}s" + f"Observed ttft: {self.last_metrics.ttft:.2f}ms itl: {self.last_metrics.itl:.2f}ms" ) self.num_req_predictor.add_data_point(self.last_metrics.num_req) diff --git a/docs/benchmarks/pre_deployment_profiling.md b/docs/benchmarks/pre_deployment_profiling.md index 6160fbb30df2..74ca4df2b39c 100644 --- a/docs/benchmarks/pre_deployment_profiling.md +++ b/docs/benchmarks/pre_deployment_profiling.md @@ -119,9 +119,9 @@ spec: - --osl - "150" # average OSL is 150 tokens - --ttft - - "200" # target TTFT is 200ms + - "200" # target TTFT is 200ms (float, in milliseconds) - --itl - - "20" # target ITL is 20ms + - "20" # target ITL is 20ms (float, in milliseconds) - --backend - ``` @@ -292,8 +292,8 @@ python3 -m benchmarks.profiler.profile_sla \ --aic-backend-version 0.20.0 \ --isl 3000 \ --osl 150 \ - --ttft 0.2 \ - --itl 0.02 + --ttft 200 \ + --itl 20 # target TTFT and target ITL, both in milliseconds (float) ``` The output will be written to `./profiling_results/` and can be used directly with SLA planner deployment. diff --git a/docs/kubernetes/sla_planner_quickstart.md b/docs/kubernetes/sla_planner_quickstart.md index 8e65be24fa12..b5405523080d 100644 --- a/docs/kubernetes/sla_planner_quickstart.md +++ b/docs/kubernetes/sla_planner_quickstart.md @@ -203,7 +203,7 @@ kubectl logs -n $NAMESPACE deployment/vllm-disagg-planner-planner --tail=10 ``` New adjustment interval started!
Observed num_req: X.XXX isl: X.XXX osl: X.XXX -Observed ttft: X.XXXs itl: X.XXXs +Observed ttft: X.XXms itl: X.XXms Number of prefill workers: 1, number of decode workers: 1 ``` diff --git a/tests/planner/README.md b/tests/planner/README.md index 14a7112f715b..f8719cc928d1 100644 --- a/tests/planner/README.md +++ b/tests/planner/README.md @@ -34,34 +34,34 @@ python components/planner/src/dynamo/planner/utils/perf_interpolation.py \ --profile_results_dir \ --isl \ --osl \ - --ttft \ - --itl + --ttft \ + --itl ``` The script will perform the interpolation based on ISL, OSL, and TTFT and ITL SLAs and advise the load that can saturate the engine. -For example, to test the interpolator for `nvidia/Llama-3.1-8B-Instruct-FP8` on H200, +For example, to test the interpolator for `nvidia/Llama-3.1-8B-Instruct-FP8` on H200 (target TTFT=200ms, ITL=10ms): ```bash python components/planner/src/dynamo/planner/utils/perf_interpolation.py \ --profile_results_dir tests/planner/profiling_results/H200_TP1P_TP1D/ \ --isl 3000 \ --osl 300 \ - --ttft 0.2 \ - --itl 0.01 + --ttft 200 \ + --itl 10 # output: ISL=3000, OSL=300 -TTFT=0.1s, ITL=0.01s +TTFT=200ms, ITL=10ms Using profile results from tests/planner/profiling_results/H200_TP1P_TP1D/ Interpolating prefill performance ... - Estimated TTFT=0.060s <= target TTFT=0.200s. Requests can queue 0.140s maximally while meeting TTFT SLA. + Estimated TTFT=60.00ms <= target TTFT=200.00ms. Requests can queue 140.00ms maximally while meeting TTFT SLA. Estimated throughput: 49481.09 tokens/s/gpu. Request rate at 16.49 requests/s will saturate one GPU. Interpolating decode performance ... Average context length: isl + osl/2 = 3150. - Estimated ITL=0.0097s <= target ITL=0.0100s at 16.16% active kv usage. + Estimated ITL=9.70ms <= target ITL=10.00ms at 16.16% active kv usage. Estimated throughput: 4555.68 token/s/gpu. Request rate at 15.19 requests/s will saturate one GPU. 
``` @@ -111,8 +111,8 @@ For example, to dry run SLA planner for the previous FP8 8B on H200 using the ge ```bash python components/planner/test/planner_sla_dryrun.py \ - --ttft 0.2 \ - --itl 0.01 \ + --ttft 200 \ + --itl 10 \ --adjustment-interval 60 \ --profile-results-dir tests/planner/profiling_results/H200_TP1P_TP1D/ \ --dataset rr-5-45_i3000o300.jsonl \ diff --git a/tests/planner/perf_test_configs/disagg_8b_planner.yaml b/tests/planner/perf_test_configs/disagg_8b_planner.yaml index ddb052becc25..eb6dcc2e8ba8 100644 --- a/tests/planner/perf_test_configs/disagg_8b_planner.yaml +++ b/tests/planner/perf_test_configs/disagg_8b_planner.yaml @@ -87,8 +87,8 @@ spec: python3 -m planner_sla --environment=kubernetes --backend=vllm - --ttft 0.2 - --itl 0.01 + --ttft 200 + --itl 10 --profile-results-dir /workspace/tests/planner/profiling_results/H200_TP1P_TP1D/ --adjustment-interval=60 --prometheus-port=9085 diff --git a/tests/planner/scaling/disagg_planner.yaml b/tests/planner/scaling/disagg_planner.yaml index 53011ffe1d9a..354858d1877c 100644 --- a/tests/planner/scaling/disagg_planner.yaml +++ b/tests/planner/scaling/disagg_planner.yaml @@ -57,8 +57,8 @@ spec: --adjustment-interval=60 --profile-results-dir=/workspace/tests/planner/profiling_results/H200_TP1P_TP1D --prometheus-port=9085 - --ttft=0.1 - --itl=0.01 + --ttft=100 + --itl=10 --load-predictor=constant --no-correction VllmDecodeWorker: diff --git a/tests/planner/test_replica_calculation.py b/tests/planner/test_replica_calculation.py index b9effd2cb4fb..1f15f74d4e0f 100644 --- a/tests/planner/test_replica_calculation.py +++ b/tests/planner/test_replica_calculation.py @@ -49,8 +49,8 @@ def planner(): args.decode_engine_num_gpu = 1 args.min_endpoint = 1 args.max_gpu_budget = 10 - args.ttft = 80 # ms - args.itl = 10 # ms + args.ttft = 80.0 # ms + args.itl = 10.0 # ms args.backend = "vllm" args.no_operation = True # Don't actually scale args.no_correction = False # Allow correction factors