ai-dynamo · saturley-hall · Oct 17, 2025 · Oct 17, 2025
@@ -815,10 +815,10 @@ async def run_profile(args):
         "--osl", type=int, default=500, help="target output sequence length"
     )
     parser.add_argument(
-        "--ttft", type=int, default=50, help="target Time To First Token in ms"
+        "--ttft", type=float, default=50.0, help="target Time To First Token in ms"
     )
     parser.add_argument(
-        "--itl", type=int, default=10, help="target Inter Token Latency in ms"
+        "--itl", type=float, default=10.0, help="target Inter Token Latency in ms"
     )
 
     # arguments used for interpolating TTFT and ITL under different ISL/OSL

@@ -89,8 +89,8 @@ class SLAPlannerDefaults(BasePlannerDefaults):
     profile_results_dir = "profiling_results"
     isl = 3000  # in number of tokens
     osl = 150  # in number of tokens
-    ttft = 0.5  # in seconds
-    itl = 0.05  # in seconds
+    ttft = 500.0  # in milliseconds
+    itl = 50.0  # in milliseconds
     load_predictor = "arima"  # ["constant", "arima", "prophet"]
     load_prediction_window_size = 50  # predict load using how many recent load samples
     no_correction = False  # disable correction factor, might be useful under some conditions like long cold start time

@@ -51,9 +51,7 @@ def __init__(
             try:
                 with np.load(prefill_npz_fn) as raw_data:
                     self.prefill_isl = raw_data["prefill_isl"]
-                    self.prefill_ttft = (
-                        raw_data["prefill_ttft"] / 1000
-                    )  # convert ms to s
+                    self.prefill_ttft = raw_data["prefill_ttft"]  # in milliseconds
                     self.prefill_thpt_per_gpu = raw_data["prefill_thpt_per_gpu"]
             except FileNotFoundError:
                 logger.error(
@@ -64,7 +62,7 @@ def __init__(
 
         elif raw_data:
             self.prefill_isl = raw_data["prefill_isl"]
-            self.prefill_ttft = raw_data["prefill_ttft"] / 1000  # convert ms to s
+            self.prefill_ttft = raw_data["prefill_ttft"]  # in milliseconds
             self.prefill_thpt_per_gpu = raw_data["prefill_thpt_per_gpu"]
         else:
             raise ValueError("Either profile_results_dir or raw_data must be provided")
@@ -150,7 +148,7 @@ def __init__(
                 method="nearest",
             )
             self.itl_interpolator[nan_mask] = itl_nearest[nan_mask]
-        self.itl_interpolator /= 1000  # convert ms to s
+        # ITL values are in milliseconds
 
         self.thpt_interpolator = scipy.interpolate.griddata(
             (self.x_kv_usage, self.y_context_length),
@@ -230,12 +228,12 @@ def find_best_throughput_per_gpu(
     parser.add_argument("--profile_results_dir", type=str, required=True)
     parser.add_argument("--isl", type=int, default=3000)
     parser.add_argument("--osl", type=int, default=150)
-    parser.add_argument("--ttft", type=float, default=0.1, help="in s")
-    parser.add_argument("--itl", type=float, default=0.01, help="in s")
+    parser.add_argument("--ttft", type=float, default=100.0, help="in milliseconds")
+    parser.add_argument("--itl", type=float, default=10.0, help="in milliseconds")
     args = parser.parse_args()
 
     print(f"ISL={args.isl}, OSL={args.osl}")
-    print(f"TTFT={args.ttft}s, ITL={args.itl}s")
+    print(f"TTFT={args.ttft}ms, ITL={args.itl}ms")
     print(f"Using profile results from {args.profile_results_dir}")
     print("")
 
@@ -248,11 +246,11 @@ def find_best_throughput_per_gpu(
 
     if est_ttft <= args.ttft:
         print(
-            f"\tEstimated TTFT={est_ttft:.3f}s <= target TTFT={args.ttft:.3f}s. Requests can queue {args.ttft - est_ttft:.3f}s maximally while meeting TTFT SLA."
+            f"\tEstimated TTFT={est_ttft:.2f}ms <= target TTFT={args.ttft:.2f}ms. Requests can queue {args.ttft - est_ttft:.2f}ms maximally while meeting TTFT SLA."
         )
     else:
         print(
-            f"\tEstimated TTFT={est_ttft:.3f}s > target TTFT={args.ttft:.3f}s. Cannot meet TTFT SLA."
+            f"\tEstimated TTFT={est_ttft:.2f}ms > target TTFT={args.ttft:.2f}ms. Cannot meet TTFT SLA."
         )
 
     print(
@@ -274,12 +272,12 @@ def find_best_throughput_per_gpu(
     ) = decode_interpolator.find_best_throughput_per_gpu(args.itl, context_length)
     if est_itl <= args.itl:
         print(
-            f"\tEstimated ITL={est_itl:.4f}s <= target ITL={args.itl:.4f}s at {est_kv_usage*100:.2f}% active kv usage."
+            f"\tEstimated ITL={est_itl:.2f}ms <= target ITL={args.itl:.2f}ms at {est_kv_usage*100:.2f}% active kv usage."
         )
         print(
             f"\tEstimated throughput: {est_thpt_per_gpu:.2f} token/s/gpu. Request rate at {est_thpt_per_gpu / args.osl:.2f} requests/s will saturate one GPU."
         )
     else:
         print(
-            f"\tEstimated ITL={est_itl:.4f}s > target ITL={args.itl:.4f}s. Cannot meet ITL SLA."
+            f"\tEstimated ITL={est_itl:.2f}ms > target ITL={args.itl:.2f}ms. Cannot meet ITL SLA."
         )
@@ -90,10 +90,13 @@ def create_sla_planner_parser() -> argparse.ArgumentParser:
         "--ttft",
         type=float,
         default=SLAPlannerDefaults.ttft,
-        help="Time to first token",
+        help="Time to first token (float, in milliseconds)",
     )
     parser.add_argument(
-        "--itl", type=float, default=SLAPlannerDefaults.itl, help="Inter-token latency"
+        "--itl",
+        type=float,
+        default=SLAPlannerDefaults.itl,
+        help="Inter-token latency (float, in milliseconds)",
     )
     parser.add_argument(
         "--load-predictor",

@@ -249,13 +249,20 @@ async def observe_metrics(self):
             self.num_p_workers_gauge.set(len(self.p_endpoints))
             self.num_d_workers_gauge.set(len(self.d_endpoints))
 
-        self.last_metrics.ttft = self.prometheus_api_client.get_avg_time_to_first_token(
-            f"{self.args.adjustment_interval}s",
-            self.model_name,
+        # Prometheus returns seconds, convert to milliseconds
+        self.last_metrics.ttft = (
+            self.prometheus_api_client.get_avg_time_to_first_token(
+                f"{self.args.adjustment_interval}s",
+                self.model_name,
+            )
+            * 1000
         )
-        self.last_metrics.itl = self.prometheus_api_client.get_avg_inter_token_latency(
-            f"{self.args.adjustment_interval}s",
-            self.model_name,
+        self.last_metrics.itl = (
+            self.prometheus_api_client.get_avg_inter_token_latency(
+                f"{self.args.adjustment_interval}s",
+                self.model_name,
+            )
+            * 1000
         )
         self.last_metrics.num_req = self.prometheus_api_client.get_avg_request_count(
             f"{self.args.adjustment_interval}s",
@@ -284,7 +291,7 @@ async def observe_metrics(self):
             f"Observed num_req: {self.last_metrics.num_req:.2f} isl: {self.last_metrics.isl:.2f} osl: {self.last_metrics.osl:.2f}"
         )
         logger.info(
-            f"Observed ttft: {self.last_metrics.ttft:.3f}s itl: {self.last_metrics.itl:.3f}s"
+            f"Observed ttft: {self.last_metrics.ttft:.2f}ms itl: {self.last_metrics.itl:.2f}ms"
         )
 
         self.num_req_predictor.add_data_point(self.last_metrics.num_req)

diff --git a/docs/benchmarks/pre_deployment_profiling.md b/docs/benchmarks/pre_deployment_profiling.md
@@ -119,9 +119,9 @@ spec:
             - --osl
             - "150" # average OSL is 150 tokens
             - --ttft
-            - "200" # target TTFT is 200ms
+            - "200" # target TTFT is 200ms (float, in milliseconds)
             - --itl
-            - "20" # target ITL is 20ms
+            - "20" # target ITL is 20ms (float, in milliseconds)
             - --backend
             - <vllm/sglang>
 ```
@@ -292,8 +292,8 @@ python3 -m benchmarks.profiler.profile_sla \
    --aic-backend-version 0.20.0 \
    --isl 3000 \
    --osl 150 \
-   --ttft 0.2 \
-   --itl 0.02
+   --ttft 200 \
+   --itl 20 # target TTFT and ITL, both in milliseconds (float)
 ```
 
 The output will be written to `./profiling_results/` and can be used directly with SLA planner deployment.
diff --git a/docs/kubernetes/sla_planner_quickstart.md b/docs/kubernetes/sla_planner_quickstart.md
@@ -203,7 +203,7 @@ kubectl logs -n $NAMESPACE deployment/vllm-disagg-planner-planner --tail=10
 ```
 New adjustment interval started!
 Observed num_req: X.XXX isl: X.XXX osl: X.XXX
-Observed ttft: X.XXXs itl: X.XXXs
+Observed ttft: X.XXms itl: X.XXms
 Number of prefill workers: 1, number of decode workers: 1
 ```
 

@@ -34,34 +34,34 @@ python components/planner/src/dynamo/planner/utils/perf_interpolation.py \
   --profile_results_dir <path_to_profile_results> \
   --isl <ISL> \
   --osl <OSL> \
-  --ttft <TTFT(s)> \
-  --itl <ITL(s)>
+  --ttft <TTFT(ms)> \
+  --itl <ITL(ms)>
 ```
 
 The script will perform the interpolation based on ISL, OSL, and TTFT and ITL SLAs and advise the load that can saturate the engine.
 
-For example, to test the interpolator for `nvidia/Llama-3.1-8B-Instruct-FP8` on H200,
+For example, to test the interpolator for `nvidia/Llama-3.1-8B-Instruct-FP8` on H200 (target TTFT=200ms, ITL=10ms):
 
 ```bash
 python components/planner/src/dynamo/planner/utils/perf_interpolation.py \
   --profile_results_dir tests/planner/profiling_results/H200_TP1P_TP1D/ \
   --isl 3000 \
   --osl 300 \
-  --ttft 0.2 \
-  --itl 0.01
+  --ttft 200 \
+  --itl 10
 
 # output:
 ISL=3000, OSL=300
-TTFT=0.1s, ITL=0.01s
+TTFT=200.0ms, ITL=10.0ms
 Using profile results from tests/planner/profiling_results/H200_TP1P_TP1D/
 
 Interpolating prefill performance ...
-        Estimated TTFT=0.060s <= target TTFT=0.200s. Requests can queue 0.140s maximally while meeting TTFT SLA.
+        Estimated TTFT=60.00ms <= target TTFT=200.00ms. Requests can queue 140.00ms maximally while meeting TTFT SLA.
         Estimated throughput: 49481.09 tokens/s/gpu. Request rate at 16.49 requests/s will saturate one GPU.
 
 Interpolating decode performance ...
         Average context length: isl + osl/2 = 3150.
-        Estimated ITL=0.0097s <= target ITL=0.0100s at 16.16% active kv usage.
+        Estimated ITL=9.70ms <= target ITL=10.00ms at 16.16% active kv usage.
         Estimated throughput: 4555.68 token/s/gpu. Request rate at 15.19 requests/s will saturate one GPU.
 ```
 
@@ -111,8 +111,8 @@ For example, to dry run SLA planner for the previous FP8 8B on H200 using the ge
 
 ```bash
 python components/planner/test/planner_sla_dryrun.py \
-    --ttft 0.2 \
-    --itl 0.01 \
+    --ttft 200 \
+    --itl 10 \
     --adjustment-interval 60 \
     --profile-results-dir tests/planner/profiling_results/H200_TP1P_TP1D/ \
     --dataset rr-5-45_i3000o300.jsonl \

@@ -87,8 +87,8 @@ spec:
               python3 -m planner_sla
               --environment=kubernetes
               --backend=vllm
-              --ttft 0.2
-              --itl 0.01
+              --ttft 200
+              --itl 10
               --profile-results-dir /workspace/tests/planner/profiling_results/H200_TP1P_TP1D/
               --adjustment-interval=60
               --prometheus-port=9085

@@ -57,8 +57,8 @@ spec:
               --adjustment-interval=60
               --profile-results-dir=/workspace/tests/planner/profiling_results/H200_TP1P_TP1D
               --prometheus-port=9085
-              --ttft=0.1
-              --itl=0.01
+              --ttft=100
+              --itl=10
               --load-predictor=constant
               --no-correction
     VllmDecodeWorker:

@@ -49,8 +49,8 @@ def planner():
     args.decode_engine_num_gpu = 1
     args.min_endpoint = 1
     args.max_gpu_budget = 10
-    args.ttft = 80  # ms
-    args.itl = 10  # ms
+    args.ttft = 80.0  # ms
+    args.itl = 10.0  # ms
     args.backend = "vllm"
     args.no_operation = True  # Don't actually scale
     args.no_correction = False  # Allow correction factors