ai-dynamo · tedzhouhk · Jun 14, 2025 · Jun 6, 2025 · Jun 6, 2025 · Jun 9, 2025
@@ -949,6 +949,7 @@ def benchmark_decode(isl, osl, num_request, genai_perf_artifact_dir, model_name,
             y_context_length=np.array(y_context_length),
             z_itl=np.array(z_itl),
             z_thpt_per_gpu=np.array(z_thpt_per_gpu),
+            max_kv_tokens=np.array([max_kv_tokens]),
         )
         logger.info(f"Saved data points to {save_path}")
 

@@ -18,12 +18,13 @@
     "LocalConnector",
     "PlannerConnector",
     "KubernetesConnector",
-    "PlannerDefaults",
+    "LoadPlannerDefaults",
+    "SLAPlannerDefaults",
 ]
 
 # Import the classes
 from dynamo.planner.circusd import CircusController
-from dynamo.planner.defaults import PlannerDefaults
+from dynamo.planner.defaults import LoadPlannerDefaults, SLAPlannerDefaults
 from dynamo.planner.kubernetes_connector import KubernetesConnector
 from dynamo.planner.local_connector import LocalConnector
 from dynamo.planner.planner_connector import PlannerConnector
@@ -163,6 +163,7 @@ async def remove_watcher(
         waiting: bool = True,
         max_retries: int = 3,
         retry_delay: float = 2.0,
+        blocking: bool = True,
         timeout: int = 600,  # 10 minutes
     ) -> bool:
         """
@@ -185,8 +186,11 @@ async def remove_watcher(
             response = self.client.send_message("signal", name=name, signum="SIGTERM")
             if response.get("status") != "ok":
                 logger.warning(f"Failed to send SIGTERM to {name}: {response}")
+            if not blocking:
+                return True
         except Exception as e:
             logger.warning(f"Error sending SIGTERM to {name}: {e}")
+            return False
 
         # Now wait for the process to exit gracefully
         exited = await self._wait_for_process_graceful_exit(name, timeout)

@@ -15,18 +15,32 @@
 
 
 # Source of truth for planner defaults
-class PlannerDefaults:
+class BasePlannerDefaults:
     namespace = "dynamo"
     environment = "local"
     no_operation = False
     log_dir = None
-    adjustment_interval = 10
-    metric_pulling_interval = 1
+    adjustment_interval = 180  # in seconds
     max_gpu_budget = 8
-    min_endpoint = 1
+    min_endpoint = 1  # applies to both decode and prefill
+    decode_engine_num_gpu = 1
+    prefill_engine_num_gpu = 1
+
+
+class LoadPlannerDefaults(BasePlannerDefaults):
+    metric_pulling_interval = 10  # in seconds
     decode_kv_scale_up_threshold = 0.9
     decode_kv_scale_down_threshold = 0.5
     prefill_queue_scale_up_threshold = 5.0
     prefill_queue_scale_down_threshold = 0.2
-    decode_engine_num_gpu = 1
-    prefill_engine_num_gpu = 1
+
+
+class SLAPlannerDefaults(BasePlannerDefaults):
+    prometheus_endpoint = "http://localhost:9090"
+    profile_results_dir = "profiling_results"
+    isl = 3000  # in number of tokens
+    osl = 150  # in number of tokens
+    ttft = 0.5  # in seconds
+    itl = 0.05  # in seconds
+    load_predictor = "arima"  # ["constant", "arima", "prophet"]
+    load_prediction_window_size = 50  # predict load using how many recent load samples
@@ -231,10 +231,13 @@ async def remove_component(
         target_watcher = matching_components[highest_suffix]
         logger.info(f"Removing watcher {target_watcher}")
 
-        success = await self.circus.remove_watcher(name=target_watcher)
-        logger.info(
-            f"Circus remove_watcher for {target_watcher} {'succeeded' if success else 'failed'}"
+        success = await self.circus.remove_watcher(
+            name=target_watcher, blocking=blocking
         )
+        if not blocking:
+            logger.info(
+                f"Circus remove_watcher for {target_watcher} {'succeeded' if success else 'failed'}"
+            )
 
         if success:
             if highest_suffix > 0:  # Numbered watcher - remove entire entry

@@ -0,0 +1,115 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import asyncio
+import logging
+
+from pydantic import BaseModel
+
+from dynamo.planner.defaults import SLAPlannerDefaults
+from dynamo.planner.utils.planner_core import start_sla_planner
+from dynamo.runtime.logging import configure_dynamo_logging
+from dynamo.sdk import async_on_start, dynamo_context, endpoint, service
+from dynamo.sdk.core.protocol.interface import ComponentType
+from dynamo.sdk.lib.config import ServiceConfig
+from dynamo.sdk.lib.image import DYNAMO_IMAGE
+
+logger = logging.getLogger(__name__)
+
+# start planner 30 seconds after the other components to make sure planner can see them
+# TODO: remove this delay
+INIT_PLANNER_START_DELAY = 30
+
+
+class RequestType(BaseModel):
+    text: str
+
+
+@service(
+    dynamo={
+        "namespace": "dynamo",
+        "component_type": ComponentType.PLANNER,
+    },
+    resources={"cpu": "10", "memory": "20Gi"},
+    workers=1,
+    image=DYNAMO_IMAGE,
+)
+class Planner:
+    def __init__(self):
+        configure_dynamo_logging(service_name="Planner")
+        logger.info("Starting planner")
+        self.runtime = dynamo_context["runtime"]
+
+        config = ServiceConfig.get_instance()
+
+        # Get namespace directly from dynamo_context as it contains the active namespace
+        self.namespace = dynamo_context["namespace"]
+        config_instance = config.get("Planner", {})
+
+        self.args = argparse.Namespace(
+            namespace=self.namespace,
+            environment=config_instance.get(
+                "environment", SLAPlannerDefaults.environment
+            ),
+            no_operation=config_instance.get(
+                "no-operation", SLAPlannerDefaults.no_operation
+            ),
+            log_dir=config_instance.get("log-dir", SLAPlannerDefaults.log_dir),
+            adjustment_interval=config_instance.get(
+                "adjustment-interval", SLAPlannerDefaults.adjustment_interval
+            ),
+            max_gpu_budget=config_instance.get(
+                "max-gpu-budget", SLAPlannerDefaults.max_gpu_budget
+            ),
+            min_endpoint=config_instance.get(
+                "min-endpoint", SLAPlannerDefaults.min_endpoint
+            ),
+            decode_engine_num_gpu=config_instance.get(
+                "decode-engine-num-gpu", SLAPlannerDefaults.decode_engine_num_gpu
+            ),
+            prefill_engine_num_gpu=config_instance.get(
+                "prefill-engine-num-gpu", SLAPlannerDefaults.prefill_engine_num_gpu
+            ),
+            prometheus_endpoint=config_instance.get(
+                "prometheus-endpoint", SLAPlannerDefaults.prometheus_endpoint
+            ),
+            profile_results_dir=config_instance.get(
+                "profile-results-dir", SLAPlannerDefaults.profile_results_dir
+            ),
+            isl=config_instance.get("isl", SLAPlannerDefaults.isl),
+            osl=config_instance.get("osl", SLAPlannerDefaults.osl),
+            ttft=config_instance.get("ttft", SLAPlannerDefaults.ttft),
+            itl=config_instance.get("itl", SLAPlannerDefaults.itl),
+            load_predictor=config_instance.get(
+                "load-predictor", SLAPlannerDefaults.load_predictor
+            ),
+            load_prediction_window_size=config_instance.get(
+                "load-prediction-window-size",
+                SLAPlannerDefaults.load_prediction_window_size,
+            ),
+        )
+
+    @async_on_start
+    async def async_init(self):
+        await asyncio.sleep(INIT_PLANNER_START_DELAY)
+        logger.info("Calling start_planner")
+        await start_sla_planner(self.runtime, self.args)
+        logger.info("Planner started")
+
+    @endpoint()
+    async def generate(self, request: RequestType):
+        """Dummy endpoint to satisfy that each component has an endpoint"""
+        yield "mock endpoint"
@@ -0,0 +1,67 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import subprocess
+import tempfile
+
+import yaml
+
+from dynamo.sdk import service
+from dynamo.sdk.lib.config import ServiceConfig
+from dynamo.sdk.lib.image import DYNAMO_IMAGE
+
+logger = logging.getLogger(__name__)
+
+
+@service(
+    dynamo={
+        "namespace": "dynamo",
+    },
+    workers=1,
+    image=DYNAMO_IMAGE,
+)
+class Prometheus:
+    def __init__(self):
+        """Initialize Frontend service with HTTP server and model configuration."""
+        self.config = ServiceConfig.get_parsed_config("Prometheus")
+        self.process = None
+
+        logger.warning(f"Prometheus config: {self.config}")
+
+        self.start_prometheus_server()
+
+    def start_prometheus_server(self):
+        logger.info("Starting prometheus server...")
+
+        self.temp_file = tempfile.NamedTemporaryFile(
+            mode="w", suffix=".yml", delete=False
+        )
+        yaml.dump(self.config, self.temp_file)
+        self.temp_file.close()
+        config_path = self.temp_file.name
+
+        cmd = [
+            "prometheus",
+            f"--config.file={config_path}",
+        ]
+
+        logger.info(f"Prometheus cmd: {cmd}")
+
+        self.process = subprocess.Popen(
+            cmd,
+            stdout=None,
+            stderr=None,
+        )
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.