ai-dynamo · ishandhanani · Oct 2, 2025 · Oct 3, 2025 · Oct 3, 2025 · Oct 3, 2025
diff --git a/components/backends/sglang/README.md b/components/backends/sglang/README.md
@@ -117,9 +117,7 @@ uv pip install maturin
 cd $DYNAMO_HOME/lib/bindings/python
 maturin develop --uv
 cd $DYNAMO_HOME
-# installs sglang supported version along with dynamo
-# include the prerelease flag to install flashinfer rc versions
-uv pip install --prerelease=allow -e .[sglang]
+uv pip install -e .[sglang]
 ```
 
 </details>

@@ -27,6 +27,7 @@
     MultimodalPrefillWorkerHandler,
     MultimodalProcessorHandler,
     MultimodalWorkerHandler,
+    NativeApiHandler,
     PrefillWorkerHandler,
 )
 
@@ -73,9 +74,10 @@ async def init(runtime: DistributedRuntime, config: Config):
 
     generate_endpoint = component.endpoint(dynamo_args.endpoint)
 
-    # TODO: think about implementing DisaggregationStrategy for P->D
-    # TODO: implement a `next` field in the config to dynamically set the next client
+    publisher, metrics_task, metrics_labels = await setup_sgl_metrics(engine, component)
+
     prefill_client = None
+    native_api_tasks = []
     if config.serving_mode == DisaggregationMode.DECODE:
         logging.info("Initializing prefill client")
         prefill_client = (
@@ -84,8 +86,9 @@ async def init(runtime: DistributedRuntime, config: Config):
             .endpoint("generate")
             .client()
         )
-
-    publisher, metrics_task, metrics_labels = await setup_sgl_metrics(engine, component)
+    else:
+        native_api_handler = NativeApiHandler(component, engine, metrics_labels)
+        native_api_tasks = await native_api_handler.init_native_apis()
 
     kv_publisher = None
     if server_args.kv_events_config:
@@ -129,7 +132,6 @@ async def register_model():
     health_check_payload = SglangHealthCheckPayload(engine).to_dict()
 
     try:
-        # Start endpoint immediately and register model concurrently
         # Requests queue until ready_event is set
         await asyncio.gather(
             generate_endpoint.serve_endpoint(
@@ -139,6 +141,7 @@ async def register_model():
                 health_check_payload=health_check_payload,
             ),
             register_model(),
+            *native_api_tasks,
         )
     except Exception as e:
         logging.error(f"Failed to serve endpoints: {e}")

@@ -13,6 +13,7 @@
     MultimodalPrefillWorkerHandler,
     MultimodalWorkerHandler,
 )
+from .native_api_handler import NativeApiHandler
 from .prefill_handler import PrefillWorkerHandler
 
 __all__ = [
@@ -23,4 +24,5 @@
     "MultimodalEncodeWorkerHandler",
     "MultimodalWorkerHandler",
     "MultimodalPrefillWorkerHandler",
+    "NativeApiHandler",
 ]
@@ -0,0 +1,118 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# SGLang Native APIs: https://docs.sglang.ai/basic_usage/native_api.html
+# Code: https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/entrypoints/http_server.py
+
+import asyncio
+import logging
+from typing import List, Optional, Tuple
+
+import sglang as sgl
+from sglang.srt.managers.io_struct import ProfileReqInput
+
+from dynamo._core import Component
+
+
+class NativeApiHandler:
+    """Handler to add sglang native API endpoints to workers"""
+
+    def __init__(
+        self,
+        component: Component,
+        engine: sgl.Engine,
+        metrics_labels: Optional[List[Tuple[str, str]]] = None,
+    ):
+        self.component = component
+        self.engine = engine
+        self.metrics_labels = metrics_labels
+        self.native_api_tasks = []
+
+    async def init_native_apis(
+        self,
+    ) -> List[asyncio.Task]:
+        """
+        Initialize and register native API endpoints.
+        Returns list of tasks to be gathered.
+        """
+        logging.info("Initializing native SGLang API endpoints")
+
+        self.tm = self.engine.tokenizer_manager
+
+        tasks = []
+
+        model_info_ep = self.component.endpoint("get_model_info")
+        start_profile_ep = self.component.endpoint("start_profile")
+        stop_profile_ep = self.component.endpoint("stop_profile")
+        tasks.extend(
+            [
+                model_info_ep.serve_endpoint(
+                    self.get_model_info,
+                    graceful_shutdown=True,
+                    metrics_labels=self.metrics_labels,
+                    http_endpoint_path="/get_model_info",
+                ),
+                start_profile_ep.serve_endpoint(
+                    self.start_profile,
+                    graceful_shutdown=True,
+                    metrics_labels=self.metrics_labels,
+                    http_endpoint_path="/start_profile",
+                ),
+                stop_profile_ep.serve_endpoint(
+                    self.stop_profile,
+                    graceful_shutdown=True,
+                    metrics_labels=self.metrics_labels,
+                    http_endpoint_path="/stop_profile",
+                ),
+            ]
+        )
+
+        self.native_api_tasks = tasks
+        logging.info(f"Registered {len(tasks)} native API endpoints")
+        return tasks
+
+    async def get_model_info(self, request: dict):
+        result = {
+            "model_path": self.tm.server_args.model_path,
+            "tokenizer_path": self.tm.server_args.tokenizer_path,
+            "preferred_sampling_params": self.tm.server_args.preferred_sampling_params,
+            "weight_version": self.tm.server_args.weight_version,
+        }
+
+        yield {"data": [result]}
+
+    async def start_profile(self, request: dict):
+        try:
+            obj = ProfileReqInput.model_validate(request)
+        except Exception:
+            obj = None
+
+        if obj is None:
+            obj = ProfileReqInput()
+
+        output_dir = obj.output_dir or f"profile_{self.tm.server_args.model_path}"
+
+        await self.tm.start_profile(
+            output_dir=output_dir,
+            start_step=obj.start_step,
+            num_steps=obj.num_steps,
+            activities=obj.activities,
+            with_stack=obj.with_stack,
+            record_shapes=obj.record_shapes,
+            profile_by_stage=obj.profile_by_stage,
+        )
+
+        yield {"data": [{"status": "started profile"}]}
+
+    async def stop_profile(self, request: dict):
+        asyncio.create_task(self.tm.stop_profile())
+        yield {
+            "data": [
+                {
+                    "status": (
+                        "Stopped profile. This might take a long time to complete. "
+                        f"Results should be available in the 'profile_{self.tm.server_args.model_path}' directory."
+                    )
+                }
+            ]
+        }
@@ -643,14 +643,15 @@ impl Component {
 
 #[pymethods]
 impl Endpoint {
-    #[pyo3(signature = (generator, graceful_shutdown = true, metrics_labels = None, health_check_payload = None))]
+    #[pyo3(signature = (generator, graceful_shutdown = true, metrics_labels = None, health_check_payload = None, http_endpoint_path = None))]
     fn serve_endpoint<'p>(
         &self,
         py: Python<'p>,
         generator: PyObject,
         graceful_shutdown: Option<bool>,
         metrics_labels: Option<Vec<(String, String)>>,
         health_check_payload: Option<&Bound<'p, PyDict>>,
+        http_endpoint_path: Option<&str>,
     ) -> PyResult<Bound<'p, PyAny>> {
         let engine = Arc::new(engine::PythonAsyncEngine::new(
             generator,
@@ -688,6 +689,10 @@ impl Endpoint {
             builder = builder.health_check_payload(payload);
         }
 
+        if let Some(http_endpoint_path) = http_endpoint_path {
+            builder = builder.http_endpoint_path(http_endpoint_path);
+        }
+
         let graceful_shutdown = graceful_shutdown.unwrap_or(true);
         pyo3_async_runtimes::tokio::future_into_py(py, async move {
             builder

diff --git a/lib/bindings/python/src/dynamo/_core.pyi b/lib/bindings/python/src/dynamo/_core.pyi
@@ -115,7 +115,7 @@ class Endpoint:
 
     ...
 
-    async def serve_endpoint(self, handler: RequestHandler, graceful_shutdown: bool = True, metrics_labels: Optional[List[Tuple[str, str]]] = None, health_check_payload: Optional[Dict[str, Any]] = None) -> None:
+    async def serve_endpoint(self, handler: RequestHandler, graceful_shutdown: bool = True, metrics_labels: Optional[List[Tuple[str, str]]] = None, health_check_payload: Optional[Dict[str, Any]] = None, http_endpoint_path: Optional[str] = None) -> None:
         """
         Serve an endpoint discoverable by all connected clients at
         `{{ namespace }}/components/{{ component_name }}/endpoints/{{ endpoint_name }}`

@@ -21,6 +21,7 @@
 mod openai;
 
 pub mod disconnect;
+pub mod dynamic_endpoint;
 pub mod error;
 pub mod health;
 pub mod metrics;

@@ -0,0 +1,112 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use super::{RouteDoc, service_v2};
+use crate::types::Annotated;
+use axum::{
+    Json, Router,
+    http::{Method, StatusCode},
+    response::IntoResponse,
+    routing::post,
+};
+use dynamo_runtime::instances::list_all_instances;
+use dynamo_runtime::{DistributedRuntime, Runtime, component::Client};
+use dynamo_runtime::{pipeline::PushRouter, stream::StreamExt};
+use std::sync::Arc;
+
+/// Build the router for dynamically registered HTTP endpoints.
+///
+/// Mounts a single `POST` route at `path`, or at the catch-all `/{*path}` when
+/// `path` is `None`, and returns it together with its route documentation.
+pub fn dynamic_endpoint_router(
+    state: Arc<service_v2::State>,
+    path: Option<String>,
+) -> (Vec<RouteDoc>, Router) {
+    const WILDCARD_PATH: &str = "/{*path}";
+
+    let route = path.unwrap_or_else(|| String::from(WILDCARD_PATH));
+    let handler = post(dynamic_endpoint_handler);
+
+    (
+        vec![RouteDoc::new(Method::POST, &route)],
+        Router::new().route(&route, handler).with_state(state),
+    )
+}
+
+async fn inner_dynamic_endpoint_handler(
+    state: Arc<service_v2::State>,
+    path: String,
+) -> Result<impl IntoResponse, &'static str> {
+    let etcd_client = state.etcd_client().ok_or("Failed to get etcd client")?;
+
+    let instances = list_all_instances(etcd_client)
+        .await
+        .map_err(|_| "Failed to get instances")?;
+
+    let dynamic_endpoints = instances
+        .iter()
+        .filter_map(|instance| instance.http_endpoint_path.clone())
+        .collect::<Vec<String>>();
+
+    let fmt_path = format!("/{}", &path);
+    if !dynamic_endpoints.contains(&fmt_path) {
+        return Err("Dynamic endpoint not found");
+    }
+
+    let rt = Runtime::from_current().map_err(|_| "Failed to get runtime")?;
+    let drt = DistributedRuntime::from_settings(rt)
+        .await
+        .map_err(|_| "Failed to get distributed runtime")?;
+
+    let target_instances = instances
+        .iter()
+        .filter(|instance| instance.http_endpoint_path == Some(fmt_path.clone()))
+        .collect::<Vec<_>>();
+
+    let mut target_clients: Vec<Client> = Vec::new();
+    for instance in target_instances {
+        let ns = drt
+            .namespace(instance.namespace.clone())
+            .map_err(|_| "Failed to get namespace")?;
+        let c = ns
+            .component(instance.component.clone())
+            .map_err(|_| "Failed to get component")?;
+        let ep = c.endpoint(path.clone());
+        let client = ep.client().await.map_err(|_| "Failed to get client")?;
+        target_clients.push(client);
+    }
+
+    let mut all_responses = Vec::new();
+    for client in target_clients {
+        let router =
+            PushRouter::<(), Annotated<serde_json::Value>>::from_client(client, Default::default())
+                .await
+                .map_err(|_| "Failed to get router")?;
+
+        let mut stream = router
+            .round_robin(().into())
+            .await
+            .map_err(|_| "Failed to route")?;
+
+        while let Some(resp) = stream.next().await {
+            all_responses.push(resp);
+        }
+    }
+
+    Ok(Json(serde_json::json!({
+        "responses": all_responses
+    })))
+}
+
+async fn dynamic_endpoint_handler(
+    axum::extract::State(state): axum::extract::State<Arc<service_v2::State>>,
+    axum::extract::Path(path): axum::extract::Path<String>,
+) -> impl IntoResponse {
+    inner_dynamic_endpoint_handler(state, path)
+        .await
+        .map_err(|err_string| {
+            (
+                StatusCode::INTERNAL_SERVER_ERROR,
+                Json(serde_json::json!({
+                    "message": err_string
+                })),
+            )
+        })
+}
@@ -325,6 +325,7 @@ impl HttpServiceConfigBuilder {
             super::openai::list_models_router(state.clone(), var(HTTP_SVC_MODELS_PATH_ENV).ok()),
             super::health::health_check_router(state.clone(), var(HTTP_SVC_HEALTH_PATH_ENV).ok()),
             super::health::live_check_router(state.clone(), var(HTTP_SVC_LIVE_PATH_ENV).ok()),
+            super::dynamic_endpoint::dynamic_endpoint_router(state.clone(), None),
         ];
 
         let endpoint_routes =

@@ -101,6 +101,8 @@ pub struct Instance {
     pub namespace: String,
     pub instance_id: i64,
     pub transport: TransportType,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub http_endpoint_path: Option<String>,
 }
 
 impl Instance {
@@ -463,7 +465,7 @@ impl Endpoint {
         .expect("Endpoint name and component name should be valid")
     }
 
-    /// The fully path of an instance in etcd
+    /// The full path of an instance in etcd
+    ///
+    /// Formed by joining `INSTANCE_ROOT_PATH` and `self.unique_path(lease_id)`
+    /// with a `/`.
     pub fn etcd_path_with_lease_id(&self, lease_id: i64) -> String {
         format!("{INSTANCE_ROOT_PATH}/{}", self.unique_path(lease_id))
     }