ai-dynamo · ishandhanani · Apr 13, 2026 · Mar 27, 2026 · Apr 1, 2026 · Apr 2, 2026
@@ -94,6 +94,22 @@ def create_parsers(
     return tool_call_parser, reasoning_parser
 
 
+def _normalize_prompt_token_ids(prompt_token_ids: Any) -> list[int]:
+    """Coerce a chat-template result into a list of token IDs.
+
+    Args:
+        prompt_token_ids: One of: a ``list`` (returned unchanged), an object
+            exposing a non-string ``input_ids`` attribute, a mapping holding a
+            non-string ``"input_ids"`` entry, or any other iterable.
+
+    Returns:
+        The token IDs as a list. Inputs matching none of the recognised
+        shapes are materialised with ``list(...)``.
+    """
+    # Function-scope import keeps the helper self-contained.
+    from collections.abc import Mapping
+
+    if isinstance(prompt_token_ids, list):
+        return prompt_token_ids
+
+    # Attribute access comes first so encoding-like objects whose iteration
+    # yields key names are not flattened into a list of those names.
+    input_ids = getattr(prompt_token_ids, "input_ids", None)
+    if input_ids is not None and not isinstance(input_ids, str):
+        return list(input_ids)
+
+    # Any mapping qualifies, not only ``dict``: iterating a mapping yields its
+    # keys, so the generic fallback below would return key names.
+    if isinstance(prompt_token_ids, Mapping):
+        mapped_input_ids = prompt_token_ids.get("input_ids")
+        if mapped_input_ids is not None and not isinstance(mapped_input_ids, str):
+            return list(mapped_input_ids)
+
+    return list(prompt_token_ids)
+
+
 def preprocess_chat_request(
     request: dict[str, Any],
     *,
@@ -124,9 +140,9 @@ def preprocess_chat_request(
     ):
         template_kwargs["tools"] = [t.model_dump() for t in sglang_tools]
 
-    prompt_token_ids = tokenizer.apply_chat_template(messages, **template_kwargs)
-    if not isinstance(prompt_token_ids, list):
-        prompt_token_ids = list(prompt_token_ids)
+    prompt_token_ids = _normalize_prompt_token_ids(
+        tokenizer.apply_chat_template(messages, **template_kwargs)
+    )
 
     tool_call_parser, reasoning_parser = create_parsers(
         request,

@@ -40,6 +40,17 @@
 logger = logging.getLogger(__name__)
 
 
+def _runtime_config_parser_name(
+    mdc: ModelDeploymentCard,
+    key: str,
+) -> str | None:
+    """Look up a parser name stored under ``key`` in the card's runtime config.
+
+    Returns the value only when ``mdc.runtime_config()`` is a dict and the
+    entry is a non-empty string; every other case yields ``None``.
+    """
+    config = mdc.runtime_config()
+    if isinstance(config, dict):
+        candidate = config.get(key)
+        if isinstance(candidate, str) and candidate:
+            return candidate
+    return None
+
+
 def _unsupported_n_error(n: int) -> dict[str, Any]:
     return {
         "error": {
@@ -553,8 +564,14 @@ async def chat_engine_factory(
 
         eos_token_id = getattr(tokenizer, "eos_token_id", None)
 
-        tool_call_parser_name = self.tool_call_parser_name
-        reasoning_parser_name = self.reasoning_parser_name
+        tool_call_parser_name = (
+            self.tool_call_parser_name
+            or _runtime_config_parser_name(mdc, "tool_call_parser")
+        )
+        reasoning_parser_name = (
+            self.reasoning_parser_name
+            or _runtime_config_parser_name(mdc, "reasoning_parser")
+        )
 
         if tool_call_parser_name:
             logger.info("SGLang tool call parser: %s", tool_call_parser_name)

@@ -17,6 +17,7 @@
 from dynamo.frontend.sglang_prepost import (
     SglangPreprocessResult,
     SglangStreamingPostProcessor,
+    _normalize_prompt_token_ids,
     convert_tools,
     create_parsers,
     preprocess_chat_request,
@@ -26,6 +27,7 @@
     _build_dynamo_preproc,
     _init_worker,
     _map_finish_reason,
+    _runtime_config_parser_name,
 )
 from dynamo.frontend.utils import PreprocessError, random_call_id, random_uuid
 
@@ -436,6 +438,46 @@ def test_both_parsers(self):
         assert rp is not None
 
 
+class TestNormalizePromptTokenIds:
+    """Token IDs must come from ``input_ids``, not from iterating the result."""
+
+    def test_batch_encoding_like_object_uses_input_ids(self):
+        class FakeBatchEncoding:
+            # Iterating yields key names, so a bare ``list(...)`` would be wrong.
+            input_ids = [11, 22, 33]
+
+            def __iter__(self):
+                return iter(("input_ids", "attention_mask"))
+
+        encoding = FakeBatchEncoding()
+        assert _normalize_prompt_token_ids(encoding) == [11, 22, 33]
+
+    def test_mapping_uses_input_ids(self):
+        encoded = {"input_ids": [1, 2, 3], "attention_mask": [1, 1, 1]}
+        assert _normalize_prompt_token_ids(encoded) == [1, 2, 3]
+
+
+class TestRuntimeConfigParserName:
+    """Covers parser-name lookups against a stubbed deployment card."""
+
+    @staticmethod
+    def _fake_mdc(config):
+        """Build a stand-in card whose ``runtime_config()`` returns ``config``."""
+
+        class FakeMdc:
+            def runtime_config(self):
+                return config
+
+        return FakeMdc()
+
+    def test_missing_runtime_config_returns_none(self):
+        mdc = self._fake_mdc(None)
+        assert _runtime_config_parser_name(mdc, "tool_call_parser") is None
+
+    def test_missing_key_returns_none(self):
+        mdc = self._fake_mdc({"reasoning_parser": "qwen3"})
+        assert _runtime_config_parser_name(mdc, "tool_call_parser") is None
+
+    def test_reads_non_empty_string_value(self):
+        mdc = self._fake_mdc({"tool_call_parser": "hermes"})
+        assert _runtime_config_parser_name(mdc, "tool_call_parser") == "hermes"
+
+
 # ---------------------------------------------------------------------------
 # preprocess_chat_request
 # ---------------------------------------------------------------------------

diff --git a/components/src/dynamo/sglang/CLAUDE.md b/components/src/dynamo/sglang/CLAUDE.md
@@ -283,6 +283,14 @@ text-to-video-diffusion.sh  # 1-2 GPUs - Text-to-video (Wan2.1)
   Always slice with an offset, don't assume per-chunk logprobs.
 - **Zombie GPU processes**: `sgl_diffusion::scheduler` spawns a child process that
   survives parent kill. Always check `nvidia-smi` after teardown.
+- **Session control graceful degradation**: Session control is request-driven --
+  the router's `AgentController` and `StickySessionRouter` are always created but
+  activate lazily. If no worker has `--enable-streaming-session`, the router warns
+  once and ignores `session_control` in requests. On the handler side,
+  `_session_kwargs()` checks `enable_streaming_session` before injecting
+  `session_params` into SGLang calls. Both layers must agree: the router skips
+  lifecycle RPCs, and the handler skips session params. Without both guards,
+  SGLang errors with "session id does not exist".
 
 For troubleshooting (CuDNN, config.json errors, OOM, disagg connectivity), see
 `docs/backends/sglang/sglang-examples.md#troubleshooting`.

@@ -117,8 +117,15 @@ async def init_decode(
             "The chat template will be loaded but the /v1/chat/completions endpoint will not be available."
         )
 
+    # Only serve session_control when streaming sessions are enabled.
+    if getattr(server_args, "enable_streaming_session", False):
+        session_control_endpoint = runtime.endpoint(
+            f"{dynamo_args.namespace}.{dynamo_args.component}.session_control"
+        )
+        shutdown_endpoints.append(session_control_endpoint)
+
     try:
-        await asyncio.gather(
+        gather_tasks = [
             generate_endpoint.serve_endpoint(
                 handler.generate,
                 graceful_shutdown=True,
@@ -133,7 +140,12 @@ async def init_decode(
                 output_type=parse_endpoint_types(dynamo_args.endpoint_types),
                 readiness_gate=ready_event,
             ),
-        )
+        ]
+        if getattr(server_args, "enable_streaming_session", False):
+            gather_tasks.append(
+                session_control_endpoint.serve_endpoint(handler.session_control)
+            )
+        await asyncio.gather(*gather_tasks)
     except Exception as e:
         logging.error(f"Failed to serve endpoints: {e}")
         raise

@@ -108,7 +108,6 @@ def _get_bootstrap_info_for_config(
                 f"Using auto-detected local IP: {local_ip} "
                 f"({'IPv6' if local_addr.is_ipv6 else 'IPv4'})"
             )
-
         return bootstrap_host, bootstrap_port
     except Exception as e:
         logging.warning(f"Failed to get bootstrap info: {e}")

@@ -478,6 +478,78 @@ async def update_weight_version(self, body: dict) -> dict:
             "new_version": req.new_version,
         }
 
+    async def open_session(self, body: dict) -> dict:
+        """Open a streaming session for subagent KV isolation.
+
+        Args:
+            body: Dict with "session_id", optional "timeout" (default 120),
+                  and optional "capacity_of_str_len" (default 65536).
+
+        Returns:
+            A dict whose "status" is "ok" or "error". Failures carry a
+            "message"; exceptions are logged and reported, never raised.
+        """
+        from sglang.srt.managers.io_struct import OpenSessionReqInput
+
+        session_id = body.get("session_id")
+        if not session_id:
+            return {"status": "error", "message": "session_id required"}
+
+        timeout_s = body.get("timeout", 120)
+        max_str_len = body.get("capacity_of_str_len", 65536)
+        try:
+            open_req = OpenSessionReqInput(
+                capacity_of_str_len=max_str_len,
+                session_id=session_id,
+                streaming=True,
+                timeout=float(timeout_s),
+            )
+            opened_id = await self.engine.tokenizer_manager.open_session(
+                open_req, None
+            )
+        except Exception as e:
+            logging.error(f"Failed to open session {session_id}: {e}")
+            return {"status": "error", "message": str(e)}
+
+        if opened_id is not None:
+            return {"status": "ok", "session_id": opened_id}
+        # A ``None`` result is reported to the caller as an existing session.
+        return {
+            "status": "ok",
+            "session_id": session_id,
+            "message": "Session already exists",
+        }
+
+    async def close_session(self, body: dict) -> dict:
+        """Close a streaming session and release its KV resources.
+
+        Args:
+            body: Dict with "session_id".
+
+        Returns:
+            A dict whose "status" is "ok" (echoing the session id) or "error"
+            (with a "message"); exceptions are logged and reported, not raised.
+        """
+        from sglang.srt.managers.io_struct import CloseSessionReqInput
+
+        session_id = body.get("session_id")
+        if not session_id:
+            return {"status": "error", "message": "session_id required"}
+
+        try:
+            close_req = CloseSessionReqInput(session_id=session_id)
+            await self.engine.tokenizer_manager.close_session(close_req, None)
+        except Exception as e:
+            logging.error(f"Failed to close session {session_id}: {e}")
+            return {"status": "error", "message": str(e)}
+        return {"status": "ok", "session_id": session_id}
+
+    async def session_control(self, request, context=None):
+        """Dispatch a session lifecycle request to the matching handler.
+
+        Args:
+            request: Dict with an "action" key ("open_session" or
+                     "close_session") plus that action's parameters; the whole
+                     dict is forwarded to the handler.
+            context: Optional Dynamo context (unused but required by protocol).
+
+        Yields:
+            Exactly one dict: the handler's result, or an error dict when the
+            action is not recognised.
+        """
+        action = request.get("action")
+        if action == "open_session":
+            yield await self.open_session(request)
+        elif action == "close_session":
+            yield await self.close_session(request)
+        else:
+            yield {"status": "error", "message": f"Unknown action: {action}"}
+
     def register_engine_routes(self, runtime: DistributedRuntime) -> None:
         """Register all engine routes for this handler.
 
@@ -511,6 +583,9 @@ def register_engine_routes(self, runtime: DistributedRuntime) -> None:
             self.config.dynamo_args, "enable_rl", False
         ):
             self.register_rl_engine_routes(runtime)
+        # session_control is served as a discoverable service endpoint
+        # (not an engine route) so the router can find it via
+        # component.endpoint("session_control"). See init_llm.py.
 
     @abstractmethod
     def generate(self, request: RequestT, context: Context) -> AsyncIterator[ResponseT]:
@@ -539,6 +614,18 @@ def _get_input_param(self, request: Dict[str, Any]) -> Dict[str, Any]:
             "prompt" if isinstance(request_input, str) else "input_ids": request_input
         }
 
+    def _session_kwargs(self, request: Dict[str, Any]) -> Dict[str, Any]:
+        """Return the ``session_params`` keyword argument for a request, if any.
+
+        An empty dict is returned when ``enable_streaming_session`` is not set
+        on the server args, or when the request carries no
+        ``routing.session_control.session_id``.
+        """
+        server_args = self.config.server_args
+        if getattr(server_args, "enable_streaming_session", False):
+            routing = request.get("routing") or {}
+            control = routing.get("session_control") or {}
+            session_id = control.get("session_id")
+            if session_id:
+                # Streaming sessions only need the session identifier on each turn.
+                return {"session_params": {"id": session_id}}
+        return {}
+
     @staticmethod
     def _get_guided_decoding_params(
         guided_decoding: Optional[Dict[str, Any]],

@@ -305,6 +305,7 @@ async def generate(
                 external_trace_header=trace_header,
                 rid=trace_id,
                 data_parallel_rank=dp_rank,
+                **self._session_kwargs(request),
                 **logprob_kwargs,
                 **self._priority_kwargs(priority),
             )
@@ -338,6 +339,7 @@ async def generate(
                 external_trace_header=trace_header,
                 rid=trace_id,
                 data_parallel_rank=dp_rank,
+                **self._session_kwargs(request),
                 **logprob_kwargs,
                 **self._priority_kwargs(priority),
             )

@@ -157,6 +157,7 @@ async def generate(
             external_trace_header=trace_header,
             rid=trace_id,
             data_parallel_rank=dp_rank,
+            **self._session_kwargs(inner_request),
             **self._priority_kwargs(priority),
         )