From 1757e936a66dac5546896d1cc8446345f995f6a2 Mon Sep 17 00:00:00 2001 From: jellysnack Date: Thu, 26 Feb 2026 12:52:45 +0300 Subject: [PATCH 1/2] sglang guided decoding support Signed-off-by: jellysnack --- components/src/dynamo/sglang/args.py | 11 ++--------- components/src/dynamo/sglang/register.py | 4 ++-- .../sglang/request_handlers/handler_base.py | 18 +++++++++++++++--- .../request_handlers/llm/decode_handler.py | 8 +++++--- .../request_handlers/llm/diffusion_handler.py | 2 +- .../request_handlers/llm/prefill_handler.py | 1 + 6 files changed, 26 insertions(+), 18 deletions(-) diff --git a/components/src/dynamo/sglang/args.py b/components/src/dynamo/sglang/args.py index 0269d8aec6c2..166864f234e0 100644 --- a/components/src/dynamo/sglang/args.py +++ b/components/src/dynamo/sglang/args.py @@ -353,7 +353,6 @@ async def parse_args(args: list[str]) -> Config: server_args.served_model_name = parsed_args.served_model_name server_args.enable_metrics = getattr(parsed_args, "enable_metrics", False) server_args.log_level = getattr(parsed_args, "log_level", "info") - server_args.skip_tokenizer_init = True server_args.kv_events_config = getattr(parsed_args, "kv_events_config", None) server_args.tp_size = getattr(parsed_args, "tp_size", 1) server_args.dp_size = getattr(parsed_args, "dp_size", 1) @@ -374,15 +373,9 @@ async def parse_args(args: list[str]) -> Config: server_args.stream_output = True if dynamo_config.use_sglang_tokenizer: - logging.info( - "Using SGLang's built in tokenizer. Setting skip_tokenizer_init to False" - ) - server_args.skip_tokenizer_init = False + logging.info("Using SGLang's built in tokenizer") else: - logging.info( - "Using dynamo's built in tokenizer. Setting skip_tokenizer_init to True" - ) - server_args.skip_tokenizer_init = True + logging.info("Using dynamo's built in tokenizer") # Derive use_kv_events from server_args.kv_events_config # Check that kv_events_config exists AND publisher is not "null" ("zmq" or any future publishers) diff --git a/components/src/dynamo/sglang/register.py b/components/src/dynamo/sglang/register.py index 249117522201..55ce776cbf00 100644 --- a/components/src/dynamo/sglang/register.py +++ b/components/src/dynamo/sglang/register.py @@ -40,9 +40,9 @@ async def _register_model_with_runtime_config( runtime_config = await _get_runtime_config(engine, server_args, dynamo_args) input_type = input_type - if not server_args.skip_tokenizer_init: + if dynamo_args.use_sglang_tokenizer: logging.warning( - "The skip-tokenizer-init flag was not set. Using the sglang tokenizer/detokenizer instead. The dynamo tokenizer/detokenizer will not be used and only v1/chat/completions will be available" + "Using the sglang tokenizer/detokenizer instead. The dynamo tokenizer/detokenizer will not be used and only v1/chat/completions will be available" ) input_type = ModelInput.Text # Only override output_type for chat models, not for embeddings diff --git a/components/src/dynamo/sglang/request_handlers/handler_base.py b/components/src/dynamo/sglang/request_handlers/handler_base.py index df8f7f1a0163..2f1149726a4d 100644 --- a/components/src/dynamo/sglang/request_handlers/handler_base.py +++ b/components/src/dynamo/sglang/request_handlers/handler_base.py @@ -3,6 +3,7 @@ import asyncio import inspect +import json import logging import random import socket @@ -126,13 +127,13 @@ def __init__( self.metrics_publisher = None self.kv_publisher = None self.serving_mode = config.serving_mode - self.skip_tokenizer_init = config.server_args.skip_tokenizer_init + self.use_sglang_tokenizer = config.dynamo_args.use_sglang_tokenizer self.enable_trace = config.server_args.enable_trace if engine is not None: self.input_param_manager = InputParamManager( self.engine.tokenizer_manager.tokenizer - if not self.skip_tokenizer_init + if self.use_sglang_tokenizer else None ) self._engine_supports_priority = ( @@ -380,13 +381,24 @@ def cleanup(self) -> None: def _get_input_param(self, request: Dict[str, Any]) -> Dict[str, Any]: request_input = self.input_param_manager.get_input_param( - request, use_tokenizer=not self.skip_tokenizer_init + request, use_tokenizer=self.use_sglang_tokenizer ) return { "prompt" if isinstance(request_input, str) else "input_ids": request_input } + @staticmethod + def _get_guided_decoding_params( + guided_decoding: Optional[Dict[str, Any]], + ) -> Dict[str, Any]: + """Extract guided decoding params (e.g. json_schema) for SGLang sampling_params.""" + if isinstance(guided_decoding, dict): + json_schema = guided_decoding.get("json") + if json_schema is not None: + return {"json_schema": json.dumps(json_schema)} + return {} + @staticmethod def _generate_bootstrap_room() -> int: """Generate a unique bootstrap room ID for disaggregated serving. diff --git a/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py b/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py index 33e24df9dbea..43f6629f67e9 100644 --- a/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py +++ b/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py @@ -66,7 +66,7 @@ def _build_sampling_params(self, request: Dict[str, Any]) -> Dict[str, Any]: Returns: Dict of sampling parameters for SGLang engine. """ - if self.skip_tokenizer_init: + if not self.use_sglang_tokenizer: # Token-based request format sampling_opts = request.get("sampling_options", {}) stop_conditions = request.get("stop_conditions", {}) @@ -77,6 +77,7 @@ def _build_sampling_params(self, request: Dict[str, Any]) -> Dict[str, Any]: "top_k": sampling_opts.get("top_k"), "max_new_tokens": stop_conditions.get("max_tokens"), "ignore_eos": stop_conditions.get("ignore_eos"), + **self._get_guided_decoding_params(sampling_opts.get("guided_decoding")), } else: # OpenAI request format @@ -85,6 +86,7 @@ def _build_sampling_params(self, request: Dict[str, Any]) -> Dict[str, Any]: "top_p": request.get("top_p"), "top_k": request.get("top_k"), "max_new_tokens": request.get("max_tokens"), + **self._get_guided_decoding_params(request.get("guided_decoding")), } return {k: v for k, v in param_mapping.items() if v is not None} @@ -151,7 +153,7 @@ async def generate( **self._priority_kwargs(priority), ) - if self.skip_tokenizer_init: + if not self.use_sglang_tokenizer: async for out in self._process_token_stream(decode, context): yield out else: @@ -190,7 +192,7 @@ async def generate( data_parallel_rank=dp_rank, **self._priority_kwargs(priority), ) - if self.skip_tokenizer_init: + if not self.use_sglang_tokenizer: async for out in self._process_token_stream(agg, context): yield out else: diff --git a/components/src/dynamo/sglang/request_handlers/llm/diffusion_handler.py b/components/src/dynamo/sglang/request_handlers/llm/diffusion_handler.py index a8997757479b..6675faa9a808 100644 --- a/components/src/dynamo/sglang/request_handlers/llm/diffusion_handler.py +++ b/components/src/dynamo/sglang/request_handlers/llm/diffusion_handler.py @@ -88,7 +88,7 @@ async def generate( ) # Process stream output (token-based or text-based) - if self.skip_tokenizer_init: + if not self.use_sglang_tokenizer: async for out in self._process_token_stream(async_gen, context): yield out else: diff --git a/components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py b/components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py index 05f712f12074..bb9f4a0489e9 100644 --- a/components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py +++ b/components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py @@ -81,6 +81,7 @@ async def generate( "top_p": sampling_opts.get("top_p"), "top_k": sampling_opts.get("top_k"), "max_new_tokens": stop_conditions.get("max_tokens"), + **self._get_guided_decoding_params(sampling_opts.get("guided_decoding")), } sampling_params = { k: v for k, v in sampling_params.items() if v is not None From 7390085366cbf4da703e63ec5449b0e599d93a47 Mon Sep 17 00:00:00 2001 From: jellysnack Date: Thu, 26 Feb 2026 12:53:25 +0300 Subject: [PATCH 2/2] sglang guided decoding support Signed-off-by: jellysnack --- .../src/dynamo/sglang/request_handlers/llm/decode_handler.py | 4 +++- .../src/dynamo/sglang/request_handlers/llm/prefill_handler.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py b/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py index 43f6629f67e9..7da95d60092a 100644 --- a/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py +++ b/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py @@ -77,7 +77,9 @@ def _build_sampling_params(self, request: Dict[str, Any]) -> Dict[str, Any]: "top_k": sampling_opts.get("top_k"), "max_new_tokens": stop_conditions.get("max_tokens"), "ignore_eos": stop_conditions.get("ignore_eos"), - **self._get_guided_decoding_params(sampling_opts.get("guided_decoding")), + **self._get_guided_decoding_params( + sampling_opts.get("guided_decoding") + ), } else: # OpenAI request format diff --git a/components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py b/components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py index bb9f4a0489e9..0f39a0ed89c8 100644 --- a/components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py +++ b/components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py @@ -81,7 +81,9 @@ async def generate( "top_p": sampling_opts.get("top_p"), "top_k": sampling_opts.get("top_k"), "max_new_tokens": stop_conditions.get("max_tokens"), - **self._get_guided_decoding_params(sampling_opts.get("guided_decoding")), + **self._get_guided_decoding_params( + sampling_opts.get("guided_decoding") + ), } sampling_params = { k: v for k, v in sampling_params.items() if v is not None