From fe9ae0c296b56e60f34e598de2ab96e5c9408e47 Mon Sep 17 00:00:00 2001 From: Weiliangl User Date: Wed, 1 Apr 2026 07:33:52 +0000 Subject: [PATCH 1/5] fix(sglang): compat old and new streaming field Signed-off-by: Weiliangl User --- components/src/dynamo/sglang/_compat.py | 35 +++++++++++++++++++++++++ components/src/dynamo/sglang/args.py | 11 ++++---- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/components/src/dynamo/sglang/_compat.py b/components/src/dynamo/sglang/_compat.py index 6cc79414a0d7..8c556b0d7298 100644 --- a/components/src/dynamo/sglang/_compat.py +++ b/components/src/dynamo/sglang/_compat.py @@ -18,6 +18,7 @@ import ipaddress import logging import socket +from typing import Any logger = logging.getLogger(__name__) @@ -98,8 +99,42 @@ def to_tcp(self) -> str: return f"tcp://{self.host}:{self.port}" +def enable_disjoint_streaming_output(server_args: Any) -> None: + """ + Enable SGLang's disjoint streaming output across ServerArgs field renames. + + Covers sglang <= 0.5.x (`stream_output`) and newer releases + (`incremental_streaming_output`). + """ + fields = getattr(type(server_args), "__dataclass_fields__", None) + if isinstance(fields, dict): + if "incremental_streaming_output" in fields: + server_args.incremental_streaming_output = True + return + if "stream_output" in fields: + server_args.stream_output = True + return + raise AttributeError( + "SGLang ServerArgs has neither 'incremental_streaming_output' nor " + "'stream_output'" + ) + + if hasattr(server_args, "incremental_streaming_output"): + server_args.incremental_streaming_output = True + return + if hasattr(server_args, "stream_output"): + server_args.stream_output = True + return + + logger.debug( + "Skipping streaming output compatibility for non-ServerArgs object: %s", + type(server_args).__name__, + ) + + __all__ = [ "NetworkAddress", + "enable_disjoint_streaming_output", "get_local_ip_auto", "get_zmq_socket", "_SGLANG_HAS_NETWORK_MODULE", diff --git a/components/src/dynamo/sglang/args.py b/components/src/dynamo/sglang/args.py index c3241c930e62..a4b225d95b72 100644 --- a/components/src/dynamo/sglang/args.py +++ b/components/src/dynamo/sglang/args.py @@ -25,6 +25,7 @@ from dynamo.llm import fetch_model from dynamo.runtime.logging import configure_dynamo_logging from dynamo.sglang.backend_args import DynamoSGLangArgGroup, DynamoSGLangConfig +from dynamo.sglang._compat import enable_disjoint_streaming_output configure_dynamo_logging() @@ -374,12 +375,10 @@ async def parse_args(args: list[str]) -> Config: ) # Dynamo's streaming handlers expect disjoint output_ids from SGLang (only new - # tokens since last output), not cumulative tokens. - # sglang renamed stream_output -> incremental_streaming_output in PR #20614. - if hasattr(ServerArgs, "incremental_streaming_output"): - server_args.incremental_streaming_output = True - else: - server_args.stream_output = True + # tokens since last output), not cumulative tokens. Modern SGLang gates this + # behavior behind incremental_streaming_output, while older releases used + # stream_output. + enable_disjoint_streaming_output(server_args) if dynamo_config.use_sglang_tokenizer: warnings.warn( From 918f55513033b7579ec642d63f5b4f572b042a79 Mon Sep 17 00:00:00 2001 From: Weiliangl User Date: Wed, 1 Apr 2026 07:38:50 +0000 Subject: [PATCH 2/5] fix(llm): preserve completion token details Signed-off-by: Weiliangl User --- lib/llm/src/protocols/openai/completions/delta.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/llm/src/protocols/openai/completions/delta.rs b/lib/llm/src/protocols/openai/completions/delta.rs index bc92bafddc55..f40f5105aedb 100644 --- a/lib/llm/src/protocols/openai/completions/delta.rs +++ b/lib/llm/src/protocols/openai/completions/delta.rs @@ -284,6 +284,11 @@ impl crate::protocols::openai::DeltaGeneratorExt for // Update prompt_tokens from worker if provided (e.g., for embeddings) self.usage.prompt_tokens = completion_usage.prompt_tokens; + // Propagate completion token details if provided + if let Some(completion_details) = completion_usage.completion_tokens_details.as_ref() { + self.usage.completion_tokens_details = Some(completion_details.clone()); + } + // Propagate prompt token details if provided if let Some(prompt_details) = completion_usage.prompt_tokens_details.as_ref() { self.usage.prompt_tokens_details = Some(prompt_details.clone()); From e7e612f4e9f7a3552b9fad6c02baff97bae36537 Mon Sep 17 00:00:00 2001 From: Weiliangl User Date: Thu, 2 Apr 2026 02:34:29 +0000 Subject: [PATCH 3/5] fix(llm): restore backend completion token usage Signed-off-by: Weiliangl User --- lib/llm/src/protocols/openai/completions/delta.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/llm/src/protocols/openai/completions/delta.rs b/lib/llm/src/protocols/openai/completions/delta.rs index f40f5105aedb..072eccc38efe 100644 --- a/lib/llm/src/protocols/openai/completions/delta.rs +++ b/lib/llm/src/protocols/openai/completions/delta.rs @@ -283,6 +283,7 @@ impl crate::protocols::openai::DeltaGeneratorExt for if let Some(completion_usage) = delta.completion_usage.as_ref() { // Update prompt_tokens from worker if provided (e.g., for embeddings) self.usage.prompt_tokens = completion_usage.prompt_tokens; + self.usage.completion_tokens = completion_usage.completion_tokens; // Propagate completion token details if provided if let Some(completion_details) = completion_usage.completion_tokens_details.as_ref() { From 4e19428cc264ffd24604d22af77d16dd698a19f4 Mon Sep 17 00:00:00 2001 From: Weiliangl User Date: Fri, 3 Apr 2026 02:09:23 +0000 Subject: [PATCH 4/5] fix(llm): keep aggregated completion token usage Signed-off-by: Weiliangl User --- lib/llm/src/protocols/openai/completions/delta.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/llm/src/protocols/openai/completions/delta.rs b/lib/llm/src/protocols/openai/completions/delta.rs index 072eccc38efe..f40f5105aedb 100644 --- a/lib/llm/src/protocols/openai/completions/delta.rs +++ b/lib/llm/src/protocols/openai/completions/delta.rs @@ -283,7 +283,6 @@ impl crate::protocols::openai::DeltaGeneratorExt for if let Some(completion_usage) = delta.completion_usage.as_ref() { // Update prompt_tokens from worker if provided (e.g., for embeddings) self.usage.prompt_tokens = completion_usage.prompt_tokens; - self.usage.completion_tokens = completion_usage.completion_tokens; // Propagate completion token details if provided if let Some(completion_details) = completion_usage.completion_tokens_details.as_ref() { From e9239fc195fea22f3236c286ec8d26a8ce8a4a52 Mon Sep 17 00:00:00 2001 From: Weiliangl User Date: Fri, 3 Apr 2026 02:20:36 +0000 Subject: [PATCH 5/5] style(sglang): sort args imports Signed-off-by: Weiliangl User --- components/src/dynamo/sglang/args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/src/dynamo/sglang/args.py b/components/src/dynamo/sglang/args.py index a4b225d95b72..931227edd266 100644 --- a/components/src/dynamo/sglang/args.py +++ b/components/src/dynamo/sglang/args.py @@ -24,8 +24,8 @@ from dynamo.common.utils.runtime import parse_endpoint from dynamo.llm import fetch_model from dynamo.runtime.logging import configure_dynamo_logging -from dynamo.sglang.backend_args import DynamoSGLangArgGroup, DynamoSGLangConfig from dynamo.sglang._compat import enable_disjoint_streaming_output +from dynamo.sglang.backend_args import DynamoSGLangArgGroup, DynamoSGLangConfig configure_dynamo_logging()