Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions components/src/dynamo/sglang/_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import ipaddress
import logging
import socket
from typing import Any

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -98,8 +99,42 @@ def to_tcp(self) -> str:
return f"tcp://{self.host}:{self.port}"


def enable_disjoint_streaming_output(server_args: Any) -> None:
    """
    Switch on SGLang's disjoint streaming output, whatever the flag is called.

    The flag was renamed between SGLang releases: sglang <= 0.5.x exposes it
    as `stream_output`, newer releases as `incremental_streaming_output`.
    The newer name is preferred when both are present.

    Args:
        server_args: Object to mutate in place. If its type carries a
            `__dataclass_fields__` dict, the declared field names decide
            which flag is set; otherwise plain attribute presence is used.

    Raises:
        AttributeError: If the type declares dataclass fields but none of
            them is a known streaming flag.
    """
    # Ordered by priority: the first name found is the only one set.
    flag_names = ("incremental_streaming_output", "stream_output")
    args_type = type(server_args)
    declared = getattr(args_type, "__dataclass_fields__", None)

    if isinstance(declared, dict):
        # Dataclass-like type: trust the declared fields, and fail loudly
        # when neither spelling of the flag exists.
        for flag in flag_names:
            if flag in declared:
                setattr(server_args, flag, True)
                return
        raise AttributeError(
            "SGLang ServerArgs has neither 'incremental_streaming_output' nor "
            "'stream_output'"
        )

    # Not dataclass-like: fall back to duck typing on the instance.
    for flag in flag_names:
        if hasattr(server_args, flag):
            setattr(server_args, flag, True)
            return

    # Nothing to set; this is treated as a non-ServerArgs object and only
    # logged, not raised.
    logger.debug(
        "Skipping streaming output compatibility for non-ServerArgs object: %s",
        args_type.__name__,
    )


__all__ = [
"NetworkAddress",
"enable_disjoint_streaming_output",
"get_local_ip_auto",
"get_zmq_socket",
"_SGLANG_HAS_NETWORK_MODULE",
Expand Down
11 changes: 5 additions & 6 deletions components/src/dynamo/sglang/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from dynamo.common.utils.runtime import parse_endpoint
from dynamo.llm import fetch_model
from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.sglang._compat import enable_disjoint_streaming_output
from dynamo.sglang.backend_args import DynamoSGLangArgGroup, DynamoSGLangConfig

configure_dynamo_logging()
Expand Down Expand Up @@ -374,12 +375,10 @@ async def parse_args(args: list[str]) -> Config:
)

# Dynamo's streaming handlers expect disjoint output_ids from SGLang (only new
# tokens since last output), not cumulative tokens.
# sglang renamed stream_output -> incremental_streaming_output in PR #20614.
if hasattr(ServerArgs, "incremental_streaming_output"):
server_args.incremental_streaming_output = True
else:
server_args.stream_output = True
# tokens since last output), not cumulative tokens. Modern SGLang gates this
# behavior behind incremental_streaming_output, while older releases used
# stream_output.
enable_disjoint_streaming_output(server_args)

if dynamo_config.use_sglang_tokenizer:
warnings.warn(
Expand Down
5 changes: 5 additions & 0 deletions lib/llm/src/protocols/openai/completions/delta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,11 @@ impl crate::protocols::openai::DeltaGeneratorExt<NvCreateCompletionResponse> for
// Update prompt_tokens from worker if provided (e.g., for embeddings)
self.usage.prompt_tokens = completion_usage.prompt_tokens;

// Propagate completion token details if provided
if let Some(completion_details) = completion_usage.completion_tokens_details.as_ref() {
self.usage.completion_tokens_details = Some(completion_details.clone());
}

// Propagate prompt token details if provided
if let Some(prompt_details) = completion_usage.prompt_tokens_details.as_ref() {
self.usage.prompt_tokens_details = Some(prompt_details.clone());
Expand Down
Loading