Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/advanced_features/server_arguments.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s
| `--pp-max-micro-batch-size` | The maximum micro batch size in pipeline parallelism. | `None` | Type: int |
| `--pp-async-batch-depth` | The async batch depth of pipeline parallelism. | `0` | Type: int |
| `--stream-interval` | The interval (or buffer size) for streaming in terms of the token length. A smaller value makes streaming smoother, while a larger value makes the throughput higher | `1` | Type: int |
| `--stream-output` | Whether to output as a sequence of disjoint segments. | `False` | bool flag (set to enable) |
| `--incremental-streaming-output` | Whether to output as a sequence of disjoint segments. | `False` | bool flag (set to enable) |
| `--random-seed` | The random seed. | `None` | Type: int |
| `--constrained-json-whitespace-pattern` | (outlines and llguidance backends only) Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model to generate consecutive whitespaces, set the pattern to [\n\t ]* | `None` | Type: str |
| `--constrained-json-disable-any-whitespace` | (xgrammar and llguidance backends only) Enforce compact representation in JSON constrained output. | `False` | bool flag (set to enable) |
Expand Down
2 changes: 1 addition & 1 deletion docs/platforms/ascend_npu_support_features.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ click [Server Arguments](https://docs.sglang.io/advanced_features/server_argumen
| `--pp-max-micro-batch-size` | `None` | Type: int | A2, A3 |
| `--pp-async-batch-depth` | `None` | Type: int | A2, A3 |
| `--stream-interval` | `1` | Type: int | A2, A3 |
| `--stream-output` | `False` | bool flag (set to enable) | A2, A3 |
| `--incremental-streaming-output` | `False` | bool flag (set to enable) | A2, A3 |
| `--random-seed` | `None` | Type: int | A2, A3 |
| `--constrained-json-`<br/>`whitespace-pattern` | `None` | Type: str | A2, A3 |
| `--constrained-json-`<br/>`disable-any-whitespace` | `False` | bool flag (set to enable) | A2, A3 |
Expand Down
4 changes: 2 additions & 2 deletions python/sglang/srt/entrypoints/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,13 +199,13 @@ def append_output(self, output) -> None:
completion_tokens is not None
and len(output_token_ids) == completion_tokens
):
# Case 1: When --stream-output is not set.
# Case 1: When --incremental-streaming-output is not set.
# The output_ids contains all tokens generated so far.
# We only need to process the new tokens.
new_token_ids = output_token_ids[self.num_processed_tokens :]
self.num_processed_tokens = len(output_token_ids)
else:
# Case 2: When --stream-output is set.
# Case 2: When --incremental-streaming-output is set.
# The output_ids contains only the new tokens.
new_token_ids = output_token_ids
self.num_processed_tokens += len(output_token_ids)
Expand Down
4 changes: 2 additions & 2 deletions python/sglang/srt/managers/tokenizer_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -1574,7 +1574,7 @@ def _handle_batch_output(
state.text += recv_obj.output_strs[i]
# Not all request types have `stream` (e.g., EmbeddingReqInput). Default to non-streaming.
is_stream = getattr(state.obj, "stream", False)
if self.server_args.stream_output and is_stream:
if self.server_args.incremental_streaming_output and is_stream:
state.output_ids.extend(recv_obj.output_ids[i])
output_token_ids = state.output_ids[state.last_output_offset :]
state.last_output_offset = len(state.output_ids)
Expand All @@ -1590,7 +1590,7 @@ def _handle_batch_output(

elif isinstance(recv_obj, BatchTokenIDOutput):
is_stream = getattr(state.obj, "stream", False)
if self.server_args.stream_output and is_stream:
if self.server_args.incremental_streaming_output and is_stream:
state.output_ids.extend(recv_obj.output_ids[i])
output_token_ids = state.output_ids[state.last_output_offset :]
state.last_output_offset = len(state.output_ids)
Expand Down
37 changes: 35 additions & 2 deletions python/sglang/srt/server_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,7 @@ class ServerArgs:
pp_max_micro_batch_size: Optional[int] = None
pp_async_batch_depth: int = 0
stream_interval: int = 1
stream_output: bool = False
incremental_streaming_output: bool = False
enable_streaming_session: bool = False
random_seed: Optional[int] = None
constrained_json_whitespace_pattern: Optional[str] = None
Expand Down Expand Up @@ -3791,10 +3791,17 @@ def add_cli_args(parser: argparse.ArgumentParser):
help="The interval (or buffer size) for streaming in terms of the token length. A smaller value makes streaming smoother, while a larger value makes the throughput higher",
)
parser.add_argument(
"--stream-output",
"--incremental-streaming-output",
action="store_true",
help="Whether to output as a sequence of disjoint segments.",
)
parser.add_argument(
"--stream-output",
action=DeprecatedStoreTrueAction,
dest="incremental_streaming_output",
new_flag="--incremental-streaming-output",
help="[Deprecated] Use --incremental-streaming-output instead.",
)
parser.add_argument(
"--enable-streaming-session",
action="store_true",
Expand Down Expand Up @@ -6284,6 +6291,32 @@ def __call__(self, parser, namespace, values, option_string=None):
)


class DeprecatedStoreTrueAction(argparse.Action):
    """Deprecated store-true flag that warns and forwards to a replacement.

    Behaves like ``action="store_true"`` (``nargs=0``; stores ``const``,
    which defaults to ``True``), but when the option appears on the command
    line it prints a deprecation warning that names ``new_flag`` as the
    replacement. Intended for aliasing a renamed CLI flag (e.g.
    ``--stream-output`` -> ``--incremental-streaming-output``) via ``dest``.
    """

    def __init__(
        self,
        option_strings,
        dest,
        new_flag=None,
        nargs=0,
        const=True,
        default=False,
        **kwargs,
    ):
        # Remember the replacement flag so the warning can point users to it.
        self.new_flag = new_flag
        super().__init__(
            option_strings, dest, nargs=nargs, const=const, default=default, **kwargs
        )

    def __call__(self, parser, namespace, values, option_string=None):
        replacement = f" Use '{self.new_flag}' instead." if self.new_flag else ""
        print_deprecated_warning(
            f"'{option_string}' is deprecated and will be removed in a future release.{replacement}"
        )
        # Store ``const`` (defaults to True) rather than a hard-coded True so
        # the ``const`` constructor parameter is actually honored, matching
        # argparse's store_const semantics.
        setattr(namespace, self.dest, self.const)


def auto_choose_speculative_params(self: ServerArgs):
"""
Automatically choose the parameters for speculative decoding.
Expand Down
4 changes: 2 additions & 2 deletions test/manual/test_config_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def test_server_args_config_parser(merger):
"tensor-parallel-size": 2,
"trust-remote-code": False,
"enable-metrics": True,
"stream-output": True,
"incremental-streaming-output": True,
"skip-server-warmup": False,
"log-requests": True,
"show-time-cost": True,
Expand Down Expand Up @@ -64,7 +64,7 @@ def test_server_args_config_parser(merger):

# Test boolean arguments
assert "--enable-metrics" in merged_args # True boolean
assert "--stream-output" in merged_args # True boolean
assert "--incremental-streaming-output" in merged_args # True boolean
assert "--log-requests" in merged_args # True boolean
assert "--show-time-cost" in merged_args # True boolean
# False booleans should not be present (only add flag if True)
Expand Down
2 changes: 1 addition & 1 deletion test/registered/tokenizer/test_skip_tokenizer_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def setUpClass(cls):
cls.model,
cls.base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
other_args=["--skip-tokenizer-init", "--stream-output"],
other_args=["--skip-tokenizer-init", "--incremental-streaming-output"],
)
cls.eos_token_id = [119690]
cls.tokenizer = AutoTokenizer.from_pretrained(
Expand Down
Loading