diff --git a/docs/advanced_features/server_arguments.md b/docs/advanced_features/server_arguments.md
index 1317f04f5273..5b35c9215b73 100644
--- a/docs/advanced_features/server_arguments.md
+++ b/docs/advanced_features/server_arguments.md
@@ -158,7 +158,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s
| `--pp-max-micro-batch-size` | The maximum micro batch size in pipeline parallelism. | `None` | Type: int |
| `--pp-async-batch-depth` | The async batch depth of pipeline parallelism. | `0` | Type: int |
| `--stream-interval` | The interval (or buffer size) for streaming in terms of the token length. A smaller value makes streaming smoother, while a larger value makes the throughput higher | `1` | Type: int |
-| `--stream-output` | Whether to output as a sequence of disjoint segments. | `False` | bool flag (set to enable) |
+| `--incremental-streaming-output` | Whether to output as a sequence of disjoint segments. | `False` | bool flag (set to enable) |
| `--random-seed` | The random seed. | `None` | Type: int |
| `--constrained-json-whitespace-pattern` | (outlines and llguidance backends only) Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model to generate consecutive whitespaces, set the pattern to [\n\t ]* | `None` | Type: str |
| `--constrained-json-disable-any-whitespace` | (xgrammar and llguidance backends only) Enforce compact representation in JSON constrained output. | `False` | bool flag (set to enable) |
diff --git a/docs/platforms/ascend_npu_support_features.md b/docs/platforms/ascend_npu_support_features.md
index 1749f525374b..0dec8a312407 100644
--- a/docs/platforms/ascend_npu_support_features.md
+++ b/docs/platforms/ascend_npu_support_features.md
@@ -84,7 +84,7 @@ click [Server Arguments](https://docs.sglang.io/advanced_features/server_argumen
| `--pp-max-micro-batch-size` | `None` | Type: int | A2, A3 |
| `--pp-async-batch-depth` | `None` | Type: int | A2, A3 |
| `--stream-interval` | `1` | Type: int | A2, A3 |
-| `--stream-output` | `False` | bool flag (set to enable) | A2, A3 |
+| `--incremental-streaming-output` | `False` | bool flag (set to enable) | A2, A3 |
| `--random-seed` | `None` | Type: int | A2, A3 |
| `--constrained-json-`<br>`whitespace-pattern` | `None` | Type: str | A2, A3 |
| `--constrained-json-`<br>`disable-any-whitespace` | `False` | bool flag (set to enable) | A2, A3 |
diff --git a/python/sglang/srt/entrypoints/context.py b/python/sglang/srt/entrypoints/context.py
index 083e75f17ebf..dd6af3f8980d 100644
--- a/python/sglang/srt/entrypoints/context.py
+++ b/python/sglang/srt/entrypoints/context.py
@@ -199,13 +199,13 @@ def append_output(self, output) -> None:
completion_tokens is not None
and len(output_token_ids) == completion_tokens
):
- # Case 1: When --stream-output is not set.
+ # Case 1: When --incremental-streaming-output is not set.
# The output_ids contains all tokens generated so far.
# We only need to process the new tokens.
new_token_ids = output_token_ids[self.num_processed_tokens :]
self.num_processed_tokens = len(output_token_ids)
else:
- # Case 2: When --stream-output is set.
+ # Case 2: When --incremental-streaming-output is set.
# The output_ids contains only the new tokens.
new_token_ids = output_token_ids
self.num_processed_tokens += len(output_token_ids)
diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
index e78869f6b055..0e7e41a7948d 100644
--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -1574,7 +1574,7 @@ def _handle_batch_output(
state.text += recv_obj.output_strs[i]
# Not all request types have `stream` (e.g., EmbeddingReqInput). Default to non-streaming.
is_stream = getattr(state.obj, "stream", False)
- if self.server_args.stream_output and is_stream:
+ if self.server_args.incremental_streaming_output and is_stream:
state.output_ids.extend(recv_obj.output_ids[i])
output_token_ids = state.output_ids[state.last_output_offset :]
state.last_output_offset = len(state.output_ids)
@@ -1590,7 +1590,7 @@ def _handle_batch_output(
elif isinstance(recv_obj, BatchTokenIDOutput):
is_stream = getattr(state.obj, "stream", False)
- if self.server_args.stream_output and is_stream:
+ if self.server_args.incremental_streaming_output and is_stream:
state.output_ids.extend(recv_obj.output_ids[i])
output_token_ids = state.output_ids[state.last_output_offset :]
state.last_output_offset = len(state.output_ids)
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index b5b11e562f5c..ba8d1d61beb8 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -367,7 +367,7 @@ class ServerArgs:
pp_max_micro_batch_size: Optional[int] = None
pp_async_batch_depth: int = 0
stream_interval: int = 1
- stream_output: bool = False
+ incremental_streaming_output: bool = False
enable_streaming_session: bool = False
random_seed: Optional[int] = None
constrained_json_whitespace_pattern: Optional[str] = None
@@ -3791,10 +3791,17 @@ def add_cli_args(parser: argparse.ArgumentParser):
help="The interval (or buffer size) for streaming in terms of the token length. A smaller value makes streaming smoother, while a larger value makes the throughput higher",
)
parser.add_argument(
- "--stream-output",
+ "--incremental-streaming-output",
action="store_true",
help="Whether to output as a sequence of disjoint segments.",
)
+ parser.add_argument(
+ "--stream-output",
+ action=DeprecatedStoreTrueAction,
+ dest="incremental_streaming_output",
+ new_flag="--incremental-streaming-output",
+ help="[Deprecated] Use --incremental-streaming-output instead.",
+ )
parser.add_argument(
"--enable-streaming-session",
action="store_true",
@@ -6284,6 +6291,32 @@ def __call__(self, parser, namespace, values, option_string=None):
)
+class DeprecatedStoreTrueAction(argparse.Action):
+ """Deprecated flag that still stores True and prints a warning."""
+
+ def __init__(
+ self,
+ option_strings,
+ dest,
+ new_flag=None,
+ nargs=0,
+ const=True,
+ default=False,
+ **kwargs,
+ ):
+ self.new_flag = new_flag
+ super().__init__(
+ option_strings, dest, nargs=nargs, const=const, default=default, **kwargs
+ )
+
+ def __call__(self, parser, namespace, values, option_string=None):
+ replacement = f" Use '{self.new_flag}' instead." if self.new_flag else ""
+ print_deprecated_warning(
+ f"'{option_string}' is deprecated and will be removed in a future release.{replacement}"
+ )
+ setattr(namespace, self.dest, True)
+
+
def auto_choose_speculative_params(self: ServerArgs):
"""
Automatically choose the parameters for speculative decoding.
diff --git a/test/manual/test_config_integration.py b/test/manual/test_config_integration.py
index 085315846248..a44bd979d3fc 100644
--- a/test/manual/test_config_integration.py
+++ b/test/manual/test_config_integration.py
@@ -31,7 +31,7 @@ def test_server_args_config_parser(merger):
"tensor-parallel-size": 2,
"trust-remote-code": False,
"enable-metrics": True,
- "stream-output": True,
+ "incremental-streaming-output": True,
"skip-server-warmup": False,
"log-requests": True,
"show-time-cost": True,
@@ -64,7 +64,7 @@ def test_server_args_config_parser(merger):
# Test boolean arguments
assert "--enable-metrics" in merged_args # True boolean
- assert "--stream-output" in merged_args # True boolean
+ assert "--incremental-streaming-output" in merged_args # True boolean
assert "--log-requests" in merged_args # True boolean
assert "--show-time-cost" in merged_args # True boolean
# False booleans should not be present (only add flag if True)
diff --git a/test/registered/tokenizer/test_skip_tokenizer_init.py b/test/registered/tokenizer/test_skip_tokenizer_init.py
index 7d95c19cf48f..852d8bd78cde 100644
--- a/test/registered/tokenizer/test_skip_tokenizer_init.py
+++ b/test/registered/tokenizer/test_skip_tokenizer_init.py
@@ -36,7 +36,7 @@ def setUpClass(cls):
cls.model,
cls.base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
- other_args=["--skip-tokenizer-init", "--stream-output"],
+ other_args=["--skip-tokenizer-init", "--incremental-streaming-output"],
)
cls.eos_token_id = [119690]
cls.tokenizer = AutoTokenizer.from_pretrained(