From 297bb34ead169dc2566ad36486250554b242b9f8 Mon Sep 17 00:00:00 2001
From: hnyls2002 <lsyincs@gmail.com>
Date: Sat, 14 Mar 2026 23:13:41 -0700
Subject: [PATCH] Rename --stream-output to --incremental-streaming-output; deprecate old flag

---
 docs/advanced_features/server_arguments.md    |  2 +-
 docs/platforms/ascend_npu_support_features.md |  2 +-
 python/sglang/srt/entrypoints/context.py      |  4 +-
 .../sglang/srt/managers/tokenizer_manager.py  |  4 +-
 python/sglang/srt/server_args.py              | 37 ++++++++++++++++++-
 test/manual/test_config_integration.py        |  4 +-
 .../tokenizer/test_skip_tokenizer_init.py     |  2 +-
 7 files changed, 44 insertions(+), 11 deletions(-)
diff --git a/docs/advanced_features/server_arguments.md b/docs/advanced_features/server_arguments.md
index 1317f04f5273..5b35c9215b73 100644
--- a/docs/advanced_features/server_arguments.md
+++ b/docs/advanced_features/server_arguments.md
@@ -158,7 +158,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s
 | `--pp-max-micro-batch-size` | The maximum micro batch size in pipeline parallelism. | `None` | Type: int |
 | `--pp-async-batch-depth` | The async batch depth of pipeline parallelism. | `0` | Type: int |
 | `--stream-interval` | The interval (or buffer size) for streaming in terms of the token length. A smaller value makes streaming smoother, while a larger value makes the throughput higher | `1` | Type: int |
-| `--stream-output` | Whether to output as a sequence of disjoint segments. | `False` | bool flag (set to enable) |
+| `--incremental-streaming-output` | Whether to output as a sequence of disjoint segments. | `False` | bool flag (set to enable) |
 | `--random-seed` | The random seed. | `None` | Type: int |
 | `--constrained-json-whitespace-pattern` | (outlines and llguidance backends only) Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model to generate consecutive whitespaces, set the pattern to [\n\t ]* | `None` | Type: str |
 | `--constrained-json-disable-any-whitespace` | (xgrammar and llguidance backends only) Enforce compact representation in JSON constrained output. | `False` | bool flag (set to enable) |
diff --git a/docs/platforms/ascend_npu_support_features.md b/docs/platforms/ascend_npu_support_features.md
index 1749f525374b..0dec8a312407 100644
--- a/docs/platforms/ascend_npu_support_features.md
+++ b/docs/platforms/ascend_npu_support_features.md
@@ -84,7 +84,7 @@ click [Server Arguments](https://docs.sglang.io/advanced_features/server_argumen
 | `--pp-max-micro-batch-size`                        | `None`   | Type: int                 |      A2, A3      |
 | `--pp-async-batch-depth`                           | `None`   | Type: int                 |      A2, A3      |
 | `--stream-interval`                                | `1`      | Type: int                 |      A2, A3      |
-| `--stream-output`                                  | `False`  | bool flag (set to enable) |      A2, A3      |
+| `--incremental-streaming-output`                   | `False`  | bool flag (set to enable) |      A2, A3      |
 | `--random-seed`                                    | `None`   | Type: int                 |      A2, A3      |
 | `--constrained-json-`<br/>`whitespace-pattern`     | `None`   | Type: str                 |      A2, A3      |
 | `--constrained-json-`<br/>`disable-any-whitespace` | `False`  | bool flag (set to enable) |      A2, A3      |
diff --git a/python/sglang/srt/entrypoints/context.py b/python/sglang/srt/entrypoints/context.py
index 083e75f17ebf..dd6af3f8980d 100644
--- a/python/sglang/srt/entrypoints/context.py
+++ b/python/sglang/srt/entrypoints/context.py
@@ -199,13 +199,13 @@ def append_output(self, output) -> None:
                 completion_tokens is not None
                 and len(output_token_ids) == completion_tokens
             ):
-                # Case 1: When --stream-output is not set.
+                # Case 1: When --incremental-streaming-output is not set.
                 # The output_ids contains all tokens generated so far.
                 # We only need to process the new tokens.
                 new_token_ids = output_token_ids[self.num_processed_tokens :]
                 self.num_processed_tokens = len(output_token_ids)
             else:
-                # Case 2: When --stream-output is set.
+                # Case 2: When --incremental-streaming-output is set.
                 # The output_ids contains only the new tokens.
                 new_token_ids = output_token_ids
                 self.num_processed_tokens += len(output_token_ids)
diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
index e78869f6b055..0e7e41a7948d 100644
--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -1574,7 +1574,7 @@ def _handle_batch_output(
                 state.text += recv_obj.output_strs[i]
                 # Not all request types have `stream` (e.g., EmbeddingReqInput). Default to non-streaming.
                 is_stream = getattr(state.obj, "stream", False)
-                if self.server_args.stream_output and is_stream:
+                if self.server_args.incremental_streaming_output and is_stream:
                     state.output_ids.extend(recv_obj.output_ids[i])
                     output_token_ids = state.output_ids[state.last_output_offset :]
                     state.last_output_offset = len(state.output_ids)
@@ -1590,7 +1590,7 @@ def _handle_batch_output(
 
             elif isinstance(recv_obj, BatchTokenIDOutput):
                 is_stream = getattr(state.obj, "stream", False)
-                if self.server_args.stream_output and is_stream:
+                if self.server_args.incremental_streaming_output and is_stream:
                     state.output_ids.extend(recv_obj.output_ids[i])
                     output_token_ids = state.output_ids[state.last_output_offset :]
                     state.last_output_offset = len(state.output_ids)
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index b5b11e562f5c..ba8d1d61beb8 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -367,7 +367,7 @@ class ServerArgs:
     pp_max_micro_batch_size: Optional[int] = None
     pp_async_batch_depth: int = 0
     stream_interval: int = 1
-    stream_output: bool = False
+    incremental_streaming_output: bool = False
     enable_streaming_session: bool = False
     random_seed: Optional[int] = None
     constrained_json_whitespace_pattern: Optional[str] = None
@@ -3791,10 +3791,17 @@ def add_cli_args(parser: argparse.ArgumentParser):
             help="The interval (or buffer size) for streaming in terms of the token length. A smaller value makes streaming smoother, while a larger value makes the throughput higher",
         )
         parser.add_argument(
-            "--stream-output",
+            "--incremental-streaming-output",
             action="store_true",
             help="Whether to output as a sequence of disjoint segments.",
         )
+        parser.add_argument(
+            "--stream-output",
+            action=DeprecatedStoreTrueAction,
+            dest="incremental_streaming_output",
+            new_flag="--incremental-streaming-output",
+            help="[Deprecated] Use --incremental-streaming-output instead.",
+        )
         parser.add_argument(
             "--enable-streaming-session",
             action="store_true",
@@ -6284,6 +6291,32 @@ def __call__(self, parser, namespace, values, option_string=None):
         )
 
 
+class DeprecatedStoreTrueAction(argparse.Action):
+    """Argparse action for a deprecated boolean flag: warn, then store True."""
+
+    def __init__(
+        self,
+        option_strings,
+        dest,
+        new_flag=None,
+        nargs=0,
+        const=True,
+        default=False,
+        **kwargs,
+    ):
+        # Forward the flag-style settings to argparse; keep the replacement name.
+        kwargs.update(nargs=nargs, const=const, default=default)
+        super().__init__(option_strings, dest, **kwargs)
+        self.new_flag = new_flag
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        message = f"'{option_string}' is deprecated and will be removed in a future release."
+        if self.new_flag:
+            message += f" Use '{self.new_flag}' instead."
+        print_deprecated_warning(message)
+        setattr(namespace, self.dest, True)
+
+
 def auto_choose_speculative_params(self: ServerArgs):
     """
     Automatically choose the parameters for speculative decoding.
diff --git a/test/manual/test_config_integration.py b/test/manual/test_config_integration.py
index 085315846248..a44bd979d3fc 100644
--- a/test/manual/test_config_integration.py
+++ b/test/manual/test_config_integration.py
@@ -31,7 +31,7 @@ def test_server_args_config_parser(merger):
         "tensor-parallel-size": 2,
         "trust-remote-code": False,
         "enable-metrics": True,
-        "stream-output": True,
+        "incremental-streaming-output": True,
         "skip-server-warmup": False,
         "log-requests": True,
         "show-time-cost": True,
@@ -64,7 +64,7 @@ def test_server_args_config_parser(merger):
 
         # Test boolean arguments
         assert "--enable-metrics" in merged_args  # True boolean
-        assert "--stream-output" in merged_args  # True boolean
+        assert "--incremental-streaming-output" in merged_args  # True boolean
         assert "--log-requests" in merged_args  # True boolean
         assert "--show-time-cost" in merged_args  # True boolean
         # False booleans should not be present (only add flag if True)
diff --git a/test/registered/tokenizer/test_skip_tokenizer_init.py b/test/registered/tokenizer/test_skip_tokenizer_init.py
index 7d95c19cf48f..852d8bd78cde 100644
--- a/test/registered/tokenizer/test_skip_tokenizer_init.py
+++ b/test/registered/tokenizer/test_skip_tokenizer_init.py
@@ -36,7 +36,7 @@ def setUpClass(cls):
             cls.model,
             cls.base_url,
             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=["--skip-tokenizer-init", "--stream-output"],
+            other_args=["--skip-tokenizer-init", "--incremental-streaming-output"],
         )
         cls.eos_token_id = [119690]
         cls.tokenizer = AutoTokenizer.from_pretrained(