From 297bb34ead169dc2566ad36486250554b242b9f8 Mon Sep 17 00:00:00 2001 From: hnyls2002 Date: Sat, 14 Mar 2026 23:13:41 -0700 Subject: [PATCH] rename & deprecate --- docs/advanced_features/server_arguments.md | 2 +- docs/platforms/ascend_npu_support_features.md | 2 +- python/sglang/srt/entrypoints/context.py | 4 +- .../sglang/srt/managers/tokenizer_manager.py | 4 +- python/sglang/srt/server_args.py | 37 ++++++++++++++++++- test/manual/test_config_integration.py | 4 +- .../tokenizer/test_skip_tokenizer_init.py | 2 +- 7 files changed, 44 insertions(+), 11 deletions(-) diff --git a/docs/advanced_features/server_arguments.md b/docs/advanced_features/server_arguments.md index 1317f04f5273..5b35c9215b73 100644 --- a/docs/advanced_features/server_arguments.md +++ b/docs/advanced_features/server_arguments.md @@ -158,7 +158,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s | `--pp-max-micro-batch-size` | The maximum micro batch size in pipeline parallelism. | `None` | Type: int | | `--pp-async-batch-depth` | The async batch depth of pipeline parallelism. | `0` | Type: int | | `--stream-interval` | The interval (or buffer size) for streaming in terms of the token length. A smaller value makes streaming smoother, while a larger value makes the throughput higher | `1` | Type: int | -| `--stream-output` | Whether to output as a sequence of disjoint segments. | `False` | bool flag (set to enable) | +| `--incremental-streaming-output` | Whether to output as a sequence of disjoint segments. | `False` | bool flag (set to enable) | | `--random-seed` | The random seed. | `None` | Type: int | | `--constrained-json-whitespace-pattern` | (outlines and llguidance backends only) Regex pattern for syntactic whitespaces allowed in JSON constrained output. 
For example, to allow the model to generate consecutive whitespaces, set the pattern to [\n\t ]* | `None` | Type: str | | `--constrained-json-disable-any-whitespace` | (xgrammar and llguidance backends only) Enforce compact representation in JSON constrained output. | `False` | bool flag (set to enable) | diff --git a/docs/platforms/ascend_npu_support_features.md b/docs/platforms/ascend_npu_support_features.md index 1749f525374b..0dec8a312407 100644 --- a/docs/platforms/ascend_npu_support_features.md +++ b/docs/platforms/ascend_npu_support_features.md @@ -84,7 +84,7 @@ click [Server Arguments](https://docs.sglang.io/advanced_features/server_argumen | `--pp-max-micro-batch-size` | `None` | Type: int | A2, A3 | | `--pp-async-batch-depth` | `None` | Type: int | A2, A3 | | `--stream-interval` | `1` | Type: int | A2, A3 | -| `--stream-output` | `False` | bool flag (set to enable) | A2, A3 | +| `--incremental-streaming-output` | `False` | bool flag (set to enable) | A2, A3 | | `--random-seed` | `None` | Type: int | A2, A3 | | `--constrained-json-`
`whitespace-pattern` | `None` | Type: str | A2, A3 | | `--constrained-json-`
`disable-any-whitespace` | `False` | bool flag (set to enable) | A2, A3 | diff --git a/python/sglang/srt/entrypoints/context.py b/python/sglang/srt/entrypoints/context.py index 083e75f17ebf..dd6af3f8980d 100644 --- a/python/sglang/srt/entrypoints/context.py +++ b/python/sglang/srt/entrypoints/context.py @@ -199,13 +199,13 @@ def append_output(self, output) -> None: completion_tokens is not None and len(output_token_ids) == completion_tokens ): - # Case 1: When --stream-output is not set. + # Case 1: When --incremental-streaming-output is not set. # The output_ids contains all tokens generated so far. # We only need to process the new tokens. new_token_ids = output_token_ids[self.num_processed_tokens :] self.num_processed_tokens = len(output_token_ids) else: - # Case 2: When --stream-output is set. + # Case 2: When --incremental-streaming-output is set. # The output_ids contains only the new tokens. new_token_ids = output_token_ids self.num_processed_tokens += len(output_token_ids) diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index e78869f6b055..0e7e41a7948d 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -1574,7 +1574,7 @@ def _handle_batch_output( state.text += recv_obj.output_strs[i] # Not all request types have `stream` (e.g., EmbeddingReqInput). Default to non-streaming. 
is_stream = getattr(state.obj, "stream", False) - if self.server_args.stream_output and is_stream: + if self.server_args.incremental_streaming_output and is_stream: state.output_ids.extend(recv_obj.output_ids[i]) output_token_ids = state.output_ids[state.last_output_offset :] state.last_output_offset = len(state.output_ids) @@ -1590,7 +1590,7 @@ def _handle_batch_output( elif isinstance(recv_obj, BatchTokenIDOutput): is_stream = getattr(state.obj, "stream", False) - if self.server_args.stream_output and is_stream: + if self.server_args.incremental_streaming_output and is_stream: state.output_ids.extend(recv_obj.output_ids[i]) output_token_ids = state.output_ids[state.last_output_offset :] state.last_output_offset = len(state.output_ids) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index b5b11e562f5c..ba8d1d61beb8 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -367,7 +367,7 @@ class ServerArgs: pp_max_micro_batch_size: Optional[int] = None pp_async_batch_depth: int = 0 stream_interval: int = 1 - stream_output: bool = False + incremental_streaming_output: bool = False enable_streaming_session: bool = False random_seed: Optional[int] = None constrained_json_whitespace_pattern: Optional[str] = None @@ -3791,10 +3791,17 @@ def add_cli_args(parser: argparse.ArgumentParser): help="The interval (or buffer size) for streaming in terms of the token length. 
A smaller value makes streaming smoother, while a larger value makes the throughput higher", ) parser.add_argument( - "--stream-output", + "--incremental-streaming-output", action="store_true", help="Whether to output as a sequence of disjoint segments.", ) + parser.add_argument( + "--stream-output", + action=DeprecatedStoreTrueAction, + dest="incremental_streaming_output", + new_flag="--incremental-streaming-output", + help="[Deprecated] Use --incremental-streaming-output instead.", + ) parser.add_argument( "--enable-streaming-session", action="store_true", @@ -6284,6 +6291,32 @@ def __call__(self, parser, namespace, values, option_string=None): ) +class DeprecatedStoreTrueAction(argparse.Action): + """Argparse action for a deprecated boolean flag that takes no value (``nargs=0``): when used, it prints a deprecation warning (naming ``new_flag`` as the replacement if one was given) and stores True in ``dest``.""" + + def __init__( + self, + option_strings, + dest, + new_flag=None, + nargs=0, + const=True, + default=False, + **kwargs, + ): + self.new_flag = new_flag + super().__init__( + option_strings, dest, nargs=nargs, const=const, default=default, **kwargs + ) + + def __call__(self, parser, namespace, values, option_string=None): + replacement = f" Use '{self.new_flag}' instead." if self.new_flag else "" + print_deprecated_warning( + f"'{option_string}' is deprecated and will be removed in a future release.{replacement}" + ) + setattr(namespace, self.dest, True) + + def auto_choose_speculative_params(self: ServerArgs): """ Automatically choose the parameters for speculative decoding. 
diff --git a/test/manual/test_config_integration.py b/test/manual/test_config_integration.py index 085315846248..a44bd979d3fc 100644 --- a/test/manual/test_config_integration.py +++ b/test/manual/test_config_integration.py @@ -31,7 +31,7 @@ def test_server_args_config_parser(merger): "tensor-parallel-size": 2, "trust-remote-code": False, "enable-metrics": True, - "stream-output": True, + "incremental-streaming-output": True, "skip-server-warmup": False, "log-requests": True, "show-time-cost": True, @@ -64,7 +64,7 @@ def test_server_args_config_parser(merger): # Test boolean arguments assert "--enable-metrics" in merged_args # True boolean - assert "--stream-output" in merged_args # True boolean + assert "--incremental-streaming-output" in merged_args # True boolean assert "--log-requests" in merged_args # True boolean assert "--show-time-cost" in merged_args # True boolean # False booleans should not be present (only add flag if True) diff --git a/test/registered/tokenizer/test_skip_tokenizer_init.py b/test/registered/tokenizer/test_skip_tokenizer_init.py index 7d95c19cf48f..852d8bd78cde 100644 --- a/test/registered/tokenizer/test_skip_tokenizer_init.py +++ b/test/registered/tokenizer/test_skip_tokenizer_init.py @@ -36,7 +36,7 @@ def setUpClass(cls): cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=["--skip-tokenizer-init", "--stream-output"], + other_args=["--skip-tokenizer-init", "--incremental-streaming-output"], ) cls.eos_token_id = [119690] cls.tokenizer = AutoTokenizer.from_pretrained(