diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 93002012799a..fdd3326ff66c 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -159,6 +159,11 @@ class SchedulerConfig: structured outputs, speculative decoding, and pipeline parallelism. """ + include_finished_set: bool = False + """If set to True, a separate set of finished request ids will be included + in the EngineCoreOutputs returned by update_from_outputs(). + """ + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c058001ceb97..f3d8a907c0e1 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -380,6 +380,8 @@ class EngineArgs: disable_hybrid_kv_cache_manager: bool = ( SchedulerConfig.disable_hybrid_kv_cache_manager) + include_finished_set: bool = (SchedulerConfig.include_finished_set) + guided_decoding_backend: GuidedDecodingBackend = DecodingConfig.backend guided_decoding_disable_fallback: bool = DecodingConfig.disable_fallback guided_decoding_disable_any_whitespace: bool = \ @@ -814,6 +816,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: **scheduler_kwargs["disable_hybrid_kv_cache_manager"]) scheduler_group.add_argument("--async-scheduling", **scheduler_kwargs["async_scheduling"]) + scheduler_group.add_argument( + "--include-finished-set", + **scheduler_kwargs["include_finished_set"]) # vLLM arguments vllm_kwargs = get_kwargs(VllmConfig) @@ -1280,6 +1285,7 @@ def create_engine_config( disable_hybrid_kv_cache_manager=self. disable_hybrid_kv_cache_manager, async_scheduling=self.async_scheduling, + include_finished_set=self.include_finished_set, ) if not model_config.is_multimodal_model and self.default_mm_loras: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f92a3e43da1f..0359d73f2b27 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -121,7 +121,7 @@ def __init__(self, kv_cache_config=kv_cache_config, structured_output_manager=self.structured_output_manager, include_finished_set=vllm_config.parallel_config.data_parallel_size - > 1, + > 1 or vllm_config.scheduler_config.include_finished_set, log_stats=self.log_stats, )