diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 74006dfb7655..442484a10e0e 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3103,16 +3103,27 @@ def execute_model( # self._draft_token_ids is None when `input_fits_in_drafter=False` # and there is no draft tokens scheduled. so it need to update the # spec_decoding info in scheduler_output with async_scheduling. - # use deepcopy to avoid the modification has influence on the + # use a selective copy so that the modification does not affect the # scheduler_output in engine core process. - # TODO(Ronald1995): deepcopy is expensive when there is a large - # number of requests, optimize it later. + # Optimized: instead of an expensive deepcopy, shallow-copy the dataclass + # and re-copy (via dict.copy()) only the dict fields that will be modified. if ( self.use_async_scheduling and self.num_spec_tokens and self._draft_token_ids is None ): - scheduler_output = deepcopy(scheduler_output) + # Shallow copy the dataclass (copies the field references, not the + # nested mutable objects) + scheduler_output = copy(scheduler_output) + # Copy (one level, via dict.copy()) only the dict fields that will be modified: + # - num_scheduled_tokens: modified via dict[key] -= value + # - scheduled_spec_decode_tokens: modified via dict.pop() + scheduler_output.num_scheduled_tokens = ( + scheduler_output.num_scheduled_tokens.copy() + ) + scheduler_output.scheduled_spec_decode_tokens = ( + scheduler_output.scheduled_spec_decode_tokens.copy() + ) num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens with (