Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 15 additions & 4 deletions vllm/v1/worker/gpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3103,16 +3103,27 @@ def execute_model(
# self._draft_token_ids is None when `input_fits_in_drafter=False`
# and there are no draft tokens scheduled, so the spec_decoding info
# in scheduler_output needs to be updated with async_scheduling.
# use deepcopy so that the modification does not affect the
# use selective copy so that the modification does not affect the
# scheduler_output in the engine core process.
# TODO(Ronald1995): deepcopy is expensive when there is a large
# number of requests, optimize it later.
# Optimized: instead of an expensive deepcopy, we shallow-copy the dataclass
# and copy (one level, via dict.copy()) only the dict fields that will be modified.
if (
self.use_async_scheduling
and self.num_spec_tokens
and self._draft_token_ids is None
):
scheduler_output = deepcopy(scheduler_output)
# Shallow copy the dataclass (copies all fields but not nested
# mutable objects)
scheduler_output = copy(scheduler_output)
# Copy (shallow, via dict.copy()) only the mutable dict fields that will be modified:
# - num_scheduled_tokens: modified via dict[key] -= value
# - scheduled_spec_decode_tokens: modified via dict.pop()
scheduler_output.num_scheduled_tokens = (
scheduler_output.num_scheduled_tokens.copy()
)
scheduler_output.scheduled_spec_decode_tokens = (
scheduler_output.scheduled_spec_decode_tokens.copy()
)

num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
with (
Expand Down