From 303065140450ca21d63c2271b75de31f91ab44d8 Mon Sep 17 00:00:00 2001 From: 01267596 Date: Wed, 17 Dec 2025 09:28:55 +0000 Subject: [PATCH] [feat][mm] optimize encoder cache manager by operating with embedding only Signed-off-by: 01267596 --- vllm_ascend/core/recompute_scheduler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/core/recompute_scheduler.py b/vllm_ascend/core/recompute_scheduler.py index 48aa67a272e..356583a3edf 100644 --- a/vllm_ascend/core/recompute_scheduler.py +++ b/vllm_ascend/core/recompute_scheduler.py @@ -235,10 +235,10 @@ def schedule(self) -> SchedulerOutput: if preempted_encoder_inputs: # Restore encoder compute budget if the preempted # request had encoder inputs scheduled in this step. - num_tokens_to_restore = sum( - preempted_req.get_num_encoder_tokens(i) + num_embeds_to_restore = sum( + preempted_req.get_num_encoder_embeds(i) for i in preempted_encoder_inputs) - encoder_compute_budget += num_tokens_to_restore + encoder_compute_budget += num_embeds_to_restore req_index -= 1 else: preempted_req = self.running.pop()