diff --git a/vllm_ascend/patch/worker/patch_model_runner.py b/vllm_ascend/patch/worker/patch_model_runner.py index 986a1409ae3..9ca38e2590a 100644 --- a/vllm_ascend/patch/worker/patch_model_runner.py +++ b/vllm_ascend/patch/worker/patch_model_runner.py @@ -275,7 +275,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # request state. NOTE: The spec tokens are placeholders and not # added to token_ids_cpu. if self.is_kv_consumer and self.speculative_config and \ - self.speculative_config.method == "mtp" and self.use_async_scheduling: + self.use_async_scheduling: req_state = self.requests[request.req_id] spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get( request.req_id, []) diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 2a0b3e9a203..21746346c19 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -336,8 +336,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: # When open PD separation + MTP + Full Graph + asynchronous scheduling, # patch gpu_model_runner._update_states for solving # decode nodes may be in eager mode. - if vllm_config.speculative_config and vllm_config.kv_transfer_config and \ - vllm_config.speculative_config.method == "mtp": + if vllm_config.speculative_config and vllm_config.kv_transfer_config: import vllm_ascend.patch.worker.patch_model_runner # noqa # Extend original scheduler_config to use SchedulerDynamicBatch.