From d82063321d100b4db103a8dd2c7beedf4cbe0598 Mon Sep 17 00:00:00 2001 From: wangli Date: Tue, 24 Jun 2025 17:34:32 +0800 Subject: [PATCH 1/4] fix bug Signed-off-by: wangli --- vllm_ascend/worker/model_runner_v1.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 5cf6bb324af..15dab8146f0 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -274,6 +274,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): dtype=torch.int64, device="cpu", pin_memory=True) + self.mrope_positions_np = self.mrope_positions_cpu.numpy() if self.is_multimodal_model: self.inputs_embeds = torch.zeros( @@ -793,15 +794,13 @@ def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"): dst_start = mrope_pos_ptr dst_end = mrope_pos_ptr + completion_part_len - self.mrope_positions_cpu[:, dst_start:dst_end] = \ - MRotaryEmbedding.get_next_input_positions_tensor( - req.mrope_position_delta, - context_len=num_computed_tokens + - prompt_part_len, - seq_len=num_computed_tokens + - prompt_part_len + - completion_part_len, - ) + MRotaryEmbedding.get_next_input_positions_tensor( + out=self.mrope_positions_np, + out_offset=dst_start, + mrope_position_delta=req.mrope_position_delta, + context_len=num_computed_tokens + prompt_part_len, + num_new_tokens=completion_part_len, + ) mrope_pos_ptr += completion_part_len From bfabf6b6eeeb756f9368cd911ee2e735ba97c3d4 Mon Sep 17 00:00:00 2001 From: wangli Date: Tue, 24 Jun 2025 18:46:50 +0800 Subject: [PATCH 2/4] fix Signed-off-by: wangli --- vllm_ascend/worker/model_runner_v1.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 15dab8146f0..e381dcb5087 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -794,6 +794,17 @@ def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"): dst_start = mrope_pos_ptr dst_end = mrope_pos_ptr + completion_part_len + if vllm_version_is("0.9.1"): + self.mrope_positions_cpu[:, dst_start:dst_end] = \ + MRotaryEmbedding.get_next_input_positions_tensor( + req.mrope_position_delta, + context_len=num_computed_tokens + + prompt_part_len, + seq_len=num_computed_tokens + + prompt_part_len + + completion_part_len, + ) + else: MRotaryEmbedding.get_next_input_positions_tensor( out=self.mrope_positions_np, out_offset=dst_start, From 3651692d0320e981360a76e157b5d8ad5909b594 Mon Sep 17 00:00:00 2001 From: wangli Date: Tue, 24 Jun 2025 20:30:53 +0800 Subject: [PATCH 3/4] fix Signed-off-by: wangli --- vllm_ascend/worker/model_runner_v1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index e381dcb5087..7ff18a1a523 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -813,7 +813,7 @@ def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"): num_new_tokens=completion_part_len, ) - mrope_pos_ptr += completion_part_len + mrope_pos_ptr += completion_part_len def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"): scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs From 4349161fbdadb47d0ac40dafe34dbc1fd2ab633c Mon Sep 17 00:00:00 2001 From: wangli Date: Tue, 24 Jun 2025 20:38:22 +0800 Subject: [PATCH 4/4] fix Signed-off-by: wangli --- vllm_ascend/worker/model_runner_v1.py | 34 +++++++++++++-------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 7ff18a1a523..1ddc7bb1a4d 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -794,26 +794,26 @@ def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"): dst_start = mrope_pos_ptr dst_end = mrope_pos_ptr + completion_part_len - if vllm_version_is("0.9.1"): - self.mrope_positions_cpu[:, dst_start:dst_end] = \ + if vllm_version_is("0.9.1"): + self.mrope_positions_cpu[:, dst_start:dst_end] = \ + MRotaryEmbedding.get_next_input_positions_tensor( + req.mrope_position_delta, + context_len=num_computed_tokens + + prompt_part_len, + seq_len=num_computed_tokens + + prompt_part_len + + completion_part_len, + ) + else: MRotaryEmbedding.get_next_input_positions_tensor( - req.mrope_position_delta, - context_len=num_computed_tokens + - prompt_part_len, - seq_len=num_computed_tokens + - prompt_part_len + - completion_part_len, + out=self.mrope_positions_np, + out_offset=dst_start, + mrope_position_delta=req.mrope_position_delta, + context_len=num_computed_tokens + prompt_part_len, + num_new_tokens=completion_part_len, ) - else: - MRotaryEmbedding.get_next_input_positions_tensor( - out=self.mrope_positions_np, - out_offset=dst_start, - mrope_position_delta=req.mrope_position_delta, - context_len=num_computed_tokens + prompt_part_len, - num_new_tokens=completion_part_len, - ) - mrope_pos_ptr += completion_part_len + mrope_pos_ptr += completion_part_len def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"): scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs