diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
index 66250f816f45..a80768c33a51 100644
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -67,8 +67,9 @@ def forward_xpu(self, *args, **kwargs):
         return self.forward_native(*args, **kwargs)
 
     def forward_cpu(self, *args, **kwargs):
-        # By default, we assume that CPU ops are compatible with CUDA ops.
-        return self.forward_cuda(*args, **kwargs)
+        # By default, we assume that CPU ops are compatible with the
+        # PyTorch-native implementation.
+        return self.forward_native(*args, **kwargs)
 
     def forward_tpu(self, *args, **kwargs):
         # By default, we assume that TPU ops are compatible with the
diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py
index 7e83ea9a1355..bd82728ed15f 100644
--- a/vllm/model_executor/layers/rotary_embedding/base.py
+++ b/vllm/model_executor/layers/rotary_embedding/base.py
@@ -250,6 +250,28 @@ def forward_xpu(
         )
         return query, key
 
+    def forward_cpu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        from vllm import _custom_ops as ops
+
+        self._match_cos_sin_cache_dtype(query)
+
+        # ops.rotary_embedding() is an in-place operation
+        # that updates the query and key tensors.
+        ops.rotary_embedding(
+            positions,
+            query,
+            key,
+            self.head_size,
+            self.cos_sin_cache,
+            self.is_neox_style,
+        )
+        return query, key
+
     def extra_repr(self) -> str:
         s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
         s += f", max_position_embeddings={self.max_position_embeddings}"