2 changes: 1 addition & 1 deletion vllm/lora/layers.py
@@ -1151,7 +1151,7 @@ def _get_logits(
lora_logits = lora_logits.mT
indices_padded = self.punica_wrapper.sampler_indices_padded

-        if current_platform.is_tpu():
+        if current_platform.is_tpu() or current_platform.is_xpu():
indices_padded = indices_padded[:logits.size(0)]

lora_logits = (lora_logits.reshape(
13 changes: 10 additions & 3 deletions vllm/lora/punica_wrapper/punica_xpu.py
@@ -225,6 +225,13 @@ def add_lora_linear(self,
add_inputs=True,
**kwargs)

+    @property
+    def sampler_indices_padded(self) -> torch.Tensor:
+        """
+        This property provides access to padded sampler indices.
+        """
+        return self._sampler_indices_padded[:]

def add_lora_logits(self,
y: torch.Tensor,
x: torch.Tensor,
@@ -259,11 +266,11 @@ def add_lora_logits(
buffer = torch.zeros((x.size(0), r),
dtype=torch.float32,
device=x.device)

-        bgmv_shrink(x, lora_a_stacked, buffer, self.sampler_indices, scale)
+        sampler_indices = torch.narrow(self._sampler_indices, 0, 0, x.size(0))
+        bgmv_shrink(x, lora_a_stacked, buffer, sampler_indices, scale)
bgmv_expand(buffer,
lora_b_stacked,
y,
-                    self.sampler_indices,
+                    sampler_indices,
add_inputs=True)
Comment on lines 269 to 275
Contributor

Severity: high

The change from `self.sampler_indices` to a sliced version of `self._sampler_indices` is a crucial correctness fix. The `bgmv_shrink` and `bgmv_expand` kernels expect the `lora_indices_tensor` to match the number of tokens in the input tensor `x`. The original implementation using `self.sampler_indices` could lead to a size mismatch, as its length is based on the number of sequence groups, not tokens. This change ensures the indices tensor has the correct size, making the implementation more robust and correct. It also aligns this logic with the TPU backend implementation.

return y.view_as(y_org)
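
To make the shape contract described in the review comment concrete, here is a minimal, self-contained sketch (not vLLM code; `pick_lora_indices` is a hypothetical helper) of narrowing a padded per-token index buffer to exactly one entry per input row, which is what the `bgmv_*` kernels require:

```python
# Minimal sketch (assumptions, not vLLM internals): the LoRA index tensor
# handed to the bgmv kernels must have one entry per row of the input x,
# so a longer (padded) buffer is narrowed down to x.size(0) before use.
import torch


def pick_lora_indices(x: torch.Tensor,
                      sampler_indices: torch.Tensor) -> torch.Tensor:
    """Return a view of `sampler_indices` with exactly x.size(0) entries."""
    assert sampler_indices.size(0) >= x.size(0), \
        "index buffer is shorter than the input batch"
    return torch.narrow(sampler_indices, 0, 0, x.size(0))


# Hypothetical shapes: 4 input rows, an index buffer padded to length 8.
x = torch.randn(4, 16)
padded_indices = torch.arange(8)
indices = pick_lora_indices(x, padded_indices)
assert indices.shape == (4,)  # one LoRA index per input row
```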
5 changes: 4 additions & 1 deletion vllm/platforms/xpu.py
@@ -91,7 +91,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
cache_config.block_size = 64

# lazy import to avoid circular import
-        from vllm.config import CUDAGraphMode
+        from vllm.config import CompilationLevel, CUDAGraphMode
compilation_config = vllm_config.compilation_config
if compilation_config.cudagraph_mode is None or \
compilation_config.cudagraph_mode.max_cudagraph_mode() \
@@ -100,6 +100,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
"cudagraphs. Fallback to cudagraph_mode=NONE")
compilation_config.cudagraph_mode = CUDAGraphMode.NONE

+        if vllm_config.lora_config is not None:
+            compilation_config.level = CompilationLevel.NO_COMPILATION

# check and update parallel config
parallel_config = vllm_config.parallel_config
parallel_config.worker_cls = "vllm.v1.worker.xpu_worker.XPUWorker"