vllm-project · mgoin · May 7, 2025 · Nov 20, 2024 · Nov 20, 2024 · Nov 21, 2024
diff --git a/.buildkite/run-tpu-v1-test.sh b/.buildkite/run-tpu-v1-test.sh
@@ -37,6 +37,8 @@ docker run --privileged --net host --shm-size=16G -it \
     && pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \
     && echo TEST_7 \
     && pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" \
+    && echo TEST_8 \
+    && pytest -s -v /workspace/vllm/tests/tpu/test_lora.py" \
 
 
 # TODO: This test fails because it uses RANDOM_SEED sampling

@@ -70,7 +70,7 @@ def dist_init():
     temp_file = tempfile.mkstemp()[1]
 
     backend = "nccl"
-    if current_platform.is_cpu():
+    if current_platform.is_cpu() or current_platform.is_tpu():
         backend = "gloo"
 
     init_distributed_environment(world_size=1,

diff --git a/tests/lora/tpu/__init__.py b/tests/lora/tpu/__init__.py
diff --git a/tests/lora/tpu/test_pallas_kernels.py b/tests/lora/tpu/test_pallas_kernels.py
@@ -0,0 +1,102 @@
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+import torch
+
+# Required to register the custom ops
+import vllm.lora.ops.xla_ops.pallas  # noqa # pylint: disable=unused-import
+
+N_TOKENS = [
+    8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536,
+    131072
+]
+HIDDEN_SIZES = [128, 256, 512, 896, 1024, 2048, 4096, 8192, 8320]
+
+DTYPES = [torch.float16, torch.bfloat16]
+NUM_LORA = [1, 2, 4, 8, 16, 32]
+RANKS = [8, 16, 32, 64, 128]
+
+
+def generate_test_data(T, D, L, N, seed, dtype=torch.float32):
+    """
+    Create seeded random bgmv operands on the XLA device and their reference.
+
+    Inputs: (All integers)
+        T: Total number of tokens
+        D: Input dim
+        L: LoRA Dim
+        N: N LoRAs
+        seed: Value passed to torch.manual_seed
+        dtype: torch dtype of inputs and loras
+
+    Outputs:
+        inputs:     torch.Tensor - shape (T, D)
+        loras:      torch.Tensor - shape (N, 1, L, D)
+        idxs:       torch.Tensor - shape (T, ) - all values must be in [0, N)
+
+        ref_output: torch.Tensor - shape (T, L) - inputs @ loras[idxs].T
+    """
+    torch.manual_seed(seed)
+
+    inputs = torch.randn((T, D), device="xla", dtype=dtype)
+    loras = torch.randn((N, 1, L, D), device="xla", dtype=dtype)
+    idxs = torch.randint(0, N, (T, ), dtype=torch.int32, device="xla")
+
+    ref_output = ref_bgmv(inputs, loras, idxs)
+    return inputs, loras, idxs, ref_output
+
+
+def ref_bgmv(inputs: torch.Tensor, loras: torch.Tensor, idxs: torch.Tensor):
+    """
+    Reference bgmv: row t of the result is loras[idxs[t]] @ inputs[t].
+
+    The result is built one LoRA at a time. Gathering loras[idxs] would
+    materialize a (T, L, D) tensor, which for the largest shapes in this
+    file (T=131072, L=128, D=8320) is hundreds of GB in a 16-bit dtype.
+
+    Args:
+        inputs: shape (T, D).
+        loras:  shape (N, L, D) or (N, 1, L, D).
+        idxs:   shape (T, ), every value in [0, N).
+
+    Returns:
+        torch.Tensor of shape (T, L) with the dtype and device of inputs.
+    """
+    if loras.dim() == 4:
+        loras = loras.squeeze(dim=1)
+
+    n_loras, output_size, _ = loras.shape
+    output = torch.zeros((inputs.shape[0], output_size),
+                         dtype=inputs.dtype,
+                         device=inputs.device)
+    for lora_id in range(n_loras):
+        # Rows routed to another LoRA keep the value they already hold.
+        uses_lora = (idxs == lora_id).unsqueeze(1)
+        output = torch.where(uses_lora, inputs @ loras[lora_id].T, output)
+    return output
+
+
+# Parameterize tests with various shapes and dtypes
+@pytest.mark.parametrize("T", N_TOKENS)
+@pytest.mark.parametrize("D", HIDDEN_SIZES)
+@pytest.mark.parametrize("L", RANKS)
+@pytest.mark.parametrize("N", NUM_LORA)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("op_type", ["shrink", "expand"])
+@pytest.mark.parametrize("seed", [0])
+def test_bgmv_correctness(T, D, L, N, dtype, op_type, seed):
+    """Check torch.ops.xla.bgmv against ref_bgmv for one parameter set."""
+    # For "expand" the rank is the input dim and the hidden size the output.
+    if op_type == "expand":
+        D, L = L, D
+
+    inputs, loras, idxs, ref_output = generate_test_data(
+        T, D, L, N, seed, dtype)
+
+    # Run bgmv
+    output = torch.ops.xla.bgmv(inputs, loras, idxs)
+
+    # Make sure we have no NaNs
+    assert not torch.any(torch.isnan(output))
+
+    # Compare with reference output
+    assert torch.allclose(output, ref_output, rtol=1e-3, atol=1e-3)
diff --git a/tests/tpu/test_lora.py b/tests/tpu/test_lora.py
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: Apache-2.0
+import vllm
+from vllm.lora.request import LoRARequest
+
+
+def test_lora_hotswapping():
+    """
+    Serve four LoRA adapters in turn through an LLM built with max_loras=2.
+
+    The assertion requires the first character of each stripped completion
+    to be the id of the adapter used for that request.
+    """
+    repo_template = \
+        "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_{}_adapter"
+    adapters = []
+    for adapter_id in range(1, 5):
+        adapters.append(
+            LoRARequest(f"lora_adapter_{adapter_id}", adapter_id,
+                        repo_template.format(adapter_id)))
+
+    llm = vllm.LLM(model="Qwen/Qwen2.5-3B-Instruct",
+                   num_scheduler_steps=1,
+                   max_model_len=256,
+                   max_seq_len_to_capture=256,
+                   max_num_seqs=8,
+                   enable_lora=True,
+                   max_loras=2,
+                   max_lora_rank=8)
+
+    prompt = "What is 1+1? \n"
+
+    # Ten passes over all adapters, so each one is requested again after
+    # the others have been used.
+    for _ in range(10):
+        for expected, adapter in enumerate(adapters, start=1):
+            params = vllm.SamplingParams(max_tokens=256, temperature=0)
+            completions = llm.generate(prompt,
+                                       sampling_params=params,
+                                       lora_request=adapter)
+            text = completions[0].outputs[0].text
+            assert int(text.strip()[0]) == expected
diff --git a/vllm/config.py b/vllm/config.py
@@ -29,7 +29,7 @@
 from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
                                                      get_quantization_config)
 from vllm.model_executor.models import ModelRegistry
-from vllm.platforms import CpuArchEnum
+from vllm.platforms import CpuArchEnum, current_platform
 from vllm.sampling_params import GuidedDecodingParams
 from vllm.tracing import is_otel_available, otel_import_error_traceback
 from vllm.transformers_utils.config import (
@@ -2382,8 +2382,8 @@ class LoRAConfig:
     max_cpu_loras: Optional[int] = None
     lora_dtype: Optional[Union[torch.dtype, str]] = None
     lora_extra_vocab_size: int = 256
-    # This is a constant.
-    lora_vocab_padding_size: ClassVar[int] = 256
+    lora_vocab_padding_size: ClassVar[int] = current_platform\
+        .get_lora_vocab_padding_size()
     long_lora_scaling_factors: Optional[tuple[float]] = None
     bias_enabled: bool = False
 
@@ -2405,6 +2405,7 @@ def compute_hash(self) -> str:
         factors.append(self.fully_sharded_loras)
         factors.append(self.lora_dtype)
         factors.append(self.lora_extra_vocab_size)
+        factors.append(self.lora_vocab_padding_size)
         factors.append(self.long_lora_scaling_factors)
         factors.append(self.bias_enabled)
         hash_str = hashlib.md5(str(factors).encode(),

@@ -261,10 +261,17 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
                 full_lora_a_embeddings.shape[1],
                 -1,
             )
-        self.punica_wrapper.add_lora_embedding(full_output,
-                                               full_lora_a_embeddings,
-                                               self.lora_b_stacked,
-                                               add_input=True)
+
+        lora_output: Optional[
+            torch.Tensor] = self.punica_wrapper.add_lora_embedding(
+                full_output,
+                full_lora_a_embeddings,
+                self.lora_b_stacked,
+                add_input=True)
+
+        if not current_platform.can_update_inplace():
+            full_output = lora_output
+
         return full_output.view_as(full_output_org)
 
     @classmethod
@@ -410,10 +417,13 @@ def apply(self,
             output = output.flatten(0, 1)
             x = x.flatten(0, 1)
 
-        self.punica_wrapper.add_lora_linear(output, x, self.lora_a_stacked,
-                                            self.lora_b_stacked,
-                                            self.lora_bias_stacked, 1.0,
-                                            self.output_slices)
+        lora_output: Optional[
+            torch.Tensor] = self.punica_wrapper.add_lora_linear(
+                output, x, self.lora_a_stacked, self.lora_b_stacked,
+                self.lora_bias_stacked, 1.0, self.output_slices)
+        if not current_platform.can_update_inplace():
+            output = lora_output
+
         return output
 
     @property
@@ -1128,15 +1138,23 @@ def _get_logits(
         torch.matmul(self.embeddings_tensors,
                      hidden_states.T,
                      out=lora_logits[:-1])
-        lora_logits[-1] = float("-inf")
+
+        neg_inf, pos_inf = current_platform.get_infinity_values(
+            lora_logits.dtype)
+
+        lora_logits[-1] = neg_inf
         lora_logits = lora_logits.mT
         indices_padded = self.punica_wrapper.sampler_indices_padded
+
+        if current_platform.is_tpu():
+            indices_padded = indices_padded[:logits.size(0)]
+
         lora_logits = (lora_logits.reshape(
             lora_logits.shape[0] * lora_logits.shape[1],
             lora_logits.shape[2],
-        ).index_select(0, indices_padded).nan_to_num_(nan=float("-inf"),
-                                                      posinf=float("inf"),
-                                                      neginf=float("-inf")))
+        ).index_select(0, indices_padded).nan_to_num_(nan=neg_inf,
+                                                      posinf=pos_inf,
+                                                      neginf=neg_inf))
 
         # HPU needs special handling to prune out dummy samples.
         if current_platform.is_hpu():
@@ -1146,10 +1164,13 @@ def _get_logits(
                self.base_layer.org_vocab_size:self.base_layer.org_vocab_size +
                lora_logits.shape[1]] = lora_logits
 
-        # LogitsProcessorWithLoRA always using bgmv
-        self.punica_wrapper.add_lora_logits(logits, hidden_states,
-                                            self.lora_a_stacked,
-                                            self.lora_b_stacked, 1.0)
+        lora_output: Optional[
+            torch.Tensor] = self.punica_wrapper.add_lora_logits(
+                logits, hidden_states, self.lora_a_stacked,
+                self.lora_b_stacked, 1.0)
+
+        if not current_platform.can_update_inplace():
+            logits = lora_output
 
         # Remove paddings in vocab (if any).
         logits = logits[:, :self.base_layer.vocab_size]

@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from vllm.lora.ops.xla_ops.lora_ops import (bgmv_expand, bgmv_expand_slice,
+                                            bgmv_shrink)
+
+__all__ = ["bgmv_expand", "bgmv_expand_slice", "bgmv_shrink"]
@@ -0,0 +1,119 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+# Required to register the custom ops
+import vllm.lora.ops.xla_ops.pallas  # noqa # pylint: disable=unused-import
+
+
+def bgmv_expand(inputs: torch.Tensor,
+                lora_b_weights: torch.Tensor,
+                output_tensor: torch.Tensor,
+                lora_indices_tensor: torch.Tensor,
+                add_inputs: bool = True):
+    """
+    Run torch.ops.xla.bgmv and zero-pad its result to output_tensor's width.
+
+    output_tensor is never written; callers must use the return value.
+
+    Args:
+        inputs: First operand passed to torch.ops.xla.bgmv.
+        lora_b_weights: Weights passed to torch.ops.xla.bgmv.
+        output_tensor: Supplies the row limit and column count of the
+            result, and is added to it when add_inputs is True.
+        lora_indices_tensor: Indices passed to torch.ops.xla.bgmv.
+        add_inputs: Whether output_tensor is added to the padded result.
+
+    Returns:
+        The padded result cut to at most `limit` rows, plus output_tensor
+        when add_inputs is True.
+    """
+    outputs = torch.ops.xla.bgmv(inputs, lora_b_weights, lora_indices_tensor)
+    n_tokens = outputs.size(0)
+
+    # When the kernel returns one row but output_tensor has more, keep that
+    # one row; the addition below then broadcasts it.
+    limit = output_tensor.shape[0]
+    if outputs.shape[0] == 1 and output_tensor.shape[0] != 1:
+        limit = 1
+
+    # The padding must carry the kernel result's dtype: torch.zeros
+    # defaults to float32, and torch.cat would promote a 16-bit result.
+    right_pad = torch.zeros(
+        (n_tokens, output_tensor.shape[1] - outputs.shape[1]),
+        dtype=outputs.dtype,
+        device=outputs.device)
+    outputs = torch.cat((outputs, right_pad), dim=1)
+
+    if add_inputs:
+        return output_tensor + outputs[:limit, :]
+    else:
+        return outputs[:limit, :]
+
+
+def bgmv_shrink(inputs: torch.Tensor,
+                lora_b_weights: torch.Tensor,
+                output_tensor: torch.Tensor,
+                lora_indices_tensor: torch.Tensor,
+                scaling: float = 1.0):
+    """
+    Return the torch.ops.xla.bgmv result multiplied by scaling.
+
+    Args:
+        inputs: First operand passed to torch.ops.xla.bgmv.
+        lora_b_weights: Weights passed to torch.ops.xla.bgmv.
+        output_tensor: Not read by this function.
+        lora_indices_tensor: Indices passed to torch.ops.xla.bgmv.
+        scaling: Scalar multiplied into the kernel result.
+    """
+
+    return scaling * torch.ops.xla.bgmv(inputs, lora_b_weights,
+                                        lora_indices_tensor)
+
+
+def bgmv_expand_slice(inputs: torch.Tensor,
+                      lora_b_weights: torch.Tensor,
+                      output_tensor: torch.Tensor,
+                      lora_indices_tensor: torch.Tensor,
+                      slice_offset: int,
+                      slice_size: int,
+                      add_inputs: bool = True):
+    """
+    Run torch.ops.xla.bgmv and embed its result in a zero-padded row.
+
+    slice_offset zero columns go before the kernel result and
+    output_tensor.shape[1] - (slice_offset + slice_size) after it.
+    output_tensor is never written; callers must use the return value.
+
+    Args:
+        inputs: First operand passed to torch.ops.xla.bgmv.
+        lora_b_weights: Weights passed to torch.ops.xla.bgmv.
+        output_tensor: Supplies the total column count, and is added to
+            the padded result when add_inputs is True.
+        lora_indices_tensor: Indices passed to torch.ops.xla.bgmv.
+        slice_offset: Number of zero columns placed before the result.
+        slice_size: Width reserved for the result. NOTE(review):
+            presumably the kernel result's column count - confirm.
+        add_inputs: Whether output_tensor is added to the padded result.
+
+    Returns:
+        The padded result, plus output_tensor when add_inputs is True.
+    """
+    outputs = torch.ops.xla.bgmv(inputs, lora_b_weights, lora_indices_tensor)
+    n_tokens = outputs.size(0)
+
+    # Both pads carry the kernel result's dtype: torch.zeros defaults to
+    # float32, and torch.cat would promote a 16-bit result.
+    left_pad = torch.zeros((n_tokens, slice_offset),
+                           dtype=outputs.dtype,
+                           device=outputs.device)
+    right_pad = torch.zeros(
+        (n_tokens, output_tensor.shape[1] - (slice_offset + slice_size)),
+        dtype=outputs.dtype,
+        device=outputs.device)
+    outputs = torch.cat((left_pad, outputs, right_pad), dim=1)
+
+    if add_inputs:
+        return output_tensor + outputs
+    else:
+        return outputs