Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .buildkite/image_build/image_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@ check_and_skip_if_image_exists() {
}

ecr_login() {
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com || true
}

prepare_cache_tags() {
Expand Down
2 changes: 1 addition & 1 deletion .buildkite/image_build/image_build_cpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ REPO=$2
BUILDKITE_COMMIT=$3

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true

# skip build if image already exists
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu) ]]; then
Expand Down
2 changes: 1 addition & 1 deletion .buildkite/image_build/image_build_cpu_arm64.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ REPO=$2
BUILDKITE_COMMIT=$3

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true

# skip build if image already exists
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
Expand Down
4 changes: 2 additions & 2 deletions docs/models/pooling_models/token_embed.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@
- Online APIs:
- Pooling API (`/pooling`)

The difference between the (sequence) embedding task and the token embedding task is that (sequence) embedding outputs one embedding for each sequence, while token embedding outputs a embedding for each token.
The difference between the (sequence) embedding task and the token embedding task is that (sequence) embedding outputs one embedding for each sequence, while token embedding outputs an embedding for each token.

Many embedding models support both (sequence) embedding and token embedding. For further details on (sequence) embedding, please refer to [this page](embed.md).

!!! note

Pooling multitask support is deprecated and will be removed in v0.20. When the default pooling task (embed) is not
what you want, you need to manually specify it via via `PoolerConfig(task="token_embed")` offline or
what you want, you need to manually specify it via `PoolerConfig(task="token_embed")` offline or
`--pooler-config.task token_embed` online.

## Typical Use Cases
Expand Down
85 changes: 85 additions & 0 deletions tests/kernels/moe/test_unquantized_backend_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@
)
from vllm.platforms import current_platform

# Shared skip marker: the unquantized-MoE backend selection paths under test
# only exist on the CUDA/ROCm kernel stacks.
skipif_not_cuda_rocm = pytest.mark.skipif(
    not (current_platform.is_cuda() or current_platform.is_rocm()),
    reason="Only supported on CUDA/ROCm platforms.",
)


@pytest.mark.parametrize(
"platform_method,expected_backend",
Expand Down Expand Up @@ -190,3 +195,83 @@ def test_select_cuda_flashinfer_cutlass_backend(

assert selected_backend == UnquantizedMoeBackend.FLASHINFER_CUTLASS
assert experts_cls is not None


@skipif_not_cuda_rocm
def test_select_lora_backend_prefers_triton():
    """With LoRA enabled, the unquantized MoE selector must pick Triton."""
    config = make_dummy_moe_config()
    config.is_lora_enabled = True

    backend, kernel_cls = select_unquantized_moe_backend(moe_config=config)

    # LoRA forces the Triton kernels regardless of other preferences.
    assert backend == UnquantizedMoeBackend.TRITON
    assert kernel_cls is not None


@skipif_not_cuda_rocm
def test_select_lora_explicit_non_triton_backend():
    """An explicit non-Triton backend is overridden to Triton when LoRA is on."""
    config = make_dummy_moe_config()
    config.is_lora_enabled = True
    # String key taken from the mapping in map_unquantized_backend().
    config.moe_backend = "flashinfer_cutlass"

    backend, kernel_cls = select_unquantized_moe_backend(moe_config=config)

    assert backend == UnquantizedMoeBackend.TRITON
    assert kernel_cls is not None


@skipif_not_cuda_rocm
@pytest.mark.parametrize("is_lora_enabled", [False, True])
def test_select_explicit_triton_backend(is_lora_enabled):
    """Explicitly requesting triton yields Triton, with or without LoRA."""
    config = make_dummy_moe_config()
    config.moe_backend = "triton"
    config.is_lora_enabled = is_lora_enabled

    backend, kernel_cls = select_unquantized_moe_backend(moe_config=config)

    assert backend == UnquantizedMoeBackend.TRITON
    assert kernel_cls is not None


@skipif_not_cuda_rocm
def test_select_explicit_triton_ignores_flashinfer_env(monkeypatch):
    """An explicit triton request wins over FlashInfer env-var toggles."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP16", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")

    config = make_dummy_moe_config()
    config.is_lora_enabled = False
    config.moe_backend = "triton"

    backend, kernel_cls = select_unquantized_moe_backend(moe_config=config)

    assert backend == UnquantizedMoeBackend.TRITON
    assert kernel_cls is not None


@skipif_not_cuda_rocm
def test_select_lora_ignores_flashinfer_env(monkeypatch):
    """The LoRA path still chooses Triton even with FlashInfer env vars set."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP16", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")

    config = make_dummy_moe_config()
    config.is_lora_enabled = True

    backend, kernel_cls = select_unquantized_moe_backend(moe_config=config)

    assert backend == UnquantizedMoeBackend.TRITON
    assert kernel_cls is not None
1 change: 1 addition & 0 deletions tests/models/fixtures/ministral_3b_chat.json

Large diffs are not rendered by default.

40 changes: 40 additions & 0 deletions tests/models/multimodal/generation/test_pixtral.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

PIXTRAL_ID = "mistralai/Pixtral-12B-2409"
MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
MINISTRAL_3B_ID = "mistralai/Ministral-3-3B-Instruct-2512"

MODELS = [PIXTRAL_ID, MISTRAL_SMALL_3_1_ID]

Expand Down Expand Up @@ -116,6 +117,7 @@ def _create_engine_inputs_hf(urls: list[str]) -> TextPrompt:
FIXTURE_LOGPROBS_CHAT = {
PIXTRAL_ID: FIXTURES_PATH / "pixtral_chat.json",
MISTRAL_SMALL_3_1_ID: FIXTURES_PATH / "mistral_small_3_chat.json",
MINISTRAL_3B_ID: FIXTURES_PATH / "ministral_3b_chat.json",
}

OutputsLogprobs = list[tuple[list[int], str, SampleLogprobs | None]]
Expand Down Expand Up @@ -209,3 +211,41 @@ def test_chat(
name_0="h100_ref",
name_1="output",
)


@large_gpu_test(min_gb=16)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_chat_consolidated(vllm_runner, dtype: str, local_asset_server) -> None:
    """Chat with the consolidated Ministral-3B checkpoint over one, two,
    and all test images, then compare logprobs against the recorded
    H100 reference fixture."""
    expected = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT[MINISTRAL_3B_ID])
    with vllm_runner(
        MINISTRAL_3B_ID,
        dtype=dtype,
        tokenizer_mode="mistral",
        load_format="mistral",
        config_format="mistral",
        max_model_len=8192,
        limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
    ) as vllm_model:
        served_urls = [local_asset_server.url_for(u) for u in IMG_URLS]
        messages = [
            _create_msg_format(served_urls[:1]),
            _create_msg_format(served_urls[:2]),
            _create_msg_format(served_urls),
        ]
        outputs = []
        for msg in messages:
            outputs.extend(
                vllm_model.llm.chat(msg, sampling_params=SAMPLING_PARAMS)
            )

    logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
    # The last entry of each result is a trailing None sentinel; drop it
    # before comparing against the fixture.
    for i, entry in enumerate(logprobs):
        assert entry[-1] is None
        logprobs[i] = entry[:-1]
    check_logprobs_close(
        outputs_0_lst=expected,
        outputs_1_lst=logprobs,
        name_0="h100_ref",
        name_1="output",
    )
116 changes: 116 additions & 0 deletions tests/tool_use/test_gemma4_responses_adjust_request.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Regression tests for Responses API tool-calling request adjustment.

Covers two bugs on the ``/v1/responses`` path that broke streaming tool
calling for parsers relying on special-token delimiters (Gemma4):

1. :class:`Gemma4ToolParser.adjust_request` used an
``isinstance(request, ChatCompletionRequest)`` guard, so a
:class:`ResponsesRequest` with tools never had
``skip_special_tokens`` flipped to ``False``. The default (``True``)
stripped ``<|tool_call>`` / ``<tool_call|>`` delimiters, causing
:meth:`Gemma4ToolParser.extract_tool_calls_streaming` to fall through
to the content branch and leak the raw ``call:fn{...}`` body via
``response.output_text.delta``.

2. :meth:`ToolParser.adjust_request` built
:class:`ResponseTextConfig` in two steps (bare constructor then
``.format = ...``). Under Pydantic v2 the later assignment is not
tracked in ``__fields_set__``, which can drop the nested config from
``model_dump``. It also passed a ``description`` kwarg carrying the
wrong-purpose string ``"Response format for tool calling"``.
"""

from __future__ import annotations

from typing import Any

from openai.types.responses.tool_param import FunctionToolParam

from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
from vllm.tool_parsers.abstract_tool_parser import ToolParser
from vllm.tool_parsers.gemma4_tool_parser import Gemma4ToolParser


def _get_weather_tool() -> FunctionToolParam:
    """Return a minimal strict weather-lookup tool for request fixtures."""
    schema = {
        "type": "object",
        "properties": {"city": {"type": "string"}},
        "required": ["city"],
    }
    return FunctionToolParam(
        type="function",
        name="get_weather",
        description="Get current weather for a city",
        parameters=schema,
        strict=True,
    )


def _build_responses_request(*, tool_choice: str) -> ResponsesRequest:
    """Create a streaming ResponsesRequest that carries the weather tool."""
    user_turn = {"role": "user", "content": "What is the weather in Hanoi?"}
    return ResponsesRequest(
        model="gemma4-test",
        input=[user_turn],
        tools=[_get_weather_tool()],
        tool_choice=tool_choice,
        stream=True,
        max_output_tokens=200,
    )


class _StubTokenizer:
    """Minimal tokenizer stub to satisfy ``Gemma4ToolParser.__init__``."""

    # Only the special tokens the parser looks up, with arbitrary ids.
    _VOCAB = {"<|tool_call>": 256_000, "<tool_call|>": 256_001, '<|"|>': 52}

    def get_vocab(self) -> dict[str, int]:
        # Return a fresh dict each call, matching a real tokenizer's API.
        return dict(self._VOCAB)


def test_gemma4_adjust_request_sets_skip_special_tokens_on_responses() -> None:
    """``Gemma4ToolParser.adjust_request`` must flip
    ``skip_special_tokens=False`` for ``ResponsesRequest`` just as it does
    for ``ChatCompletionRequest``, so the ``<|tool_call>`` delimiters reach
    the streaming extractor instead of being stripped. The previous
    ``isinstance(ChatCompletionRequest)`` guard skipped the Responses path
    and leaked raw ``call:fn{...}`` text via ``response.output_text.delta``.
    """
    # Bypass __init__; adjust_request only needs model_tokenizer set.
    parser = Gemma4ToolParser.__new__(Gemma4ToolParser)
    parser.model_tokenizer = _StubTokenizer()

    request = _build_responses_request(tool_choice="auto")
    assert request.skip_special_tokens is True, (
        "Precondition: ResponsesRequest.skip_special_tokens default is True"
    )

    Gemma4ToolParser.adjust_request(parser, request)

    assert request.skip_special_tokens is False


def test_tool_parser_adjust_request_builds_valid_response_text_config() -> None:
    """``ToolParser.adjust_request`` must emit a ``ResponseTextConfig``
    whose dumped form keeps the JSON schema under the ``schema`` alias and
    carries no stray ``"Response format for tool calling"`` description
    left over from the old two-step construction.
    """
    parser = ToolParser.__new__(ToolParser)
    parser.model_tokenizer = None

    request = _build_responses_request(tool_choice="required")
    ToolParser.adjust_request(parser, request)

    text_cfg = request.text
    assert text_cfg is not None
    assert text_cfg.format is not None
    assert text_cfg.format.type == "json_schema"

    dump: dict[str, Any] = text_cfg.model_dump(mode="json", by_alias=True)
    fmt = dump.get("format") or {}
    assert fmt.get("type") == "json_schema"
    assert fmt.get("name") == "tool_calling_response"
    assert fmt.get("strict") is True
    # The nested config must survive model_dump under its alias; two-step
    # Pydantic v2 construction could drop it from __fields_set__.
    assert "schema" in fmt and isinstance(fmt["schema"], dict)
    # The old code injected a wrong-purpose description string; a valid
    # config leaves it absent or None (the openai-python default).
    assert fmt.get("description") in (None, "")
2 changes: 1 addition & 1 deletion vllm/entrypoints/openai/responses/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ def convert_tool_responses_to_completions_format(tool: dict) -> dict:
def construct_tool_dicts(
tools: list[Tool], tool_choice: ToolChoice
) -> list[dict[str, Any]] | None:
if tools is None or (tool_choice == "none"):
if not tools or (tool_choice == "none"):
tool_dicts = None
else:
tool_dicts = [
Expand Down
39 changes: 36 additions & 3 deletions vllm/lora/ops/xpu_ops/lora_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,42 @@ def bgmv_expand(
lora_indices_tensor: torch.Tensor,
add_inputs: bool = True,
) -> None:
torch.ops._xpu_C.bgmv_expand(
output_tensor, inputs, lora_b_weights, lora_indices_tensor, add_inputs
)
weight_out_dim = lora_b_weights.size(-2)
output_dim = output_tensor.size(1)

if weight_out_dim == output_dim:
torch.ops._xpu_C.bgmv_expand(
output_tensor,
inputs,
lora_b_weights,
lora_indices_tensor,
add_inputs,
)
elif weight_out_dim < output_dim:
# LoRA weight output dim can be smaller than the output tensor
# (e.g. vocab_size vs padded logits). Use expand_slice to write
# only the matching portion, mirroring torch_ops common_len logic.
torch.ops._xpu_C.bgmv_expand_slice(
output_tensor,
inputs,
lora_b_weights,
lora_indices_tensor,
0,
weight_out_dim,
add_inputs,
)
else:
# Weight output dim larger than output tensor: truncate weights.
lora_b_weights = lora_b_weights[..., :output_dim, :].contiguous()
torch.ops._xpu_C.bgmv_expand_slice(
output_tensor,
inputs,
lora_b_weights,
lora_indices_tensor,
0,
output_dim,
add_inputs,
)


def bgmv_expand_slice(
Expand Down
5 changes: 5 additions & 0 deletions vllm/model_executor/layers/fused_moe/oracle/unquantized.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,11 @@ def select_unquantized_moe_backend(
if current_platform.is_out_of_tree():
return UnquantizedMoeBackend.OOT, None

if moe_config.is_lora_enabled:
return UnquantizedMoeBackend.TRITON, backend_to_kernel_cls(
UnquantizedMoeBackend.TRITON
)

# NOTE: the kernels are selected in the following order.
AVAILABLE_BACKENDS = _get_priority_backends(moe_config)

Expand Down
Loading
Loading