Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .buildkite/image_build/image_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@ check_and_skip_if_image_exists() {
}

ecr_login() {
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com || true
}

prepare_cache_tags() {
Expand Down
2 changes: 1 addition & 1 deletion .buildkite/image_build/image_build_cpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ REPO=$2
BUILDKITE_COMMIT=$3

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true

# skip build if image already exists
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu) ]]; then
Expand Down
2 changes: 1 addition & 1 deletion .buildkite/image_build/image_build_cpu_arm64.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ REPO=$2
BUILDKITE_COMMIT=$3

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true

# skip build if image already exists
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
Expand Down
4 changes: 2 additions & 2 deletions docs/models/pooling_models/token_embed.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@
- Online APIs:
- Pooling API (`/pooling`)

The difference between the (sequence) embedding task and the token embedding task is that (sequence) embedding outputs one embedding for each sequence, while token embedding outputs a embedding for each token.
The difference between the (sequence) embedding task and the token embedding task is that (sequence) embedding outputs one embedding for each sequence, while token embedding outputs an embedding for each token.

Many embedding models support both (sequence) embedding and token embedding. For further details on (sequence) embedding, please refer to [this page](embed.md).

!!! note

Pooling multitask support is deprecated and will be removed in v0.20. When the default pooling task (embed) is not
what you want, you need to manually specify it via via `PoolerConfig(task="token_embed")` offline or
what you want, you need to manually specify it via `PoolerConfig(task="token_embed")` offline or
`--pooler-config.task token_embed` online.

## Typical Use Cases
Expand Down
85 changes: 85 additions & 0 deletions tests/kernels/moe/test_unquantized_backend_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@
)
from vllm.platforms import current_platform

# Shared skip marker: the unquantized-MoE backend selection paths under test
# only exist on the CUDA/ROCm kernel stacks.
skipif_not_cuda_rocm = pytest.mark.skipif(
    not (current_platform.is_cuda() or current_platform.is_rocm()),
    reason="Only supported on CUDA/ROCm platforms.",
)


@pytest.mark.parametrize(
"platform_method,expected_backend",
Expand Down Expand Up @@ -190,3 +195,83 @@ def test_select_cuda_flashinfer_cutlass_backend(

assert selected_backend == UnquantizedMoeBackend.FLASHINFER_CUTLASS
assert experts_cls is not None


@skipif_not_cuda_rocm
def test_select_lora_backend_prefers_triton():
    """With LoRA enabled, the unquantized MoE selector must pick Triton."""
    config = make_dummy_moe_config()
    config.is_lora_enabled = True

    backend, kernel_cls = select_unquantized_moe_backend(moe_config=config)

    # LoRA forces the Triton kernels regardless of other preferences.
    assert backend == UnquantizedMoeBackend.TRITON
    assert kernel_cls is not None


@skipif_not_cuda_rocm
def test_select_lora_explicit_non_triton_backend():
    """An explicit non-Triton backend is overridden to Triton when LoRA is on."""
    config = make_dummy_moe_config()
    config.is_lora_enabled = True
    # String key taken from the mapping in map_unquantized_backend().
    config.moe_backend = "flashinfer_cutlass"

    backend, kernel_cls = select_unquantized_moe_backend(moe_config=config)

    assert backend == UnquantizedMoeBackend.TRITON
    assert kernel_cls is not None


@skipif_not_cuda_rocm
@pytest.mark.parametrize("is_lora_enabled", [False, True])
def test_select_explicit_triton_backend(is_lora_enabled):
    """Explicitly requesting triton yields Triton, with or without LoRA."""
    config = make_dummy_moe_config()
    config.moe_backend = "triton"
    config.is_lora_enabled = is_lora_enabled

    backend, kernel_cls = select_unquantized_moe_backend(moe_config=config)

    assert backend == UnquantizedMoeBackend.TRITON
    assert kernel_cls is not None


@skipif_not_cuda_rocm
def test_select_explicit_triton_ignores_flashinfer_env(monkeypatch):
    """An explicit triton request wins over FlashInfer env-var toggles."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP16", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")

    config = make_dummy_moe_config()
    config.is_lora_enabled = False
    config.moe_backend = "triton"

    backend, kernel_cls = select_unquantized_moe_backend(moe_config=config)

    assert backend == UnquantizedMoeBackend.TRITON
    assert kernel_cls is not None


@skipif_not_cuda_rocm
def test_select_lora_ignores_flashinfer_env(monkeypatch):
    """The LoRA path still chooses Triton even with FlashInfer env vars set."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP16", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")

    config = make_dummy_moe_config()
    config.is_lora_enabled = True

    backend, kernel_cls = select_unquantized_moe_backend(moe_config=config)

    assert backend == UnquantizedMoeBackend.TRITON
    assert kernel_cls is not None
1 change: 1 addition & 0 deletions tests/models/fixtures/ministral_3b_chat.json

Large diffs are not rendered by default.

40 changes: 40 additions & 0 deletions tests/models/multimodal/generation/test_pixtral.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

PIXTRAL_ID = "mistralai/Pixtral-12B-2409"
MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
MINISTRAL_3B_ID = "mistralai/Ministral-3-3B-Instruct-2512"

MODELS = [PIXTRAL_ID, MISTRAL_SMALL_3_1_ID]

Expand Down Expand Up @@ -116,6 +117,7 @@ def _create_engine_inputs_hf(urls: list[str]) -> TextPrompt:
FIXTURE_LOGPROBS_CHAT = {
PIXTRAL_ID: FIXTURES_PATH / "pixtral_chat.json",
MISTRAL_SMALL_3_1_ID: FIXTURES_PATH / "mistral_small_3_chat.json",
MINISTRAL_3B_ID: FIXTURES_PATH / "ministral_3b_chat.json",
}

OutputsLogprobs = list[tuple[list[int], str, SampleLogprobs | None]]
Expand Down Expand Up @@ -209,3 +211,41 @@ def test_chat(
name_0="h100_ref",
name_1="output",
)


@large_gpu_test(min_gb=16)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_chat_consolidated(vllm_runner, dtype: str, local_asset_server) -> None:
    """Chat with the consolidated Ministral-3B checkpoint over one, two,
    and all test images, then compare logprobs against the recorded
    H100 reference fixture."""
    expected = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT[MINISTRAL_3B_ID])
    with vllm_runner(
        MINISTRAL_3B_ID,
        dtype=dtype,
        tokenizer_mode="mistral",
        load_format="mistral",
        config_format="mistral",
        max_model_len=8192,
        limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
    ) as vllm_model:
        served_urls = [local_asset_server.url_for(u) for u in IMG_URLS]
        messages = [
            _create_msg_format(served_urls[:1]),
            _create_msg_format(served_urls[:2]),
            _create_msg_format(served_urls),
        ]
        outputs = []
        for msg in messages:
            outputs.extend(
                vllm_model.llm.chat(msg, sampling_params=SAMPLING_PARAMS)
            )

    logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
    # The last entry of each result is a trailing None sentinel; drop it
    # before comparing against the fixture.
    for i, entry in enumerate(logprobs):
        assert entry[-1] is None
        logprobs[i] = entry[:-1]
    check_logprobs_close(
        outputs_0_lst=expected,
        outputs_1_lst=logprobs,
        name_0="h100_ref",
        name_1="output",
    )
116 changes: 116 additions & 0 deletions tests/tool_use/test_gemma4_responses_adjust_request.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Regression tests for Responses API tool-calling request adjustment.

Covers two bugs on the ``/v1/responses`` path that broke streaming tool
calling for parsers relying on special-token delimiters (Gemma4):

1. :class:`Gemma4ToolParser.adjust_request` used an
``isinstance(request, ChatCompletionRequest)`` guard, so a
:class:`ResponsesRequest` with tools never had
``skip_special_tokens`` flipped to ``False``. The default (``True``)
stripped ``<|tool_call>`` / ``<tool_call|>`` delimiters, causing
:meth:`Gemma4ToolParser.extract_tool_calls_streaming` to fall through
to the content branch and leak the raw ``call:fn{...}`` body via
``response.output_text.delta``.

2. :meth:`ToolParser.adjust_request` built
:class:`ResponseTextConfig` in two steps (bare constructor then
``.format = ...``). Under Pydantic v2 the later assignment is not
tracked in ``__fields_set__``, which can drop the nested config from
``model_dump``. It also passed a ``description`` kwarg carrying the
wrong-purpose string ``"Response format for tool calling"``.
"""

from __future__ import annotations

from typing import Any

from openai.types.responses.tool_param import FunctionToolParam

from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
from vllm.tool_parsers.abstract_tool_parser import ToolParser
from vllm.tool_parsers.gemma4_tool_parser import Gemma4ToolParser


def _get_weather_tool() -> FunctionToolParam:
    """Return a minimal strict weather-lookup tool for request fixtures."""
    schema = {
        "type": "object",
        "properties": {"city": {"type": "string"}},
        "required": ["city"],
    }
    return FunctionToolParam(
        type="function",
        name="get_weather",
        description="Get current weather for a city",
        parameters=schema,
        strict=True,
    )


def _build_responses_request(*, tool_choice: str) -> ResponsesRequest:
    """Create a streaming ResponsesRequest that carries the weather tool."""
    user_turn = {"role": "user", "content": "What is the weather in Hanoi?"}
    return ResponsesRequest(
        model="gemma4-test",
        input=[user_turn],
        tools=[_get_weather_tool()],
        tool_choice=tool_choice,
        stream=True,
        max_output_tokens=200,
    )


class _StubTokenizer:
    """Minimal tokenizer stub to satisfy ``Gemma4ToolParser.__init__``."""

    # Only the special tokens the parser looks up, with arbitrary ids.
    _VOCAB = {"<|tool_call>": 256_000, "<tool_call|>": 256_001, '<|"|>': 52}

    def get_vocab(self) -> dict[str, int]:
        # Return a fresh dict each call, matching a real tokenizer's API.
        return dict(self._VOCAB)


def test_gemma4_adjust_request_sets_skip_special_tokens_on_responses() -> None:
    """``Gemma4ToolParser.adjust_request`` must flip
    ``skip_special_tokens=False`` for ``ResponsesRequest`` just as it does
    for ``ChatCompletionRequest``, so the ``<|tool_call>`` delimiters reach
    the streaming extractor instead of being stripped. The previous
    ``isinstance(ChatCompletionRequest)`` guard skipped the Responses path
    and leaked raw ``call:fn{...}`` text via ``response.output_text.delta``.
    """
    # Bypass __init__; adjust_request only needs model_tokenizer set.
    parser = Gemma4ToolParser.__new__(Gemma4ToolParser)
    parser.model_tokenizer = _StubTokenizer()

    request = _build_responses_request(tool_choice="auto")
    assert request.skip_special_tokens is True, (
        "Precondition: ResponsesRequest.skip_special_tokens default is True"
    )

    Gemma4ToolParser.adjust_request(parser, request)

    assert request.skip_special_tokens is False


def test_tool_parser_adjust_request_builds_valid_response_text_config() -> None:
    """``ToolParser.adjust_request`` must emit a ``ResponseTextConfig``
    whose dumped form keeps the JSON schema under the ``schema`` alias and
    carries no stray ``"Response format for tool calling"`` description
    left over from the old two-step construction.
    """
    parser = ToolParser.__new__(ToolParser)
    parser.model_tokenizer = None

    request = _build_responses_request(tool_choice="required")
    ToolParser.adjust_request(parser, request)

    text_cfg = request.text
    assert text_cfg is not None
    assert text_cfg.format is not None
    assert text_cfg.format.type == "json_schema"

    dump: dict[str, Any] = text_cfg.model_dump(mode="json", by_alias=True)
    fmt = dump.get("format") or {}
    assert fmt.get("type") == "json_schema"
    assert fmt.get("name") == "tool_calling_response"
    assert fmt.get("strict") is True
    # The nested config must survive model_dump under its alias; two-step
    # Pydantic v2 construction could drop it from __fields_set__.
    assert "schema" in fmt and isinstance(fmt["schema"], dict)
    # The old code injected a wrong-purpose description string; a valid
    # config leaves it absent or None (the openai-python default).
    assert fmt.get("description") in (None, "")
2 changes: 1 addition & 1 deletion vllm/entrypoints/openai/responses/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ def convert_tool_responses_to_completions_format(tool: dict) -> dict:
def construct_tool_dicts(
tools: list[Tool], tool_choice: ToolChoice
) -> list[dict[str, Any]] | None:
if tools is None or (tool_choice == "none"):
if not tools or (tool_choice == "none"):
tool_dicts = None
else:
tool_dicts = [
Expand Down
39 changes: 36 additions & 3 deletions vllm/lora/ops/xpu_ops/lora_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,42 @@ def bgmv_expand(
lora_indices_tensor: torch.Tensor,
add_inputs: bool = True,
) -> None:
torch.ops._xpu_C.bgmv_expand(
output_tensor, inputs, lora_b_weights, lora_indices_tensor, add_inputs
)
weight_out_dim = lora_b_weights.size(-2)
output_dim = output_tensor.size(1)

if weight_out_dim == output_dim:
torch.ops._xpu_C.bgmv_expand(
output_tensor,
inputs,
lora_b_weights,
lora_indices_tensor,
add_inputs,
)
elif weight_out_dim < output_dim:
# LoRA weight output dim can be smaller than the output tensor
# (e.g. vocab_size vs padded logits). Use expand_slice to write
# only the matching portion, mirroring torch_ops common_len logic.
torch.ops._xpu_C.bgmv_expand_slice(
output_tensor,
inputs,
lora_b_weights,
lora_indices_tensor,
0,
weight_out_dim,
add_inputs,
)
else:
# Weight output dim larger than output tensor: truncate weights.
lora_b_weights = lora_b_weights[..., :output_dim, :].contiguous()
torch.ops._xpu_C.bgmv_expand_slice(
output_tensor,
inputs,
lora_b_weights,
lora_indices_tensor,
0,
output_dim,
add_inputs,
)


def bgmv_expand_slice(
Expand Down
5 changes: 5 additions & 0 deletions vllm/model_executor/layers/fused_moe/oracle/unquantized.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,11 @@ def select_unquantized_moe_backend(
if current_platform.is_out_of_tree():
return UnquantizedMoeBackend.OOT, None

if moe_config.is_lora_enabled:
return UnquantizedMoeBackend.TRITON, backend_to_kernel_cls(
UnquantizedMoeBackend.TRITON
)

# NOTE: the kernels are selected in the following order.
AVAILABLE_BACKENDS = _get_priority_backends(moe_config)

Expand Down
Loading
Loading