From b5adae64d4cf94f650d1606566bcc20315ea8586 Mon Sep 17 00:00:00 2001 From: qizixi Date: Fri, 27 Jun 2025 09:00:01 -0700 Subject: [PATCH 01/63] run eagle with full cudagraph Signed-off-by: qizixi --- examples/offline_inference/eagle.py | 4 ++++ vllm/v1/spec_decode/eagle.py | 24 +++++++++++++++++++++++- vllm/v1/worker/gpu_model_runner.py | 4 ++-- 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/examples/offline_inference/eagle.py b/examples/offline_inference/eagle.py index f4193fdb8bd3..e25b27bf4879 100644 --- a/examples/offline_inference/eagle.py +++ b/examples/offline_inference/eagle.py @@ -48,6 +48,7 @@ def parse_args(): parser.add_argument("--enable_chunked_prefill", action="store_true") parser.add_argument("--max_num_batched_tokens", type=int, default=2048) parser.add_argument("--temp", type=float, default=0) + parser.add_argument("--compilation_config", type=str, default="") return parser.parse_args() @@ -94,6 +95,9 @@ def main(): "max_model_len": max_model_len, }, disable_log_stats=False, + compilation_config=( + json.loads(args.compilation_config) if args.compilation_config else None + ), ) sampling_params = SamplingParams(temperature=args.temp, max_tokens=256) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 153b67fe5714..630f381fed49 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any, Optional + import torch import torch.nn as nn @@ -74,6 +76,7 @@ def __init__( 1, device=device, dtype=torch.int32) + self.draft_attn_metadata = None def propose( self, @@ -169,6 +172,13 @@ def propose( self.positions[:num_tokens] = target_positions self.hidden_states[:num_tokens] = target_hidden_states + # copy attention metadata for full cudagraph mode + if self.draft_attn_metadata is not None and num_tokens <= self.cudagraph_batch_sizes[-1]: + self.draft_attn_metadata.seq_lens[:attn_metadata.seq_lens.shape[0]].copy_(attn_metadata.seq_lens.clone()) + self.draft_attn_metadata.slot_mapping[:attn_metadata.slot_mapping.shape[0]].copy_(attn_metadata.slot_mapping.clone()) + self.draft_attn_metadata.query_start_loc[:attn_metadata.query_start_loc.shape[0]].copy_(attn_metadata.query_start_loc.clone()) + self.draft_attn_metadata.block_table[:attn_metadata.block_table.shape[0]].copy_(attn_metadata.block_table.clone()) + with set_forward_context(per_layer_attn_metadata, self.vllm_config, num_tokens=num_input_tokens): @@ -254,6 +264,13 @@ def propose( self.positions[:batch_size] = clamped_positions self.hidden_states[:batch_size] = hidden_states + # copy attention metadata for full cudagraph mode + if self.draft_attn_metadata is not None: + self.draft_attn_metadata.seq_lens[:attn_metadata.seq_lens.shape[0]].copy_(attn_metadata.seq_lens.clone()) + self.draft_attn_metadata.slot_mapping[:attn_metadata.slot_mapping.shape[0]].copy_(attn_metadata.slot_mapping.clone()) + self.draft_attn_metadata.query_start_loc[:attn_metadata.query_start_loc.shape[0]].copy_(attn_metadata.query_start_loc.clone()) + self.draft_attn_metadata.block_table[:attn_metadata.block_table.shape[0]].copy_(attn_metadata.block_table.clone()) + # Run the model. 
with set_forward_context(per_layer_attn_metadata, self.vllm_config, @@ -369,8 +386,13 @@ def load_model(self, target_model: nn.Module) -> None: def dummy_run( self, num_tokens: int, + attn_metadata: Optional[dict[str, Any]], ) -> None: - with set_forward_context(None, self.vllm_config, + if attn_metadata is not None and self.draft_attn_metadata is None: + attn_metadata[self.attn_layer_names[0]].scheduler_metadata = None + self.draft_attn_metadata = attn_metadata[self.attn_layer_names[0]] # assume only one draft layer + with set_forward_context(attn_metadata, + self.vllm_config, num_tokens=num_tokens): self.model( self.input_ids[:num_tokens], diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 520d8fb186f4..300919a5c226 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1860,7 +1860,7 @@ def maybe_randomize_inputs(self, input_ids: torch.Tensor): Randomize input_ids if VLLM_RANDOMIZE_DP_DUMMY_INPUTS is set. This is to help balance expert-selection - during profile_run - - during DP rank dummy run + - during DP rank dummy run """ dp_size = self.vllm_config.parallel_config.data_parallel_size randomize_inputs = envs.VLLM_RANDOMIZE_DP_DUMMY_INPUTS and dp_size > 1 @@ -1982,7 +1982,7 @@ def _dummy_run( if self.speculative_config and self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) - self.drafter.dummy_run(num_tokens) + self.drafter.dummy_run(num_tokens, attn_metadata) logit_indices = np.cumsum(num_scheduled_tokens) - 1 return hidden_states, hidden_states[logit_indices] From 53223d5326afa1c809933328d6f49e282db2da55 Mon Sep 17 00:00:00 2001 From: qizixi Date: Fri, 27 Jun 2025 09:00:01 -0700 Subject: [PATCH 02/63] run eagle with full cudagraph Signed-off-by: qizixi --- vllm/v1/worker/gpu_model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4c14ac3be3c0..d98bcf06cc70 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1996,7 +1996,7 @@ def maybe_randomize_inputs(self, input_ids: torch.Tensor): Randomize input_ids if VLLM_RANDOMIZE_DP_DUMMY_INPUTS is set. This is to help balance expert-selection - during profile_run - - during DP rank dummy run + - during DP rank dummy run """ dp_size = self.vllm_config.parallel_config.data_parallel_size randomize_inputs = envs.VLLM_RANDOMIZE_DP_DUMMY_INPUTS and dp_size > 1 @@ -2125,7 +2125,7 @@ def _dummy_run( if self.speculative_config and self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) - self.drafter.dummy_run(num_tokens) + self.drafter.dummy_run(num_tokens, attn_metadata) # This is necessary to avoid blocking DP. 
# For dummy runs, we typically skip EPLB since we don't have any real From c38e003e543a479e82847424e8ffc9b9311790bc Mon Sep 17 00:00:00 2001 From: qizixi Date: Tue, 22 Jul 2025 21:44:26 -0700 Subject: [PATCH 03/63] rebase and add unit test Signed-off-by: qizixi --- examples/offline_inference/spec_decode.py | 6 +++ tests/v1/e2e/test_spec_decode.py | 5 ++ vllm/v1/spec_decode/eagle.py | 58 ++++++++++++++--------- 3 files changed, 46 insertions(+), 23 deletions(-) diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index ce735f3b27df..3f9d59fecb47 100644 --- a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import json + from transformers import AutoTokenizer from vllm import LLM, SamplingParams @@ -35,6 +37,7 @@ def parse_args(): parser.add_argument("--output-len", type=int, default=256) parser.add_argument("--model-dir", type=str, default=None) parser.add_argument("--eagle-dir", type=str, default=None) + parser.add_argument("--compilation-config", type=str, default="") return parser.parse_args() @@ -85,6 +88,9 @@ def main(): speculative_config=speculative_config, disable_log_stats=False, max_model_len=16384, + compilation_config=( + json.loads(args.compilation_config) if args.compilation_config else None + ), ) sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len) diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 2423f966acfa..53af0c0aab73 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -114,11 +114,13 @@ def test_ngram_correctness( marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), ], ids=["llama3_eagle", "llama3_eagle3", "llama4_eagle"]) +@pytest.mark.parametrize("full_cuda_graph", [True, False]) def test_eagle_correctness( monkeypatch: pytest.MonkeyPatch, test_prompts: list[list[dict[str, Any]]], sampling_config: SamplingParams, model_setup: tuple[str, str, str, int], + full_cuda_graph: bool, ): ''' Compare the outputs of a original LLM and a speculative LLM @@ -148,6 +150,9 @@ def test_eagle_correctness( "max_model_len": 2048, }, max_model_len=2048, + compilation_config={ + "full_cuda_graph": full_cuda_graph + }, ) spec_outputs = spec_llm.chat(test_prompts, sampling_config) matches = 0 diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 5e02f7e1a051..9b07dd16be1d 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import numpy as np from typing import Any, Optional +import numpy as np import torch import torch.nn as nn @@ -59,7 +59,6 @@ def __init__( self.cudagraph_batch_sizes = list( reversed( self.vllm_config.compilation_config.cudagraph_capture_sizes)) - # persistent buffers for cuda graph self.input_ids = torch.zeros(self.max_num_tokens, dtype=torch.int32, @@ -71,13 +70,14 @@ def __init__( (self.max_num_tokens, self.hidden_size), dtype=self.dtype, device=device) + # attention metadata captured in full cudagraph mode + self.attn_metadata_cudagraph = None # We need +1 here because the arange is used to set query_start_loc, # which has one more element than batch_size. 
self.arange = torch.arange(vllm_config.scheduler_config.max_num_seqs + 1, device=device, dtype=torch.int32) - self.draft_attn_metadata = None def propose( self, @@ -111,11 +111,14 @@ def propose( assert self.runner is not None + use_attn_cudagraph = self.vllm_config.compilation_config.full_cuda_graph + # FIXME: need to consider multiple kv_cache_groups attn_metadata = self.runner.attn_metadata_builders[0].build( common_prefix_len=0, common_attn_metadata=common_attn_metadata, - fast_build=True, + fast_build= + not use_attn_cudagraph, # use fast build with eager mode attention ) # At this moment, we assume all eagle layers belong to the same KV @@ -131,14 +134,16 @@ def propose( # copy inputs to buffer for cudagraph self.positions[:num_tokens] = target_positions self.hidden_states[:num_tokens] = target_hidden_states - - # copy attention metadata for full cudagraph mode - if self.draft_attn_metadata is not None and num_tokens <= self.cudagraph_batch_sizes[-1]: - self.draft_attn_metadata.seq_lens[:attn_metadata.seq_lens.shape[0]].copy_(attn_metadata.seq_lens.clone()) - self.draft_attn_metadata.slot_mapping[:attn_metadata.slot_mapping.shape[0]].copy_(attn_metadata.slot_mapping.clone()) - self.draft_attn_metadata.query_start_loc[:attn_metadata.query_start_loc.shape[0]].copy_(attn_metadata.query_start_loc.clone()) - self.draft_attn_metadata.block_table[:attn_metadata.block_table.shape[0]].copy_(attn_metadata.block_table.clone()) - + if use_attn_cudagraph and num_tokens <= self.cudagraph_batch_sizes[-1]: + assert self.attn_metadata_cudagraph + self.attn_metadata_cudagraph.seq_lens[:batch_size] = ( + attn_metadata.seq_lens) + self.attn_metadata_cudagraph.slot_mapping[:num_tokens] = ( + attn_metadata.slot_mapping) + self.attn_metadata_cudagraph.query_start_loc[:batch_size + 1] = ( + attn_metadata.query_start_loc) + self.attn_metadata_cudagraph.block_table[:batch_size] = ( + attn_metadata.block_table) with set_forward_context(per_layer_attn_metadata, self.vllm_config, num_tokens=num_input_tokens): @@ -228,13 +233,20 @@ def propose( self.input_ids[:batch_size] = input_ids self.positions[:batch_size] = clamped_positions self.hidden_states[:batch_size] = hidden_states - - # copy attention metadata for full cudagraph mode - if self.draft_attn_metadata is not None: - self.draft_attn_metadata.seq_lens[:attn_metadata.seq_lens.shape[0]].copy_(attn_metadata.seq_lens.clone()) - self.draft_attn_metadata.slot_mapping[:attn_metadata.slot_mapping.shape[0]].copy_(attn_metadata.slot_mapping.clone()) - self.draft_attn_metadata.query_start_loc[:attn_metadata.query_start_loc.shape[0]].copy_(attn_metadata.query_start_loc.clone()) - self.draft_attn_metadata.block_table[:attn_metadata.block_table.shape[0]].copy_(attn_metadata.block_table.clone()) + if use_attn_cudagraph and batch_size <= self.cudagraph_batch_sizes[ + -1]: + assert self.attn_metadata_cudagraph + self.attn_metadata_cudagraph.seq_lens[:batch_size] = ( + attn_metadata.seq_lens) + self.attn_metadata_cudagraph.slot_mapping[:batch_size] = ( + attn_metadata.slot_mapping) + self.attn_metadata_cudagraph.query_start_loc[:batch_size + + 1] = ( + attn_metadata. + query_start_loc + ) + self.attn_metadata_cudagraph.block_table[:batch_size] = ( + attn_metadata.block_table) # Run the model. 
with set_forward_context(per_layer_attn_metadata, @@ -404,11 +416,11 @@ def load_model(self, target_model: nn.Module) -> None: def dummy_run( self, num_tokens: int, - attn_metadata: Optional[dict[str, Any]], + attn_metadata: Optional[dict[str, Any]] = None, ) -> None: - if attn_metadata is not None and self.draft_attn_metadata is None: - attn_metadata[self.attn_layer_names[0]].scheduler_metadata = None - self.draft_attn_metadata = attn_metadata[self.attn_layer_names[0]] # assume only one draft layer + if attn_metadata is not None and self.attn_metadata_cudagraph is None: + self.attn_metadata_cudagraph = attn_metadata[ + self.attn_layer_names[0]] with set_forward_context(attn_metadata, self.vllm_config, num_tokens=num_tokens): From f36f8c1f467e41d0c5b2b11c9fe83c0619adcce6 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 21 Jul 2025 13:47:51 -0400 Subject: [PATCH 04/63] Fix bad lm-eval fork (#21318) Signed-off-by: qizixi --- .buildkite/test-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 114c48dba531..c476f71c6637 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -273,7 +273,7 @@ steps: # VLLM_USE_FLASHINFER_SAMPLER or not on H100. - pytest -v -s v1/e2e # Integration test for streaming correctness (requires special branch). - - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api + - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - label: Examples Test # 25min From 41d76db01c6c460829a77336ce2538ef920b5705 Mon Sep 17 00:00:00 2001 From: Himanshu Jaju Date: Mon, 21 Jul 2025 19:19:23 +0100 Subject: [PATCH 05/63] [perf] Speed up align sum kernels (#21079) Signed-off-by: Himanshu Jaju Signed-off-by: qizixi --- .../kernels/benchmark_moe_align_block_size.py | 7 +- csrc/moe/moe_align_sum_kernels.cu | 71 ++++++++++++++----- .../layers/fused_moe/moe_align_block_size.py | 7 +- 3 files changed, 60 insertions(+), 25 deletions(-) diff --git a/benchmarks/kernels/benchmark_moe_align_block_size.py b/benchmarks/kernels/benchmark_moe_align_block_size.py index 5170ac09dc42..1af5a21caf46 100644 --- a/benchmarks/kernels/benchmark_moe_align_block_size.py +++ b/benchmarks/kernels/benchmark_moe_align_block_size.py @@ -33,15 +33,13 @@ def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8): sorted_ids_triton = torch.empty( (max_num_tokens_padded,), dtype=torch.int32, device="cuda" ) - sorted_ids_triton.fill_(topk_ids.numel()) # fill with sentinel value - expert_ids_triton = torch.zeros( + expert_ids_triton = torch.empty( (max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda" ) num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda") sorted_ids_vllm = torch.empty_like(sorted_ids_triton) - sorted_ids_vllm.fill_(topk_ids.numel()) - expert_ids_vllm = torch.zeros_like(expert_ids_triton) + expert_ids_vllm = torch.empty_like(expert_ids_triton) num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton) # 2. 
run implementations @@ -102,7 +100,6 @@ def benchmark(num_tokens, num_experts, topk, provider): max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda") - sorted_ids.fill_(topk_ids.numel()) max_num_m_blocks = max_num_tokens_padded // block_size expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda") num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda") diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu index 462dbd1f8b38..8bbcf5a673fd 100644 --- a/csrc/moe/moe_align_sum_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -19,9 +20,14 @@ __global__ void moe_align_block_size_kernel( int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids, int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts, int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size, - size_t numel, int32_t* __restrict__ cumsum) { + size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded) { extern __shared__ int32_t shared_counts[]; + // Initialize sorted_token_ids with numel + for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) { + sorted_token_ids[it] = numel; + } + const int warp_id = threadIdx.x / WARP_SIZE; const int my_expert_start = warp_id * experts_per_warp; @@ -45,18 +51,27 @@ __global__ void moe_align_block_size_kernel( __syncthreads(); - if (threadIdx.x == 0) { - cumsum[0] = 0; - for (int i = 1; i <= num_experts; ++i) { - int expert_count = 0; - int warp_idx = (i - 1) / experts_per_warp; - int expert_offset = (i - 1) % experts_per_warp; - expert_count = shared_counts[warp_idx * experts_per_warp + expert_offset]; + // Compute prefix sum over token counts per expert + using BlockScan = cub::BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; - cumsum[i] = - cumsum[i - 1] + CEILDIV(expert_count, block_size) * block_size; - } - *total_tokens_post_pad = cumsum[num_experts]; + int expert_count = 0; + int expert_id = threadIdx.x; + if (expert_id < num_experts) { + int warp_idx = expert_id / experts_per_warp; + int expert_offset = expert_id % experts_per_warp; + expert_count = shared_counts[warp_idx * experts_per_warp + expert_offset]; + expert_count = CEILDIV(expert_count, block_size) * block_size; + } + + int cumsum_val; + BlockScan(temp_storage).ExclusiveSum(expert_count, cumsum_val); + if (expert_id <= num_experts) { + cumsum[expert_id] = cumsum_val; + } + + if (expert_id == num_experts) { + *total_tokens_post_pad = cumsum_val; } __syncthreads(); @@ -67,6 +82,13 @@ __global__ void moe_align_block_size_kernel( expert_ids[i / block_size] = threadIdx.x; } } + + // Fill remaining expert_ids with 0 + const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x; + const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size); + for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) { + expert_ids[i] = 0; + } } template @@ -105,7 +127,12 @@ __global__ void moe_align_block_size_small_batch_expert_kernel( const scalar_t* __restrict__ topk_ids, int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids, int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts, - int32_t block_size, size_t numel) { + int32_t block_size, size_t numel, int32_t max_num_tokens_padded) { + // Initialize sorted_token_ids with numel + for 
(size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) { + sorted_token_ids[it] = numel; + } + const size_t tid = threadIdx.x; const size_t stride = blockDim.x; @@ -153,6 +180,13 @@ __global__ void moe_align_block_size_small_batch_expert_kernel( } } + // Fill remaining expert_ids with 0 + const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x; + const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size); + for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) { + expert_ids[i] = 0; + } + for (size_t i = tid; i < numel; i += stride) { int32_t expert_id = topk_ids[i]; int32_t rank_post_pad = @@ -179,13 +213,17 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, int threads = 1024; threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE; + // BlockScan uses 1024 threads and assigns one thread per expert. + TORCH_CHECK(padded_num_experts < 1024, + "padded_num_experts must be less than 1024"); + VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES( topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { // calc needed amount of shared mem for `cumsum` tensors auto options_int = torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device()); torch::Tensor cumsum_buffer = - torch::zeros({num_experts + 1}, options_int); + torch::empty({num_experts + 1}, options_int); bool small_batch_expert_mode = (topk_ids.numel() < 1024) && (num_experts <= 64); @@ -203,7 +241,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, sorted_token_ids.data_ptr(), experts_ids.data_ptr(), num_tokens_post_pad.data_ptr(), num_experts, block_size, - topk_ids.numel()); + topk_ids.numel(), sorted_token_ids.size(0)); } else { auto align_kernel = vllm::moe::moe_align_block_size_kernel; @@ -217,7 +255,8 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, experts_ids.data_ptr(), num_tokens_post_pad.data_ptr(), num_experts, padded_num_experts, experts_per_warp, block_size, - topk_ids.numel(), cumsum_buffer.data_ptr()); + topk_ids.numel(), cumsum_buffer.data_ptr(), + sorted_token_ids.size(0)); const int block_threads = std::min(256, (int)threads); const int num_blocks = diff --git a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py index 3aae183dfa20..2c9ad509fa98 100644 --- a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py +++ b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py @@ -111,6 +111,8 @@ def moe_align_block_size_triton( dtype=torch.int32, device=topk_ids.device) tokens_per_thread = cdiv(numel, num_experts) + sorted_token_ids.fill_(numel) + expert_ids.zero_() moe_align_block_size_stage1[grid]( topk_ids, @@ -205,11 +207,8 @@ def moe_align_block_size( sorted_ids = torch.empty((max_num_tokens_padded, ), dtype=torch.int32, device=topk_ids.device) - sorted_ids.fill_(topk_ids.numel()) max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size) - # Expert ids must be zeroed out to prevent index out of bounds error while - # mapping global expert ids to local expert ids in expert parallelism. 
- expert_ids = torch.zeros((max_num_m_blocks, ), + expert_ids = torch.empty((max_num_m_blocks, ), dtype=torch.int32, device=topk_ids.device) num_tokens_post_pad = torch.empty((1), From 302677b663d63fbc023025f9ac081fbf793a549d Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Mon, 21 Jul 2025 13:47:47 -0700 Subject: [PATCH 06/63] [v1][sampler] Inplace logprobs comparison to get the token rank (#21283) Signed-off-by: Lu Fang Signed-off-by: qizixi --- vllm/v1/sample/ops/logprobs.py | 24 ++++++++++++++++++++++++ vllm/v1/sample/sampler.py | 3 ++- 2 files changed, 26 insertions(+), 1 deletion(-) create mode 100644 vllm/v1/sample/ops/logprobs.py diff --git a/vllm/v1/sample/ops/logprobs.py b/vllm/v1/sample/ops/logprobs.py new file mode 100644 index 000000000000..a4d65485140e --- /dev/null +++ b/vllm/v1/sample/ops/logprobs.py @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Some utilities for logprobs, including logits.""" + +import torch + + +@torch.compile(dynamic=True) +def batched_count_greater_than(x: torch.Tensor, + values: torch.Tensor) -> torch.Tensor: + """ + Counts elements in each row of x that are greater than the corresponding + value in values. Use torch.compile to generate an optimized kernel for + this function. otherwise, it will create additional copies of the input + tensors and cause memory issues. + + Args: + x (torch.Tensor): A 2D tensor of shape (batch_size, n_elements). + values (torch.Tensor): A 2D tensor of shape (batch_size, 1). + + Returns: + torch.Tensor: A 1D tensor of shape (batch_size,) with the counts. + """ + return (x >= values).sum(-1) diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index e79e4451a3a3..fa078e628768 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -9,6 +9,7 @@ from vllm.v1.outputs import LogprobsTensors, SamplerOutput from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.ops.bad_words import apply_bad_words +from vllm.v1.sample.ops.logprobs import batched_count_greater_than from vllm.v1.sample.ops.penalties import apply_all_penalties from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler @@ -174,7 +175,7 @@ def gather_logprobs( token_logprobs = logprobs.gather(-1, token_ids) # Compute the ranks of the actual token. - token_ranks = (logprobs >= token_logprobs).sum(-1) + token_ranks = batched_count_greater_than(logprobs, token_logprobs) # Concatenate together with the topk. 
indices = torch.cat((token_ids, topk_indices), dim=1) From b60e53cb161a15be0c91b6ee726af2f52343989b Mon Sep 17 00:00:00 2001 From: Chaojun Zhang Date: Tue, 22 Jul 2025 12:47:35 +0800 Subject: [PATCH 07/63] [XPU] Enable external_launcher to serve as an executor via torchrun (#21021) Signed-off-by: chzhang Signed-off-by: qizixi --- vllm/v1/worker/xpu_worker.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py index da271b2159af..c7885694f7a3 100644 --- a/vllm/v1/worker/xpu_worker.py +++ b/vllm/v1/worker/xpu_worker.py @@ -7,6 +7,7 @@ import vllm.envs as envs from vllm.config import VllmConfig +from vllm.distributed import get_world_group from vllm.logger import init_logger from vllm.model_executor import set_random_seed from vllm.platforms import current_platform @@ -155,7 +156,8 @@ def init_device(self): current_platform.dist_backend) # global all_reduce needed for overall oneccl warm up - torch.distributed.all_reduce(torch.zeros(1).xpu()) + torch.distributed.all_reduce(torch.zeros(1).xpu(), + group=get_world_group().device_group) # Set random seed. set_random_seed(self.model_config.seed) From 657be6103db641cb1cccbdce81e1c68a97e31989 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Tue, 22 Jul 2025 12:47:49 +0800 Subject: [PATCH 08/63] [Doc] Fix CPU doc format (#21316) Signed-off-by: jiang1.li Signed-off-by: qizixi --- docs/getting_started/installation/cpu.md | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index 5721195172dc..2d2598da943c 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -168,17 +168,18 @@ Note, it is recommended to manually reserve 1 CPU for vLLM front-end process whe ### How to do performance tuning for vLLM CPU? - - First of all, please make sure the thread-binding and KV cache space are properly set and take effect. You can check the thread-binding by running a vLLM benchmark and observing CPU cores usage via `htop`. +First of all, please make sure the thread-binding and KV cache space are properly set and take effect. You can check the thread-binding by running a vLLM benchmark and observing CPU cores usage via `htop`. - - Inference batch size is a important parameter for the performance. Larger batch usually provides higher throughput, smaller batch provides lower latency. Tuning max batch size starts from default value to balance throughput and latency is an effective way to improve vLLM CPU performance on specific platforms. There are two important related parameters in vLLM: - - `--max-num-batched-tokens`, defines the limit of token numbers in a single batch, has more impacts on the first token performance. The default value is set as: - - Offline Inference: `4096 * world_size` - - Online Serving: `2048 * world_size` - - `--max-num-seqs`, defines the limit of sequence numbers in a single batch, has more impacts on the output token performance. - - Offline Inference: `256 * world_size` - - Online Serving: `128 * world_size` +Inference batch size is a important parameter for the performance. Larger batch usually provides higher throughput, smaller batch provides lower latency. Tuning max batch size starts from default value to balance throughput and latency is an effective way to improve vLLM CPU performance on specific platforms. 
There are two important related parameters in vLLM: - - vLLM CPU supports tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more detials of tuning TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommend to use TP and PP togther if there are enough CPU sockets and memory nodes. +- `--max-num-batched-tokens`, defines the limit of token numbers in a single batch, has more impacts on the first token performance. The default value is set as: + - Offline Inference: `4096 * world_size` + - Online Serving: `2048 * world_size` +- `--max-num-seqs`, defines the limit of sequence numbers in a single batch, has more impacts on the output token performance. + - Offline Inference: `256 * world_size` + - Online Serving: `128 * world_size` + +vLLM CPU supports tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more detials of tuning TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommend to use TP and PP togther if there are enough CPU sockets and memory nodes. ### Which quantization configs does vLLM CPU support? From f226a8b01235a00ab55182ee4b17e1eba01ca1a3 Mon Sep 17 00:00:00 2001 From: Ratnam Parikh <114774508+ratnampa@users.noreply.github.com> Date: Mon, 21 Jul 2025 21:48:27 -0700 Subject: [PATCH 09/63] [Intel GPU] Ray Compiled Graph avoid NCCL for Intel GPU (#21338) Signed-off-by: ratnampa Signed-off-by: qizixi --- vllm/executor/ray_distributed_executor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index dec32f8e50fa..417750a08c69 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -67,8 +67,8 @@ def _init_executor(self) -> None: os.environ["VLLM_USE_RAY_SPMD_WORKER"] = "1" os.environ["VLLM_USE_RAY_COMPILED_DAG"] = "1" - # For TPU, avoid compiling NVIDIA's NCCL - if current_platform.is_tpu(): + # For TPU or XPU, avoid compiling NVIDIA's NCCL + if current_platform.is_tpu() or current_platform.is_xpu(): os.environ["VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE"] = "shm" # If the env var is set, it uses the Ray's compiled DAG API From e780c7d50bc31d0034a20bc3d22e2ca3e07641ea Mon Sep 17 00:00:00 2001 From: Ming Yang Date: Mon, 21 Jul 2025 21:49:01 -0700 Subject: [PATCH 10/63] Revert "[Performance] Performance improvements in non-blockwise fp8 CUTLASS MoE (#20762) (#21334) Signed-off-by: Ming Yang Signed-off-by: qizixi --- .../kernels/benchmark_grouped_gemm_cutlass.py | 35 +---------- csrc/moe/moe_permute_unpermute_op.cu | 53 ++++------------ tests/kernels/moe/test_cutlass_moe.py | 14 +---- tests/kernels/moe/test_pplx_cutlass_moe.py | 22 ------- .../layers/fused_moe/cutlass_moe.py | 62 +++++++------------ .../compressed_tensors_moe.py | 26 +------- 6 files changed, 38 insertions(+), 174 deletions(-) diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index a6b42406b5cb..1d4e730f99ae 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -80,11 +80,6 @@ def bench_run( a, score, topk, renormalize=False ) - ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64) - ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64) - c_strides1 = 
torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64) - c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64) - def run_triton_moe( a: torch.Tensor, w1: torch.Tensor, @@ -116,10 +111,6 @@ def run_cutlass_moe( w2: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, - ab_strides1: torch.Tensor, - ab_strides2: torch.Tensor, - c_strides1: torch.Tensor, - c_strides2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, per_act_token: bool, @@ -134,10 +125,6 @@ def run_cutlass_moe( topk_ids, w1_scale, w2_scale, - ab_strides1, - ab_strides2, - c_strides1, - c_strides2, per_act_token, a1_scale=None, ) @@ -149,10 +136,6 @@ def run_cutlass_from_graph( w2_q: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, - ab_strides1: torch.Tensor, - ab_strides2: torch.Tensor, - c_strides1: torch.Tensor, - c_strides2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, ): @@ -167,10 +150,6 @@ def run_cutlass_from_graph( topk_ids, w1_scale, w2_scale, - ab_strides1, - ab_strides2, - c_strides1, - c_strides2, per_act_token, a1_scale=None, ) @@ -215,10 +194,6 @@ def replay_graph(graph, num_repeats): w2_q, w1_scale, w2_scale, - ab_strides1, - ab_strides2, - c_strides1, - c_strides2, topk_weights, topk_ids, ) @@ -256,10 +231,6 @@ def replay_graph(graph, num_repeats): "w1_scale": w1_scale, "w2_scale": w2_scale, "per_act_token": per_act_token, - "ab_strides1": ab_strides1, - "ab_strides2": ab_strides2, - "c_strides1": c_strides1, - "c_strides2": c_strides2, # cuda graph params "cutlass_graph": cutlass_graph, "triton_graph": triton_graph, @@ -318,10 +289,6 @@ def replay_graph(graph, num_repeats): w2_q, w1_scale, w2_scale, - ab_strides1, - ab_strides2, - c_strides1, - c_strides2, topk_weights, topk_ids, per_act_token, @@ -330,7 +297,7 @@ def replay_graph(graph, num_repeats): results.append( benchmark.Timer( - stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501 + stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, diff --git a/csrc/moe/moe_permute_unpermute_op.cu b/csrc/moe/moe_permute_unpermute_op.cu index 13aecd8007a4..a77471a7f207 100644 --- a/csrc/moe/moe_permute_unpermute_op.cu +++ b/csrc/moe/moe_permute_unpermute_op.cu @@ -160,30 +160,6 @@ __global__ void shuffleInputRowsKernel(const T* input, } } -template -__global__ void shuffleInputRowsKernelSlow(const T* input, - const int32_t* dst2src_map, - T* output, int64_t num_src_rows, - int64_t num_dst_rows, - int64_t num_cols) { - int64_t dest_row_idx = blockIdx.x; - int64_t const source_row_idx = dst2src_map[dest_row_idx]; - - if (blockIdx.x < num_dst_rows) { - // Duplicate and permute rows - auto const* source_row_ptr = input + source_row_idx * num_cols; - auto* dest_row_ptr = output + dest_row_idx * num_cols; - - int64_t const start_offset = threadIdx.x; - int64_t const stride = blockDim.x; - - for (int elem_index = start_offset; elem_index < num_cols; - elem_index += stride) { - dest_row_ptr[elem_index] = source_row_ptr[elem_index]; - } - } -} - void shuffle_rows(const torch::Tensor& input_tensor, const torch::Tensor& dst2src_map, torch::Tensor& output_tensor) { @@ -197,24 +173,17 @@ void shuffle_rows(const torch::Tensor& input_tensor, int64_t const num_src_rows = input_tensor.size(0); int64_t const num_cols = input_tensor.size(1); 
- if (num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)) { - // use slow kernel if num_cols can't be aligned to 128 bits - MOE_DISPATCH(input_tensor.scalar_type(), [&] { - shuffleInputRowsKernelSlow<<>>( - reinterpret_cast(input_tensor.data_ptr()), - dst2src_map.data_ptr(), - reinterpret_cast(output_tensor.data_ptr()), num_src_rows, - num_dest_rows, num_cols); - }); - } else { - MOE_DISPATCH(input_tensor.scalar_type(), [&] { - shuffleInputRowsKernel<<>>( - reinterpret_cast(input_tensor.data_ptr()), - dst2src_map.data_ptr(), - reinterpret_cast(output_tensor.data_ptr()), num_src_rows, - num_dest_rows, num_cols); - }); - } + TORCH_CHECK(!(num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)), + "num_cols must be divisible by 128 / " + "sizeof(input_tensor.scalar_type()) / 8"); + + MOE_DISPATCH(input_tensor.scalar_type(), [&] { + shuffleInputRowsKernel<<>>( + reinterpret_cast(input_tensor.data_ptr()), + dst2src_map.data_ptr(), + reinterpret_cast(output_tensor.data_ptr()), num_src_rows, + num_dest_rows, num_cols); + }); } #else diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index 37727b75b077..81fb3ec1de18 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -207,10 +207,6 @@ def run_8_bit(moe_tensors: MOETensors8Bit, 'topk_ids': topk_ids, 'w1_scale': moe_tensors.w1_scale, 'w2_scale': moe_tensors.w2_scale, - 'ab_strides1': moe_tensors.ab_strides1, - 'ab_strides2': moe_tensors.ab_strides2, - 'c_strides1': moe_tensors.c_strides1, - 'c_strides2': moe_tensors.c_strides2, 'per_act_token': per_act_token, 'a1_scale': None #moe_tensors.a_scale } @@ -444,11 +440,6 @@ def test_run_cutlass_moe_fp8( expert_map[start:end] = list(range(num_local_experts)) expert_map = torch.tensor(expert_map, dtype=torch.int32, device="cuda") - ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64) - ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64) - c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64) - c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64) - activation = lambda o, i: torch.ops._C.silu_and_mul(o, i) a1q, a1q_scale = moe_kernel_quantize_input(mt.a, mt.a_scale, torch.float8_e4m3fn, @@ -457,9 +448,8 @@ def test_run_cutlass_moe_fp8( func = lambda output: run_cutlass_moe_fp8( output, a1q, mt.w1_q, mt.w2_q, topk_ids, activation, global_num_experts, expert_map, mt.w1_scale, mt.w2_scale, - a1q_scale, None, ab_strides1, ab_strides2, c_strides1, c_strides2, - workspace13, workspace2, None, mt.a.dtype, per_act_token, - per_out_channel, False) + a1q_scale, None, workspace13, workspace2, None, mt.a.dtype, + per_act_token, per_out_channel, False) workspace13.random_() output_random_workspace = torch.empty(output_shape, diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py index 77adc89ea9da..e4f4a393dfd5 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -75,7 +75,6 @@ def pplx_cutlass_moe( assert torch.cuda.current_device() == pgi.local_rank num_tokens, hidden_dim = a.shape - intermediate_dim = w2.shape[2] num_experts = w1.shape[0] block_size = hidden_dim # TODO support more cases device = pgi.device @@ -124,31 +123,10 @@ def pplx_cutlass_moe( num_local_experts=num_local_experts, num_dispatchers=num_dispatchers) - ab_strides1 = torch.full((num_local_experts, ), - hidden_dim, - device="cuda", - dtype=torch.int64) - ab_strides2 = 
torch.full((num_local_experts, ), - intermediate_dim, - device="cuda", - dtype=torch.int64) - c_strides1 = torch.full((num_local_experts, ), - 2 * intermediate_dim, - device="cuda", - dtype=torch.int64) - c_strides2 = torch.full((num_local_experts, ), - hidden_dim, - device="cuda", - dtype=torch.int64) - experts = CutlassExpertsFp8(num_local_experts, out_dtype, per_act_token, per_out_ch, - ab_strides1, - ab_strides2, - c_strides1, - c_strides2, num_dispatchers=num_dispatchers, use_batched_format=True) diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index ff49d7bb7801..2585a2953c9d 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -13,7 +13,8 @@ MoEPrepareAndFinalizeNoEP) from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate) -from vllm.model_executor.layers.fused_moe.utils import (_fp8_quantize, +from vllm.model_executor.layers.fused_moe.utils import (_fp8_perm, + _fp8_quantize, _resize_cache, extract_required_args) from vllm.scalar_type import scalar_types @@ -34,10 +35,6 @@ def run_cutlass_moe_fp8( w2_scale: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], a2_scale: Optional[torch.Tensor], - ab_strides1: torch.Tensor, - ab_strides2: torch.Tensor, - c_strides1: torch.Tensor, - c_strides2: torch.Tensor, workspace13: torch.Tensor, workspace2: torch.Tensor, expert_num_tokens: Optional[torch.Tensor], @@ -156,11 +153,27 @@ def run_cutlass_moe_fp8( problem_sizes1, problem_sizes2, a_map, c_map, global_num_experts, N, K) - a1q = ops.shuffle_rows(a1q, a_map) - a1q_scale = (ops.shuffle_rows(a1q_scale, a_map) - if per_act_token else a1q_scale) + a1q = _fp8_perm(a1q, a_map) + a1q_scale = a1q_scale[a_map] if per_act_token else a1q_scale expert_offsets = expert_offsets[:-1] + ab_strides1 = torch.full((w1.size(0), ), + K, + device=device, + dtype=torch.int64) + c_strides1 = torch.full((w1.size(0), ), + 2 * N, + device=device, + dtype=torch.int64) + ab_strides2 = torch.full((w1.size(0), ), + N, + device=device, + dtype=torch.int64) + c_strides2 = torch.full((w1.size(0), ), + K, + device=device, + dtype=torch.int64) + if use_batched_format: c1 = _resize_cache(workspace13, (local_E * padded_M, N * 2)) c2 = _resize_cache(workspace2, (local_E * padded_M, N)) @@ -197,8 +210,7 @@ def run_cutlass_moe_fp8( else: # We can't do this inplace because output may point to the same tensor # as c3. - output.copy_(ops.shuffle_rows(c3, c_map).view(M * topk, K), - non_blocking=True) + output.copy_(c3[c_map].view(M * topk, K), non_blocking=True) # TODO (bnell): split class batched vs. non-batched? 
@@ -211,10 +223,6 @@ def __init__( out_dtype: Optional[torch.dtype], per_act_token_quant: bool, per_out_ch_quant: bool, - ab_strides1: torch.Tensor, - ab_strides2: torch.Tensor, - c_strides1: torch.Tensor, - c_strides2: torch.Tensor, block_shape: Optional[list[int]] = None, num_dispatchers: Optional[int] = None, use_batched_format: bool = False, @@ -231,10 +239,6 @@ def __init__( self.max_experts_per_worker = max_experts_per_worker self.num_dispatchers = num_dispatchers self.out_dtype = out_dtype - self.ab_strides1 = ab_strides1 - self.ab_strides2 = ab_strides2 - self.c_strides1 = c_strides1 - self.c_strides2 = c_strides2 self.use_batched_format = use_batched_format @property @@ -314,8 +318,7 @@ def apply(self, output: torch.Tensor, hidden_states: torch.Tensor, run_cutlass_moe_fp8( output, hidden_states, w1, w2, topk_ids, activation_callable, global_num_experts, expert_map, w1_scale, w2_scale, a1q_scale, - a2_scale, self.ab_strides1, self.ab_strides2, self.c_strides1, - self.c_strides2, workspace13, workspace2, expert_num_tokens, + a2_scale, workspace13, workspace2, expert_num_tokens, self.out_dtype if self.out_dtype is not None else in_dtype, self.per_act_token_quant, self.per_out_ch_quant, self.use_batched_format) @@ -329,10 +332,6 @@ def cutlass_moe_fp8( topk_ids: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, - ab_strides1: torch.Tensor, - ab_strides2: torch.Tensor, - c_strides1: torch.Tensor, - c_strides2: torch.Tensor, per_act_token: Optional[bool] = None, activation: str = "silu", a1_scale: Optional[torch.Tensor] = None, @@ -360,17 +359,6 @@ def cutlass_moe_fp8( Shape: [num_experts] or [num_experts, 2N] - w2_scale (torch.Tensor): The fp32 scale to dequantize w2_q. Shape: [num_experts] or [num_experts, K] - - ab_strides1 (torch.Tensor): The input/weight strides for the first gemm. - Shape: [num_experts] - - ab_strides2 (torch.Tensor): The input/weight strides for the second gemm. - Shape: [num_experts] - - c_strides1 (torch.Tensor): The output strides for the first gemm. - Shape: [num_experts] - - c_strides2 (torch.Tensor): The output strides for the second gemm. - Shape: [num_experts] - - per_act_token (Optional[bool]): Whether the scale is per-token or - per-tensor. - - activation (str): The activation function to use. - a1_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize a. 
Shape: scalar or [M] - a2_scale (Optional[torch.Tensor]): The optional fp32 scale to @@ -403,10 +391,6 @@ def cutlass_moe_fp8( out_dtype=a.dtype, per_act_token_quant=per_act_token, per_out_ch_quant=per_out_ch, - ab_strides1=ab_strides1, - ab_strides2=ab_strides2, - c_strides1=c_strides1, - c_strides2=c_strides2, use_batched_format=False, ), ) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 1a31410c3385..2c93977beede 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -859,21 +859,6 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales, requires_grad=False) - device = layer.w13_weight.device - # ab_strides1 and c_strides2 are the same - self.ab_strides1_c_strides2 = torch.full((layer.local_num_experts, ), - layer.hidden_size, - device=device, - dtype=torch.int64) - self.ab_strides2 = torch.full((layer.local_num_experts, ), - layer.intermediate_size_per_partition, - device=device, - dtype=torch.int64) - self.c_strides1 = torch.full((layer.local_num_experts, ), - 2 * layer.intermediate_size_per_partition, - device=device, - dtype=torch.int64) - def select_gemm_impl( self, prepare_finalize: FusedMoEPrepareAndFinalize, @@ -896,10 +881,6 @@ def select_gemm_impl( moe.in_dtype, self.input_quant.strategy == QuantizationStrategy.TOKEN, self.weight_quant.strategy == QuantizationStrategy.CHANNEL, - ab_strides1=self.ab_strides1_c_strides2, - ab_strides2=self.ab_strides2, - c_strides1=self.c_strides1, - c_strides2=self.ab_strides1_c_strides2, num_dispatchers=num_dispatchers, use_batched_format=use_batched_format, ) @@ -946,8 +927,7 @@ def apply( num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias, - indices_type=self.topk_indices_dtype) + e_score_correction_bias=e_score_correction_bias) per_act_token = ( self.input_quant.strategy == QuantizationStrategy.TOKEN) @@ -968,10 +948,6 @@ def apply( expert_map=None if self.disable_expert_map else expert_map, w1_scale=layer.w13_weight_scale, w2_scale=layer.w2_weight_scale, - ab_strides1=self.ab_strides1_c_strides2, - ab_strides2=self.ab_strides2, - c_strides1=self.c_strides1, - c_strides2=self.ab_strides1_c_strides2, a1_scale=layer.w13_input_scale, a2_scale=layer.w2_input_scale, ) From 49ad48502c63dda8e5fc373ffaf4d832708a7ebc Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Mon, 21 Jul 2025 22:37:34 -0700 Subject: [PATCH 11/63] [Core] Minimize number of dict lookup in _maybe_evict_cached_block (#21281) Signed-off-by: Jialin Ouyang Signed-off-by: qizixi --- vllm/v1/core/block_pool.py | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index d21f94727cf6..0fd6947ae0bd 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -243,22 +243,27 @@ def _maybe_evict_cached_block(self, block: KVCacheBlock) -> bool: True if the block is evicted, False otherwise. 
""" block_hash = block.block_hash - if block_hash and block_hash in self.cached_block_hash_to_block: - block.reset_hash() - del self.cached_block_hash_to_block[block_hash][block.block_id] - - if len(self.cached_block_hash_to_block[block_hash]) == 0: - del self.cached_block_hash_to_block[block_hash] - - if self.enable_kv_cache_events: - # FIXME (Chen): Not sure whether we should return `hash_value` - # or `(hash_value, group_id)` here. But it's fine now because - # we disable hybrid kv cache manager when kv cache event is - # enabled, so there is only one group. - self.kv_event_queue.append( - BlockRemoved(block_hashes=[block_hash.get_hash_value()])) - return True - return False + if block_hash is None: + # The block doesn't have hash, eviction is not needed + return False + blocks_by_id = self.cached_block_hash_to_block.get(block_hash) + if blocks_by_id is None: + # block_hash not found in cached_block_hash_to_block, + # eviction is not needed + return False + block.reset_hash() + blocks_by_id.pop(block.block_id, None) + if blocks_by_id: + del self.cached_block_hash_to_block[block_hash] + + if self.enable_kv_cache_events: + # FIXME (Chen): Not sure whether we should return `hash_value` + # or `(hash_value, group_id)` here. But it's fine now because + # we disable hybrid kv cache manager when kv cache event is + # enabled, so there is only one group. + self.kv_event_queue.append( + BlockRemoved(block_hashes=[block_hash.get_hash_value()])) + return True def touch(self, blocks: tuple[list[KVCacheBlock], ...]) -> None: """Touch a block increases its reference count by 1, and may remove From 0e5124d2426445673df2254ed50f6c1776494846 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Tue, 22 Jul 2025 08:31:18 +0200 Subject: [PATCH 12/63] [V1] [Hybrid] Add new test to verify that hybrid views into KVCacheTensor are compatible (#21300) Signed-off-by: Thomas Parnell Signed-off-by: qizixi --- tests/v1/worker/test_gpu_model_runner.py | 150 ++++++++++++++++++++++- 1 file changed, 149 insertions(+), 1 deletion(-) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 0bdf1f9820d3..6ddcbfea24ad 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -3,15 +3,19 @@ import random +import numpy as np import pytest import torch from vllm.attention import Attention from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, SchedulerConfig, VllmConfig, set_current_vllm_config) +from vllm.distributed.parallel_state import (init_distributed_environment, + initialize_model_parallel) +from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams -from vllm.utils import GiB_bytes +from vllm.utils import GiB_bytes, update_environment_variables from vllm.v1.core.kv_cache_utils import (estimate_max_model_len, get_kv_cache_config) from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, @@ -686,3 +690,147 @@ def test_init_kv_cache_with_kv_sharing_valid(): assert len(kv_cache_config.kv_cache_groups[0].layer_names) == 2 assert kv_cache_config.kv_cache_groups[0].layer_names[0] == layer_0 assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1 + + +def test_hybrid_attention_mamba_tensor_shapes(monkeypatch): + ''' + The GPU model runner creates different views into the + KVCacheTensors for the attention and mamba layers + (via _reshape_kv_cache_tensors function). 
This test verifies + that the views are compatible: writing a mamba block + will not corrupt an attention block and vice-versa + ''' + + current_platform.seed_everything(42) + + update_environment_variables({ + 'RANK': "0", + 'LOCAL_RANK': "0", + 'WORLD_SIZE': "1", + 'MASTER_ADDR': 'localhost', + 'MASTER_PORT': '12345', + }) + init_distributed_environment() + initialize_model_parallel(tensor_model_parallel_size=1) + torch.set_default_dtype(torch.float16) + + scheduler_config = SchedulerConfig( + max_num_seqs=10, + max_num_batched_tokens=512, + max_model_len=512, + ) + model_config = ModelConfig( + model="ibm-granite/granite-4.0-tiny-preview", + dtype="float16", + ) + cache_config = CacheConfig( + block_size=BLOCK_SIZE, + gpu_memory_utilization=0.9, + swap_space=0, + cache_dtype="auto", + ) + parallel_config = ParallelConfig() + vllm_config = VllmConfig( + model_config=model_config, + cache_config=cache_config, + scheduler_config=scheduler_config, + parallel_config=parallel_config, + ) + + layer_0 = "model.layers.0.self_attn.attn" + layer_1 = "model.layers.1.self_attn.attn" + layer_2 = "model.layers.2.mixer" + layer_3 = "model.layers.3.mixer" + layer_4 = "model.layers.4.mixer" + layer_5 = "model.layers.5.mixer" + + with set_current_vllm_config(vllm_config): + hf_config = vllm_config.model_config.hf_config + fwd_context = {} + for key in [layer_0, layer_1]: + fwd_context[key] = Attention( + num_heads=model_config.get_num_attention_heads( + parallel_config), + num_kv_heads=model_config.get_num_kv_heads(parallel_config), + head_size=model_config.get_head_size(), + scale=1.0, + prefix=key, + ) + for key in [layer_2, layer_3, layer_4, layer_5]: + fwd_context[key] = MambaMixer2( + hidden_size = hf_config.hidden_size, + ssm_state_size = hf_config.mamba_d_state, + conv_kernel_size = hf_config.mamba_d_conv, + intermediate_size = hf_config.mamba_expand *\ + hf_config.hidden_size, + use_conv_bias = hf_config.mamba_conv_bias, + use_bias = hf_config.mamba_proj_bias, + n_groups=hf_config.mamba_n_groups, + num_heads=hf_config.mamba_n_heads, + head_dim=hf_config.mamba_d_head, + rms_norm_eps=hf_config.rms_norm_eps, + activation=hf_config.hidden_act, + prefix=key, + ) + # suppress var not used error + assert fwd_context is not None + vllm_ctx = vllm_config.compilation_config.static_forward_context + + with monkeypatch.context() as m: + + m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") + + runner = GPUModelRunner(vllm_config, DEVICE) + kv_cache_spec = runner.get_kv_cache_spec() + + available_memory = 5 * GiB_bytes + kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, + available_memory) + runner.initialize_kv_cache(kv_cache_config) + + # random partition of blocks + # blocks0 will be assigned to attention layers + # blocks1 will be assigned to mamba layers + num_blocks = kv_cache_config.num_blocks + ind = np.arange(num_blocks) + np.random.shuffle(ind) + blocks0, blocks1 = ind[:(num_blocks // 2)], ind[(num_blocks // 2):] + + attn_shape = vllm_ctx[layer_0].kv_cache[0].shape + conv_shape = vllm_ctx[layer_2].kv_cache[0][0].shape + ssm_shape = vllm_ctx[layer_2].kv_cache[0][1].shape + + # assert we are using FlashInfer + assert attn_shape[0] == num_blocks + + attn_blocks_constant = torch.full((len(blocks0), *attn_shape[1:]), + device=DEVICE, + fill_value=3.33) + conv_blocks_constant = torch.full((len(blocks1), *conv_shape[1:]), + device=DEVICE, + fill_value=6.66) + ssm_blocks_constant = torch.full((len(blocks1), *ssm_shape[1:]), + device=DEVICE, + fill_value=9.99) + + # fill all attention blocks with 
constant + for layer in [layer_0, layer_1]: + vllm_ctx[layer].kv_cache[0][ + blocks0, :] = attn_blocks_constant.detach().clone() + + # fill all mamba blocks with constant + for layer in [layer_2, layer_3, layer_4, layer_5]: + vllm_ctx[layer].kv_cache[0][0][ + blocks1, :] = conv_blocks_constant.detach().clone() + vllm_ctx[layer].kv_cache[0][1][ + blocks1, :] = ssm_blocks_constant.detach().clone() + + # verify attention and mamba contents are correct + for layer in [layer_0, layer_1]: + assert torch.equal(vllm_ctx[layer].kv_cache[0][blocks0, :], + attn_blocks_constant) + for layer in [layer_2, layer_3, layer_4, layer_5]: + assert torch.equal(vllm_ctx[layer].kv_cache[0][0][blocks1, :], + conv_blocks_constant) + assert torch.equal(vllm_ctx[layer].kv_cache[0][1][blocks1, :], + ssm_blocks_constant) From d96a375a43323af97d1b4b6dab99ce14fa1b0c6d Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 22 Jul 2025 02:33:51 -0400 Subject: [PATCH 13/63] [Refactor] Fix Compile Warning #1444-D (#21208) Signed-off-by: yewentao256 Signed-off-by: qizixi --- csrc/moe/topk_softmax_kernels.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index 064b76c9cd42..ea4ff67ef3e4 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -20,6 +20,7 @@ #include #include #include "../cuda_compat.h" +#include #ifndef USE_ROCM #include @@ -62,7 +63,7 @@ __launch_bounds__(TPB) __global__ const int thread_row_offset = blockIdx.x * num_cols; - cub::Sum sum; + cuda::std::plus sum; float threadData(-FLT_MAX); // Don't touch finished rows. From 8a8f6bd8fea882279149de239b9c26ea821e5ef3 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 22 Jul 2025 08:35:14 +0200 Subject: [PATCH 14/63] Fix kv_cache_dtype handling for out-of-tree HPU plugin (#21302) Signed-off-by: Konrad Zawora Signed-off-by: Chendi.Xue Co-authored-by: Chendi.Xue Signed-off-by: qizixi --- vllm/engine/arg_utils.py | 18 ++---------------- vllm/platforms/cuda.py | 13 +++++++++++++ vllm/platforms/interface.py | 7 +++++++ vllm/platforms/rocm.py | 4 ++++ vllm/platforms/tpu.py | 4 ++++ 5 files changed, 30 insertions(+), 16 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 28b1c1c363a7..1f74d22d07c1 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1352,22 +1352,8 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: # No Fp8 KV cache so far. 
if self.kv_cache_dtype != "auto": - fp8_attention = self.kv_cache_dtype.startswith("fp8") - will_use_fa = ( - current_platform.is_cuda() - and not envs.is_set("VLLM_ATTENTION_BACKEND") - ) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1" - supported = False - if (current_platform.is_rocm() - or (current_platform.is_cuda() - and current_platform.is_device_capability(100)) - or current_platform.is_tpu()): - supported = True - elif fp8_attention and will_use_fa: - from vllm.attention.utils.fa_utils import ( - flash_attn_supports_fp8) - supported = flash_attn_supports_fp8() - + supported = current_platform.is_kv_cache_dtype_supported( + self.kv_cache_dtype) if not supported: _raise_or_fallback(feature_name="--kv-cache-dtype", recommend_to_remove=False) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 962e2b3aab60..fdf1f46e603b 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -586,6 +586,19 @@ def is_fully_connected(cls, physical_device_ids: list[int]) -> bool: " not found. Assuming no NVLink available.") return False + @classmethod + def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str) -> bool: + fp8_attention = kv_cache_dtype.startswith("fp8") + will_use_fa = (not envs.is_set("VLLM_ATTENTION_BACKEND") + ) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1" + supported = False + if cls.is_device_capability(100): + supported = True + elif fp8_attention and will_use_fa: + from vllm.attention.utils.fa_utils import flash_attn_supports_fp8 + supported = flash_attn_supports_fp8() + return supported + # Autodetect either NVML-enabled or non-NVML platform # based on whether NVML is available. diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 1cd5cb5e83db..02cc392244ba 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -543,6 +543,13 @@ def stateless_init_device_torch_dist_pg( """ raise RuntimeError(f"Unsupported torch distributed backend: {backend}") + @classmethod + def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str) -> bool: + """ + Returns if the kv_cache_dtype is supported by the current platform. 
+ """ + return False + class UnspecifiedPlatform(Platform): _enum = PlatformEnum.UNSPECIFIED diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 0bf9262776b1..b2e69f60343f 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -454,3 +454,7 @@ def stateless_init_device_torch_dist_pg( @classmethod def device_count(cls) -> int: return cuda_device_count_stateless() + + @classmethod + def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str) -> bool: + return True \ No newline at end of file diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index febc6ae4662b..146801c9d773 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -190,6 +190,10 @@ def validate_request( and params.sampling_type == SamplingType.RANDOM_SEED): raise ValueError("Torch XLA does not support per-request seed.") + @classmethod + def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str) -> bool: + return True + try: from tpu_commons.platforms import TpuPlatform as TpuCommonsPlatform From b1373c2fa46af1025ccd7b7506ec6c202a9a5f48 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Tue, 22 Jul 2025 12:05:45 +0530 Subject: [PATCH 15/63] [Misc] DeepEPHighThroughtput - Enable Inductor pass (#21311) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath Signed-off-by: qizixi --- vllm/platforms/cuda.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index fdf1f46e603b..cc2543538d0d 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -182,9 +182,6 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: compilation_config.use_cudagraph = False if model_config is not None: model_config.enforce_eager = True - # TODO (varun): Turning this ON gives incorrect results for the - # Deepseek-V2-lite model. 
- vllm_config.compilation_config.use_inductor = False @classmethod def get_current_memory_usage(cls, From 8b8a283041225c76058dcc684039d320a708e925 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 22 Jul 2025 02:36:18 -0400 Subject: [PATCH 16/63] [Bug] DeepGemm: Fix Cuda Init Error (#21312) Signed-off-by: yewentao256 Signed-off-by: qizixi --- vllm/utils/deep_gemm.py | 54 ++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 8b5713e02c95..09a12a8c11c5 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -45,30 +45,36 @@ def _resolve_symbol(module, new: str, old: str) -> Callable[..., Any] | None: return None -if not has_deep_gemm(): - _fp8_gemm_nt_impl: Callable[..., Any] | None = None - _grouped_impl: Callable[..., Any] | None = None - _grouped_masked_impl: Callable[..., Any] | None = None - _per_block_cast_impl: Callable[..., Any] | None = None -else: - _dg = importlib.import_module("deep_gemm") # type: ignore - - _fp8_gemm_nt_impl = _resolve_symbol( - _dg, - "fp8_gemm_nt", - "gemm_fp8_fp8_bf16_nt", - ) +_fp8_gemm_nt_impl: Callable[..., Any] | None = None +_grouped_impl: Callable[..., Any] | None = None +_grouped_masked_impl: Callable[..., Any] | None = None +_per_block_cast_impl: Callable[..., Any] | None = None + + +def _lazy_init() -> None: + """Import deep_gemm and resolve symbols on first use.""" + global _fp8_gemm_nt_impl, _grouped_impl, _grouped_masked_impl, \ + _per_block_cast_impl + + # fast path + if (_fp8_gemm_nt_impl is not None or _grouped_impl is not None + or _grouped_masked_impl is not None + or _per_block_cast_impl is not None): + return + + if not has_deep_gemm(): + return + + _dg = importlib.import_module("deep_gemm") + + _fp8_gemm_nt_impl = _resolve_symbol(_dg, "fp8_gemm_nt", + "gemm_fp8_fp8_bf16_nt") _grouped_impl = _resolve_symbol( - _dg, - "m_grouped_fp8_gemm_nt_contiguous", - "m_grouped_gemm_fp8_fp8_bf16_nt_contiguous", - ) + _dg, "m_grouped_fp8_gemm_nt_contiguous", + "m_grouped_gemm_fp8_fp8_bf16_nt_contiguous") _grouped_masked_impl = _resolve_symbol( - _dg, - "fp8_m_grouped_gemm_nt_masked", - "m_grouped_gemm_fp8_fp8_bf16_nt_masked", - ) - + _dg, "fp8_m_grouped_gemm_nt_masked", + "m_grouped_gemm_fp8_fp8_bf16_nt_masked") # Try to get per_token_cast_to_fp8 from DeepGEMM math utils. 
try: _math_mod = importlib.import_module( @@ -80,24 +86,28 @@ def _resolve_symbol(module, new: str, old: str) -> Callable[..., Any] | None: def fp8_gemm_nt(*args, **kwargs): + _lazy_init() if _fp8_gemm_nt_impl is None: return _missing(*args, **kwargs) return _fp8_gemm_nt_impl(*args, **kwargs) def m_grouped_fp8_gemm_nt_contiguous(*args, **kwargs): + _lazy_init() if _grouped_impl is None: return _missing(*args, **kwargs) return _grouped_impl(*args, **kwargs) def fp8_m_grouped_gemm_nt_masked(*args, **kwargs): + _lazy_init() if _grouped_masked_impl is None: return _missing(*args, **kwargs) return _grouped_masked_impl(*args, **kwargs) def per_block_cast_to_fp8(x, *args, **kwargs): + _lazy_init() if _per_block_cast_impl is not None and is_blackwell_deep_gemm_used(): return _per_block_cast_impl(x, use_ue8m0=True) # TODO: refactor the `per_block_cast_to_fp8` from tests to vllm utils From f0ea54f21ca9ad1bc1d0941d53724a772f52f9ab Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Tue, 22 Jul 2025 01:40:21 -0500 Subject: [PATCH 17/63] Update fp4 quantize API (#21327) Signed-off-by: Shu Wang Signed-off-by: qizixi --- .../layers/fused_moe/flashinfer_cutlass_moe.py | 10 +++++----- .../fused_moe/flashinfer_cutlass_prepare_finalize.py | 4 ++-- vllm/utils/flashinfer.py | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py index 1753c4f6e238..3e79a1a8c24b 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py @@ -181,12 +181,12 @@ def apply( g2_alphas, ] _ = flashinfer_cutlass_fused_moe( - hidden_states, - topk_ids.to(torch.int), - topk_weights, + input=hidden_states, + token_selected_experts=topk_ids.to(torch.int), + token_final_scales=topk_weights, # FlashInfer API requires weight to be long for nvfp4 - w1.view(torch.long), - w2.view(torch.long), + fc1_expert_weights=w1.view(torch.long), + fc2_expert_weights=w2.view(torch.long), output_dtype=out_dtype, quant_scales=quant_scales, input_sf=a1q_scale, diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py index 49819504c8ec..e658990e95e5 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py @@ -11,7 +11,7 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.utils import ( extract_required_args, moe_kernel_quantize_input) -from vllm.utils.flashinfer import fp4_swizzle_blockscale +from vllm.utils.flashinfer import block_scale_interleave def get_local_sizes(local_tokens): @@ -92,7 +92,7 @@ def prepare( dim=0, sizes=get_local_sizes(local_tokens)) a1_m, a1_n = a1q.shape - a1q_scale = fp4_swizzle_blockscale(a1q_scale, a1_m, a1_n * 2) + a1q_scale = block_scale_interleave(a1q_scale) return a1q, a1q_scale, None, topk_ids, topk_weights diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index fd8b384a616f..1ddafbae7fc0 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -69,8 +69,8 @@ def wrapper(*args, **kwargs): flashinfer_cutlass_fused_moe = _lazy_import_wrapper("flashinfer.fused_moe", "cutlass_fused_moe") fp4_quantize = _lazy_import_wrapper("flashinfer", "fp4_quantize") -fp4_swizzle_blockscale = 
_lazy_import_wrapper("flashinfer", - "fp4_swizzle_blockscale") +block_scale_interleave = _lazy_import_wrapper("flashinfer", + "block_scale_interleave") # Special case for autotune since it returns a context manager autotune = _lazy_import_wrapper( @@ -95,7 +95,7 @@ def has_flashinfer_cutlass_fused_moe() -> bool: required_functions = [ ("flashinfer.fused_moe", "cutlass_fused_moe"), ("flashinfer", "fp4_quantize"), - ("flashinfer", "fp4_swizzle_blockscale"), + ("flashinfer", "block_scale_interleave"), ] for module_name, attr_name in required_functions: @@ -110,7 +110,7 @@ def has_flashinfer_cutlass_fused_moe() -> bool: "flashinfer_trtllm_fp8_block_scale_moe", "flashinfer_cutlass_fused_moe", "fp4_quantize", - "fp4_swizzle_blockscale", + "block_scale_interleave", "autotune", "has_flashinfer_moe", "has_flashinfer_cutlass_fused_moe", From 759f3ba53595a91f8edeffd3f0d39759267281ce Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Tue, 22 Jul 2025 14:41:14 +0800 Subject: [PATCH 18/63] [Feature][eplb] add verify ep or tp or dp (#21102) Signed-off-by: rongfu.leng Signed-off-by: qizixi --- vllm/config.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/config.py b/vllm/config.py index 3e6aa2a93e6a..d649eb75033f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2108,6 +2108,15 @@ def __post_init__(self) -> None: raise ValueError( "num_redundant_experts must be non-negative, but got " f"{self.num_redundant_experts}.") + if not self.enable_expert_parallel: + raise ValueError( + "enable_expert_parallel must be True to use EPLB.") + if self.tensor_parallel_size * self.data_parallel_size <= 1: + raise ValueError( + "EPLB requires tensor_parallel_size or data_parallel_size " + f"to be greater than 1, but got " + f"TP={self.tensor_parallel_size},DP={self.data_parallel_size}." + ) else: if self.num_redundant_experts != 0: raise ValueError( From 7aea17416e6a540179e3e2bf89eb24d73b662f7e Mon Sep 17 00:00:00 2001 From: Raghav Ravishankar <113712354+alyosha-swamy@users.noreply.github.com> Date: Tue, 22 Jul 2025 13:27:43 +0530 Subject: [PATCH 19/63] Add arcee model (#21296) Signed-off-by: alyosha-swamy Signed-off-by: Jee Jee Li Co-authored-by: Jee Jee Li Signed-off-by: qizixi --- docs/models/supported_models.md | 1 + tests/models/registry.py | 2 + vllm/model_executor/models/arcee.py | 347 +++++++++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 4 files changed, 351 insertions(+) create mode 100644 vllm/model_executor/models/arcee.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 33b297ef2d7d..13ebb03e787e 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -324,6 +324,7 @@ th { | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| | `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. | ✅︎ | ✅︎ | ✅︎ | | `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ | ✅︎ | | `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `BailingMoeForCausalLM` | Ling | `inclusionAI/Ling-lite-1.5`, `inclusionAI/Ling-plus`, etc. 
| ✅︎ | ✅︎ | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index 19725acd6c45..8e3285aebbe7 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -135,6 +135,8 @@ def check_available_online( trust_remote_code=True), "AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B", trust_remote_code=True), + "ArceeForCausalLM": _HfExamplesInfo("arcee-ai/AFM-4.5B-Base", + is_available_online=False), "ArcticForCausalLM": _HfExamplesInfo("Snowflake/snowflake-arctic-instruct", trust_remote_code=True), "BaiChuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan-7B", diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py new file mode 100644 index 000000000000..4e3ba107ba7e --- /dev/null +++ b/vllm/model_executor/models/arcee.py @@ -0,0 +1,347 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Copyright 2023-2025 vLLM Team +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 +# +# Inference-only Arcee (AFM) model – adds support for ReLU^2 feed-forward +# activation. + +from collections.abc import Iterable +from typing import Any, Optional, Union + +import torch +from torch import nn +from transformers import LlamaConfig + +from vllm.compilation.decorators import support_torch_compile +from vllm.distributed import get_pp_group +from vllm.model_executor.layers.activation import ReLUSquaredActivation +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) +from vllm.sequence import IntermediateTensors + +from .interfaces import SupportsLoRA, SupportsPP +from .utils import (AutoWeightsLoader, PPMissingLayer, + make_empty_intermediate_tensors_factory, make_layers) + + +class ArceeMLP(nn.Module): + """Feed-forward layer for Arcee using ReLU^2 activation + (no gating as in LLaMA).""" + + def __init__(self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: Optional[Any] = None, + bias: bool = False, + prefix: str = "", + reduce_results: bool = True) -> None: + super().__init__() + # Single linear projection up to intermediate size + # (no separate gate projection) + self.up_proj = ColumnParallelLinear( + input_size=hidden_size, + output_size=intermediate_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.up_proj", + ) + # Down projection back to hidden size + self.down_proj = RowParallelLinear( + input_size=intermediate_size, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + reduce_results=reduce_results, + prefix=f"{prefix}.down_proj", + ) + if hidden_act != "relu2": + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only 'relu2' is supported for AFM.") + # Define ReLU^2 activation: (ReLU(x))^2 elementwise + self.act_fn = ReLUSquaredActivation() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x, _ = self.up_proj(x) # Project to intermediate size + x = self.act_fn(x) # Apply ReLU^2 activation elementwise + x, _ = self.down_proj(x) # Project back down to hidden size + return x + + +class ArceeDecoderLayer(nn.Module): + """Transformer decoder block for Arcee, with self-attention and + ReLU^2 MLP.""" + + def __init__(self, + config: LlamaConfig, + cache_config: Optional[Any] = None, + quant_config: Optional[Any] = None, + prefix: str = "") -> None: + super().__init__() + self.hidden_size = config.hidden_size + # Rotary embedding parameters (reuse LLaMA defaults) + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + if rope_scaling is not None and getattr( + config, "original_max_position_embeddings", None): + rope_scaling["original_max_position_embeddings"] = ( + config.original_max_position_embeddings) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + # Determine if attention bias is needed (some variants use bias terms) + attention_bias = getattr(config, "attention_bias", False) or getattr( + config, "bias", False) + bias_o_proj = attention_bias + if hasattr(config, "qkv_bias"): + attention_bias = config.qkv_bias + + # Self-Attention (using LLaMA's attention structure) + from vllm.model_executor.models.llama import ( + LlamaAttention) # import here to avoid circular import + self.self_attn = LlamaAttention( + config=config, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=getattr(config, "num_key_value_heads", + config.num_attention_heads), + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + quant_config=quant_config, + bias=attention_bias, + bias_o_proj=bias_o_proj, + cache_config=cache_config, + prefix=f"{prefix}.self_attn", + attn_type=getattr( + config, "attn_type", + "decoder"), # assume decoder (causal) unless specified + ) + # MLP with ReLU^2 activation + self.mlp = ArceeMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + bias=getattr(config, "mlp_bias", False), + prefix=f"{prefix}.mlp", + ) + # Layer normalization layers (RMSNorm as in LLaMA) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, positions: torch.Tensor, hidden_states: torch.Tensor, + residual: Optional[torch.Tensor] + ) -> tuple[torch.Tensor, torch.Tensor]: + # Self-Attention block + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + # Fused residual add + layernorm if supported + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn(positions=positions, + hidden_states=hidden_states) + # Feed-forward block + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +@support_torch_compile +class ArceeModel(nn.Module): + """The transformer model backbone for Arcee (embedding layer + stacked + decoder blocks + final norm).""" + + def __init__(self, + *, + vllm_config, + prefix: str = "", + 
layer_type: type[nn.Module] = ArceeDecoderLayer) -> None: + super().__init__() + config: LlamaConfig = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.quant_config = quant_config + self.config = config + self.vocab_size = config.vocab_size + self.org_vocab_size = config.vocab_size + + # Word embeddings (parallelized if using pipeline parallel) + if get_pp_group().is_first_rank or (config.tie_word_embeddings + and get_pp_group().is_last_rank): + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + quant_config=quant_config, + ) + else: + self.embed_tokens = PPMissingLayer( + ) # placeholder on non-embedding ranks + + # Build decoder layers across pipeline ranks + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: layer_type(config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix), + prefix=f"{prefix}.layers", + ) + # Final RMSNorm on the last pipeline stage + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + + # For optional capturing of intermediate hidden states + # (not used by default) + self.aux_hidden_state_layers: tuple[int, ...] = tuple() + + # Prepare factory for empty intermediate tensors + # (for pipeline scheduling) + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size)) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None + ) -> Union[torch.Tensor, IntermediateTensors, tuple[torch.Tensor, + list[torch.Tensor]]]: + # Embedding lookup (on first pipeline rank) + if get_pp_group().is_first_rank: + hidden_states = (inputs_embeds if inputs_embeds is not None else + self.get_input_embeddings(input_ids)) + residual = None + else: + assert intermediate_tensors is not None, ( + "IntermediateTensors must be provided for non-first " + "pipeline ranks") + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + aux_hidden_states: list[torch.Tensor] = [] + for idx, layer in enumerate( + self.layers[self.start_layer:self.end_layer]): + if idx in self.aux_hidden_state_layers: + aux_hidden_states.append( + hidden_states + + residual) # capture pre-layer hidden state if needed + hidden_states, residual = layer(positions, hidden_states, residual) + + if not get_pp_group().is_last_rank: + # Send intermediate results to the next pipeline stage + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + # On last rank: apply final layer norm + hidden_states, _ = self.norm(hidden_states, residual) + if len(aux_hidden_states) > 0: + return hidden_states, aux_hidden_states + return hidden_states + + +class ArceeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): + """Arcee Model for causal language modeling, integrated with vLLM + runtime.""" + # Map fused module names to their sub-module components + # (for quantization and LoRA) + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + } + + def __init__(self, *, vllm_config, prefix: str = "") -> 
None: + super().__init__() + config = vllm_config.model_config.hf_config + self.config = config + + # Initialize the inner Transformer model (ArceeModel) + self.model = ArceeModel(vllm_config=vllm_config, + prefix=f"{prefix}.model") + # On the last pipeline stage, set up the LM head and logits processor + if get_pp_group().is_last_rank: + # Determine vocabulary size (including any LoRA extra tokens + # for padded LM head) + self.unpadded_vocab_size = config.vocab_size + + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE, + quant_config=vllm_config.quant_config, + bias=getattr(config, "lm_head_bias", False), + prefix=f"{prefix}.lm_head", + ) + if config.tie_word_embeddings: + # Tie output weights with input embedding matrix + self.lm_head = self.lm_head.tie_weights( + self.model.embed_tokens) + logit_scale = getattr(config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size, + logit_scale) + else: + # Placeholder for lm_head on non-last ranks + self.lm_head = PPMissingLayer() + # Provide a reference to the model's method for generating empty + # tensors (used in pipeline parallel schedule) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None + ) -> Union[torch.Tensor, IntermediateTensors]: + # Forward pass through the Arcee model backbone + model_output = self.model(input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds) + return model_output + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata) -> Optional[torch.Tensor]: + # Compute final logits from hidden states (last pipeline rank only) + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + """Load weights into the model (delegates to inner model and handles + tied embeddings).""" + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] + if self.config.tie_word_embeddings else None), + skip_substrs=["gate_proj"]) + # AutoWeightLoader handles weight name remapping, including fusing + # separate q_proj, k_proj, v_proj into qkv_proj + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index a85e8b0e7b1b..9d88b5fe82cf 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -33,6 +33,7 @@ # [Decoder-only] "AquilaModel": ("llama", "LlamaForCausalLM"), "AquilaForCausalLM": ("llama", "LlamaForCausalLM"), # AquilaChat2 + "ArceeForCausalLM": ("arcee", "ArceeForCausalLM"), "ArcticForCausalLM": ("arctic", "ArcticForCausalLM"), "MiniMaxForCausalLM": ("minimax_text_01", "MiniMaxText01ForCausalLM"), "MiniMaxText01ForCausalLM": ("minimax_text_01", "MiniMaxText01ForCausalLM"), From 7aa2bac13afbe1da7a73e35d790dbf605b957b57 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Tue, 22 Jul 2025 01:18:40 -0700 Subject: [PATCH 20/63] [Bugfix] Fix eviction cached blocked logic (#21357) Signed-off-by: 
simon-mo Signed-off-by: qizixi --- vllm/v1/core/block_pool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index 0fd6947ae0bd..cbb6bb26822c 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -253,7 +253,7 @@ def _maybe_evict_cached_block(self, block: KVCacheBlock) -> bool: return False block.reset_hash() blocks_by_id.pop(block.block_id, None) - if blocks_by_id: + if len(blocks_by_id) == 0: del self.cached_block_hash_to_block[block_hash] if self.enable_kv_cache_events: From fae52353cfcc0f9e76c662aebfe8511487527231 Mon Sep 17 00:00:00 2001 From: Kebe Date: Tue, 22 Jul 2025 20:26:39 +0800 Subject: [PATCH 21/63] [Misc] Remove deprecated args in v0.10 (#21349) Signed-off-by: Kebe Signed-off-by: qizixi --- .../offline_inference/neuron_speculation.py | 1 - tests/neuron/2_core/test_mistral.py | 1 - tests/neuron/2_core/test_multi_lora.py | 2 -- vllm/engine/arg_utils.py | 21 ------------------- 4 files changed, 25 deletions(-) diff --git a/examples/offline_inference/neuron_speculation.py b/examples/offline_inference/neuron_speculation.py index 26276cba202b..7fc22caee742 100644 --- a/examples/offline_inference/neuron_speculation.py +++ b/examples/offline_inference/neuron_speculation.py @@ -37,7 +37,6 @@ def initialize_llm(): max_num_seqs=4, max_model_len=2048, block_size=2048, - use_v2_block_manager=True, device="neuron", tensor_parallel_size=32, ) diff --git a/tests/neuron/2_core/test_mistral.py b/tests/neuron/2_core/test_mistral.py index d02fff943e90..ff59be1725b6 100644 --- a/tests/neuron/2_core/test_mistral.py +++ b/tests/neuron/2_core/test_mistral.py @@ -9,7 +9,6 @@ def test_mistral(): tensor_parallel_size=2, max_num_seqs=4, max_model_len=128, - use_v2_block_manager=True, override_neuron_config={ "sequence_parallel_enabled": False, "skip_warmup": True diff --git a/tests/neuron/2_core/test_multi_lora.py b/tests/neuron/2_core/test_multi_lora.py index 6b97f47d4db3..52ca9fe7b666 100644 --- a/tests/neuron/2_core/test_multi_lora.py +++ b/tests/neuron/2_core/test_multi_lora.py @@ -14,7 +14,6 @@ def test_llama_single_lora(): tensor_parallel_size=2, max_num_seqs=4, max_model_len=512, - use_v2_block_manager=True, override_neuron_config={ "sequence_parallel_enabled": False, "skip_warmup": True, @@ -57,7 +56,6 @@ def test_llama_multiple_lora(): tensor_parallel_size=2, max_num_seqs=4, max_model_len=512, - use_v2_block_manager=True, override_neuron_config={ "sequence_parallel_enabled": False, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 1f74d22d07c1..1e3d46a8d96e 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -313,7 +313,6 @@ class EngineArgs: CacheConfig.prefix_caching_hash_algo disable_sliding_window: bool = ModelConfig.disable_sliding_window disable_cascade_attn: bool = ModelConfig.disable_cascade_attn - use_v2_block_manager: bool = True swap_space: float = CacheConfig.swap_space cpu_offload_gb: float = CacheConfig.cpu_offload_gb gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization @@ -364,7 +363,6 @@ class EngineArgs: max_prompt_adapter_token: int = \ PromptAdapterConfig.max_prompt_adapter_token - device: Device = DeviceConfig.device num_scheduler_steps: int = SchedulerConfig.num_scheduler_steps multi_step_stream_outputs: bool = SchedulerConfig.multi_step_stream_outputs ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight @@ -745,16 +743,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 
"--max-prompt-adapter-token", **prompt_adapter_kwargs["max_prompt_adapter_token"]) - # Device arguments - device_kwargs = get_kwargs(DeviceConfig) - device_group = parser.add_argument_group( - title="DeviceConfig", - description=DeviceConfig.__doc__, - ) - device_group.add_argument("--device", - **device_kwargs["device"], - deprecated=True) - # Speculative arguments speculative_group = parser.add_argument_group( title="SpeculativeConfig", @@ -856,15 +844,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: **vllm_kwargs["additional_config"]) # Other arguments - parser.add_argument('--use-v2-block-manager', - action='store_true', - default=True, - deprecated=True, - help='[DEPRECATED] block manager v1 has been ' - 'removed and SelfAttnBlockSpaceManager (i.e. ' - 'block manager v2) is now the default. ' - 'Setting this flag to True or False' - ' has no effect on vLLM behavior.') parser.add_argument('--disable-log-stats', action='store_true', help='Disable logging statistics.') From 25d0c72806d3c50b6b6eb18cb0b1f57d55c1dd0e Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Tue, 22 Jul 2025 05:27:18 -0700 Subject: [PATCH 22/63] [Core] Optimize update checks in LogitsProcessor (#21245) Signed-off-by: Jialin Ouyang Signed-off-by: qizixi --- vllm/v1/sample/logits_processor.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/vllm/v1/sample/logits_processor.py b/vllm/v1/sample/logits_processor.py index 3a4c25964e70..3a06e71057cd 100644 --- a/vllm/v1/sample/logits_processor.py +++ b/vllm/v1/sample/logits_processor.py @@ -335,14 +335,19 @@ def update_state(self, batch_update: Optional[BatchUpdate]): if not batch_update: return + needs_update: bool = False # Process added requests. - needs_update = bool(batch_update.added) for index, params, _ in batch_update.added: if isinstance(params, SamplingParams) and (lb := params.logit_bias): self.biases[index] = lb + needs_update = True else: - self.biases.pop(index, None) + # Drop biases metadata at batch index + if self.biases.pop(index, None) is not None: + # If a new request replaces an old request which + # specified biases, we should update processor tensors + needs_update = True if self.biases: # Process removed requests. @@ -419,7 +424,6 @@ def update_state(self, batch_update: Optional[BatchUpdate]): if batch_update: # Process added requests. - needs_update |= bool(batch_update.added) for index, params, output_tok_ids in batch_update.added: if (isinstance(params, SamplingParams) and (min_tokens := params.min_tokens) @@ -427,9 +431,13 @@ def update_state(self, batch_update: Optional[BatchUpdate]): # Replace request metadata at batch index self.min_toks[index] = (min_tokens, output_tok_ids, params.all_stop_token_ids) + needs_update = True else: - # Drop request metadata at batch index - self.min_toks.pop(index, None) + # Drop min_toks metadata at batch index + if self.min_toks.pop(index, None) is not None: + # If a new request replaces an old request which + # specified min_toks, we should update processor tensors + needs_update = True if self.min_toks: # Process removed requests. 
From c7f963b20ddee710bd3ed8f7601d8bc234c0b8fa Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Tue, 22 Jul 2025 05:28:00 -0700 Subject: [PATCH 23/63] [benchmark] Port benchmark request sent optimization to benchmark_serving (#21209) Signed-off-by: Jialin Ouyang Signed-off-by: qizixi --- benchmarks/benchmark_serving.py | 98 +-------------------------------- vllm/benchmarks/serve.py | 10 ++-- 2 files changed, 7 insertions(+), 101 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index f3a20842137e..c597fb1068ab 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -30,7 +30,7 @@ import random import time import warnings -from collections.abc import AsyncGenerator, Iterable +from collections.abc import Iterable from dataclasses import dataclass from datetime import datetime from typing import Any, Literal, Optional @@ -73,6 +73,7 @@ VisionArenaDataset, ) from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json +from vllm.benchmarks.serve import get_request MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @@ -107,101 +108,6 @@ class BenchmarkMetrics: percentiles_e2el_ms: list[tuple[float, float]] -def _get_current_request_rate( - ramp_up_strategy: Optional[Literal["linear", "exponential"]], - ramp_up_start_rps: Optional[int], - ramp_up_end_rps: Optional[int], - request_index: int, - total_requests: int, - request_rate: float, -) -> float: - if ( - ramp_up_strategy - and ramp_up_start_rps is not None - and ramp_up_end_rps is not None - ): - progress = request_index / max(total_requests - 1, 1) - if ramp_up_strategy == "linear": - increase = (ramp_up_end_rps - ramp_up_start_rps) * progress - return ramp_up_start_rps + increase - elif ramp_up_strategy == "exponential": - ratio = ramp_up_end_rps / ramp_up_start_rps - return ramp_up_start_rps * (ratio**progress) - else: - raise ValueError(f"Unknown ramp-up strategy: {ramp_up_strategy}") - return request_rate - - -async def get_request( - input_requests: list[SampleRequest], - request_rate: float, - burstiness: float = 1.0, - ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None, - ramp_up_start_rps: Optional[int] = None, - ramp_up_end_rps: Optional[int] = None, -) -> AsyncGenerator[tuple[SampleRequest, float], None]: - """ - Asynchronously generates requests at a specified rate - with OPTIONAL burstiness and OPTIONAL ramp-up strategy. - - Args: - input_requests: - A list of input requests, each represented as a SampleRequest. - request_rate: - The rate at which requests are generated (requests/s). - burstiness (optional): - The burstiness factor of the request generation. - Only takes effect when request_rate is not inf. - Default value is 1, which follows a Poisson process. - Otherwise, the request intervals follow a gamma distribution. - A lower burstiness value (0 < burstiness < 1) results - in more bursty requests, while a higher burstiness value - (burstiness > 1) results in a more uniform arrival of requests. - ramp_up_strategy (optional): - The ramp-up strategy. Can be "linear" or "exponential". - If None, uses constant request rate (specified by request_rate). - ramp_up_start_rps (optional): - The starting request rate for ramp-up. - ramp_up_end_rps (optional): - The ending request rate for ramp-up. - """ - assert burstiness > 0, ( - f"A positive burstiness factor is expected, but given {burstiness}." 
- ) - # Convert to list to get length for ramp-up calculations - if isinstance(input_requests, Iterable) and not isinstance(input_requests, list): - input_requests = list(input_requests) - - total_requests = len(input_requests) - request_index = 0 - - for request in input_requests: - current_request_rate = _get_current_request_rate( - ramp_up_strategy, - ramp_up_start_rps, - ramp_up_end_rps, - request_index, - total_requests, - request_rate, - ) - - yield request, current_request_rate - - request_index += 1 - - if current_request_rate == float("inf"): - # If the request rate is infinity, then we don't need to wait. - continue - - theta = 1.0 / (current_request_rate * burstiness) - - # Sample the request interval from the gamma distribution. - # If burstiness is 1, it follows exponential distribution. - interval = np.random.gamma(shape=burstiness, scale=theta) - # The next request will be sent after the interval. - await asyncio.sleep(interval) - - def calculate_metrics( input_requests: list[SampleRequest], outputs: list[RequestFuncOutput], diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index a4d51936320b..f4506c9ce6f4 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -179,12 +179,12 @@ async def get_request( delay_ts = [delay * normalize_factor for delay in delay_ts] start_ts = time.time() - request_index = 0 for request_index, request in enumerate(input_requests): - current_ts = time.time() - sleep_interval_s = start_ts + delay_ts[request_index] - current_ts - if sleep_interval_s > 0: - await asyncio.sleep(sleep_interval_s) + if delay_ts[request_index] > 0: + current_ts = time.time() + sleep_interval_s = start_ts + delay_ts[request_index] - current_ts + if sleep_interval_s > 0: + await asyncio.sleep(sleep_interval_s) yield request, request_rates[request_index] From 40ab4c4d78d578ce4eea50efdd93be44a76c124f Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Tue, 22 Jul 2025 06:17:47 -0700 Subject: [PATCH 24/63] [Core] Introduce popleft_n and append_n in FreeKVCacheBlockQueue to further optimize block_pool (#21222) Signed-off-by: Jialin Ouyang Signed-off-by: qizixi --- tests/v1/core/test_kv_cache_utils.py | 105 +++++++++++++++++++++++++++ vllm/v1/core/block_pool.py | 40 +++++----- vllm/v1/core/kv_cache_utils.py | 58 +++++++++++++++ 3 files changed, 183 insertions(+), 20 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 68b060156901..ccdbe79dfea4 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -184,6 +184,111 @@ def test_free_kv_cache_block_queue_operations(): assert str(e.value) == "No free blocks available" +def test_free_kv_cache_block_queue_append_n(): + # Create an empty FreeKVCacheBlockQueue with these blocks + queue = FreeKVCacheBlockQueue([]) + blocks = [KVCacheBlock(block_id=i) for i in range(6)] + # Append 0 block + # fake_head->fake_tail + queue.append_n([]) + assert queue.num_free_blocks == 0 + assert (queue.fake_free_list_head.next_free_block + is queue.fake_free_list_tail) + assert (queue.fake_free_list_tail.prev_free_block + is queue.fake_free_list_head) + # Append 1 block + # fake_head->b0->fake_tail + queue.append_n(blocks[0:1]) + assert queue.num_free_blocks == 1 + assert queue.fake_free_list_head.next_free_block is blocks[0] + assert blocks[0].prev_free_block is queue.fake_free_list_head + assert blocks[0].next_free_block is queue.fake_free_list_tail + assert queue.fake_free_list_tail.prev_free_block is blocks[0] + # Append 2 
blocks + # fake_head->b0->b4->b5->fake_tail + queue.append_n(blocks[4:6]) + assert queue.num_free_blocks == 3 + assert queue.fake_free_list_head.next_free_block is blocks[0] + assert blocks[0].prev_free_block is queue.fake_free_list_head + assert blocks[0].next_free_block is blocks[4] + assert blocks[4].prev_free_block is blocks[0] + assert blocks[4].next_free_block is blocks[5] + assert blocks[5].prev_free_block is blocks[4] + assert blocks[5].next_free_block is queue.fake_free_list_tail + assert queue.fake_free_list_tail.prev_free_block is blocks[5] + # Append 3 blocks + # fake_head->b0->b4->b5->b1->b2->b3->fake_tail + queue.append_n(blocks[1:4]) + assert queue.num_free_blocks == 6 + assert queue.fake_free_list_head.next_free_block is blocks[0] + assert blocks[0].prev_free_block is queue.fake_free_list_head + assert blocks[0].next_free_block is blocks[4] + assert blocks[4].prev_free_block is blocks[0] + assert blocks[4].next_free_block is blocks[5] + assert blocks[5].prev_free_block is blocks[4] + assert blocks[5].next_free_block is blocks[1] + assert blocks[1].prev_free_block is blocks[5] + assert blocks[1].next_free_block is blocks[2] + assert blocks[2].prev_free_block is blocks[1] + assert blocks[2].next_free_block is blocks[3] + assert blocks[3].prev_free_block is blocks[2] + assert blocks[3].next_free_block is queue.fake_free_list_tail + assert queue.fake_free_list_tail.prev_free_block is blocks[3] + + +def test_free_kv_cache_block_queue_popleft_n(): + blocks = [KVCacheBlock(block_id=i) for i in range(6)] + # Create a empty FreeKVCacheBlockQueue with these blocks + queue = FreeKVCacheBlockQueue( + [blocks[1], blocks[3], blocks[5], blocks[4], blocks[0], blocks[2]]) + assert queue.num_free_blocks == 6 + assert queue.fake_free_list_head.next_free_block is blocks[1] + assert blocks[1].prev_free_block is queue.fake_free_list_head + assert blocks[1].next_free_block is blocks[3] + assert blocks[3].prev_free_block is blocks[1] + assert blocks[3].next_free_block is blocks[5] + assert blocks[5].prev_free_block is blocks[3] + assert blocks[5].next_free_block is blocks[4] + assert blocks[4].prev_free_block is blocks[5] + assert blocks[4].next_free_block is blocks[0] + assert blocks[0].prev_free_block is blocks[4] + assert blocks[0].next_free_block is blocks[2] + assert blocks[2].prev_free_block is blocks[0] + assert blocks[2].next_free_block is queue.fake_free_list_tail + assert queue.fake_free_list_tail.prev_free_block is blocks[2] + + # Pop 0 block + # fake_head->b1->b3->b5->b4->b0->b2->fake_tail + assert len(queue.popleft_n(0)) == 0 + # Pop 1 block + # fake_head->b3->b5->b4->b0->b2->fake_tail + result_blocks = queue.popleft_n(1) + assert len(result_blocks) == 1 + assert result_blocks[0] is blocks[1] + for block in result_blocks: + assert block.prev_free_block is None + assert block.next_free_block is None + # Pop 2 blocks + # fake_head->b4->b0->b2->fake_tail + result_blocks = queue.popleft_n(2) + assert len(result_blocks) == 2 + assert result_blocks[0] is blocks[3] + assert result_blocks[1] is blocks[5] + for block in result_blocks: + assert block.prev_free_block is None + assert block.next_free_block is None + # Pop 3 blocks + # fake_head->fake_tail + result_blocks = queue.popleft_n(3) + assert len(result_blocks) == 3 + assert result_blocks[0] is blocks[4] + assert result_blocks[1] is blocks[0] + assert result_blocks[2] is blocks[2] + for block in result_blocks: + assert block.prev_free_block is None + assert block.next_free_block is None + + def 
test_free_kv_cache_block_queue_get_all_free_blocks(): # Create a list of KVCacheBlock objects blocks = [KVCacheBlock(block_id=i) for i in range(5)] diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index cbb6bb26822c..5bf4d3a2acb4 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -214,21 +214,18 @@ def get_new_blocks(self, num_blocks: int) -> list[KVCacheBlock]: raise ValueError( f"Cannot get {num_blocks} free blocks from the pool") - ret: list[KVCacheBlock] = [] - idx = 0 - while idx < num_blocks: - # First allocate blocks. - curr_block = self.free_block_queue.popleft() - assert curr_block.ref_cnt == 0 - - # If the block is cached, evict it. - if self.enable_caching: - self._maybe_evict_cached_block(curr_block) - - curr_block.incr_ref() - ret.append(curr_block) - idx += 1 - + ret: list[KVCacheBlock] = self.free_block_queue.popleft_n(num_blocks) + + # In order to only iterate the list once, we duplicated code a bit + if self.enable_caching: + for block in ret: + self._maybe_evict_cached_block(block) + assert block.ref_cnt == 0 + block.ref_cnt += 1 + else: + for block in ret: + assert block.ref_cnt == 0 + block.ref_cnt += 1 return ret def _maybe_evict_cached_block(self, block: KVCacheBlock) -> bool: @@ -289,11 +286,14 @@ def free_blocks(self, ordered_blocks: Iterable[KVCacheBlock]) -> None: ordered_blocks: A list of blocks to free ordered by their eviction priority. """ - for block in ordered_blocks: - block.decr_ref() - # null_block should not be added to the free list. - if block.ref_cnt == 0 and not block.is_null: - self.free_block_queue.append(block) + # Materialize the iterable to allow multiple passes. + blocks_list = list(ordered_blocks) + for block in blocks_list: + block.ref_cnt -= 1 + self.free_block_queue.append_n([ + block for block in blocks_list + if block.ref_cnt == 0 and not block.is_null + ]) def reset_prefix_cache(self) -> bool: """Reset prefix cache. This function may be used in RLHF diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 457d95cc738b..198d79cfb420 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -154,6 +154,8 @@ class KVCacheBlock: # Whether the block is a null block that should never be cached. is_null: bool = False + # TODO(Jialin): For performance, let callers handle ref_cnt bumps to + # avoid function calls. def incr_ref(self): self.ref_cnt += 1 @@ -273,6 +275,39 @@ def popleft(self) -> KVCacheBlock: self.num_free_blocks -= 1 return first_block + def popleft_n(self, n: int) -> list[KVCacheBlock]: + """Pop the first n free blocks and reduce num_free_blocks by n. + + Args: + n: The number of blocks to pop. + + Returns: + A list of n free blocks. + """ + if n == 0: + return [] + assert self.num_free_blocks >= n + self.num_free_blocks -= n + + curr_block = self.fake_free_list_head.next_free_block + # Pop n blocks from the head of the list + ret = [] + for _ in range(n): + assert curr_block is not None + ret.append(curr_block) + last_block = curr_block + curr_block = curr_block.next_free_block + # Reset prev_free_block and next_free_block of all popped blocks + last_block.prev_free_block = None + last_block.next_free_block = None + + if curr_block is not None: + # The queue is not empty, connect the fake head to + # the new first block. 
+ self.fake_free_list_head.next_free_block = curr_block + curr_block.prev_free_block = self.fake_free_list_head + return ret + def remove(self, block: KVCacheBlock) -> None: """Remove a block in the free list and reduce num_free_blocks by 1. @@ -315,6 +350,29 @@ def append(self, block: KVCacheBlock) -> None: self.num_free_blocks += 1 + def append_n(self, blocks: list[KVCacheBlock]) -> None: + """Put a list of blocks back into the free list + + Args: + blocks: The blocks to append. + """ + if len(blocks) == 0: + return + self.num_free_blocks += len(blocks) + + last_block = self.fake_free_list_tail.prev_free_block + assert last_block is not None, ( + "prev_free_block of fake_free_list_tail should always exist") + # Add inter-connections between consecutive blocks + for block in blocks: + block.prev_free_block = last_block + last_block.next_free_block = block + last_block = block + + # Connect the last block of to the fake tail + last_block.next_free_block = self.fake_free_list_tail + self.fake_free_list_tail.prev_free_block = last_block + def get_all_free_blocks(self) -> list[KVCacheBlock]: """Get all free blocks in the free list. Mainly used for testing. From 80634e8b0f66876f99fffdf49453610dbc891c70 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Tue, 22 Jul 2025 21:32:36 +0800 Subject: [PATCH 25/63] [Misc] unify variable for LLM instance v2 (#21356) Signed-off-by: Andy Xie Signed-off-by: qizixi --- tests/models/language/generation/test_gemma.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/models/language/generation/test_gemma.py b/tests/models/language/generation/test_gemma.py index 5be4ae874e61..60a4bc14be88 100644 --- a/tests/models/language/generation/test_gemma.py +++ b/tests/models/language/generation/test_gemma.py @@ -15,13 +15,13 @@ def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None: load_format="dummy", ) as llm: if model == "google/gemma-3-4b-it": - normalizers = llm.model.collective_rpc( + normalizers = llm.llm.collective_rpc( lambda self: self.model_runner.model.language_model.model. normalizer.cpu().item()) - config = llm.model.llm_engine.model_config.hf_config.text_config + config = llm.llm.llm_engine.model_config.hf_config.text_config else: - normalizers = llm.model.collective_rpc( + normalizers = llm.llm.collective_rpc( lambda self: self.model_runner.model.model.normalizer.cpu( ).item()) - config = llm.model.llm_engine.model_config.hf_config + config = llm.llm.llm_engine.model_config.hf_config assert np.allclose(normalizers, config.hidden_size**0.5, rtol=2e-3) From 46b75f4d99991b8931118a1fb632f21aeb444bdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Seznec?= Date: Tue, 22 Jul 2025 16:07:44 +0200 Subject: [PATCH 26/63] [perf] Add fused MLA QKV + strided layernorm (#21116) Signed-off-by: Mickael Seznec Co-authored-by: mgoin Signed-off-by: qizixi --- csrc/layernorm_kernels.cu | 63 +++++++++------ csrc/layernorm_quant_kernels.cu | 39 ++++++---- csrc/quantization/fp8/common.cu | 4 + tests/kernels/core/test_layernorm.py | 26 +++++-- vllm/model_executor/layers/linear.py | 78 ++++++++++++++++++- .../model_executor/layers/quantization/fp8.py | 13 +++- vllm/model_executor/models/deepseek_v2.py | 57 +++++++++----- 7 files changed, 214 insertions(+), 66 deletions(-) diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu index d073dd6d2dee..f051eb070222 100644 --- a/csrc/layernorm_kernels.cu +++ b/csrc/layernorm_kernels.cu @@ -15,15 +15,16 @@ namespace vllm { // TODO(woosuk): Further optimize this kernel. 
template __global__ void rms_norm_kernel( - scalar_t* __restrict__ out, // [..., hidden_size] - const scalar_t* __restrict__ input, // [..., hidden_size] + scalar_t* __restrict__ out, // [..., hidden_size] + const scalar_t* __restrict__ input, // [..., hidden_size] + const int64_t input_stride, const scalar_t* __restrict__ weight, // [hidden_size] const float epsilon, const int num_tokens, const int hidden_size) { __shared__ float s_variance; float variance = 0.0f; for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { - const float x = (float)input[blockIdx.x * hidden_size + idx]; + const float x = (float)input[blockIdx.x * input_stride + idx]; variance += x * x; } @@ -37,7 +38,7 @@ __global__ void rms_norm_kernel( __syncthreads(); for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { - float x = (float)input[blockIdx.x * hidden_size + idx]; + float x = (float)input[blockIdx.x * input_stride + idx]; out[blockIdx.x * hidden_size + idx] = ((scalar_t)(x * s_variance)) * weight[idx]; } @@ -50,7 +51,8 @@ __global__ void rms_norm_kernel( template __global__ std::enable_if_t<(width > 0) && _typeConvert::exists> fused_add_rms_norm_kernel( - scalar_t* __restrict__ input, // [..., hidden_size] + scalar_t* __restrict__ input, // [..., hidden_size] + const int64_t input_stride, scalar_t* __restrict__ residual, // [..., hidden_size] const scalar_t* __restrict__ weight, // [hidden_size] const float epsilon, const int num_tokens, const int hidden_size) { @@ -59,6 +61,7 @@ fused_add_rms_norm_kernel( static_assert(sizeof(_f16Vec) == sizeof(scalar_t) * width); const int vec_hidden_size = hidden_size / width; + const int64_t vec_input_stride = input_stride / width; __shared__ float s_variance; float variance = 0.0f; /* These and the argument pointers are all declared `restrict` as they are @@ -73,7 +76,8 @@ fused_add_rms_norm_kernel( for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { int id = blockIdx.x * vec_hidden_size + idx; - _f16Vec temp = input_v[id]; + int64_t strided_id = blockIdx.x * vec_input_stride + idx; + _f16Vec temp = input_v[strided_id]; temp += residual_v[id]; variance += temp.sum_squares(); residual_v[id] = temp; @@ -90,10 +94,11 @@ fused_add_rms_norm_kernel( for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { int id = blockIdx.x * vec_hidden_size + idx; + int64_t strided_id = blockIdx.x * vec_input_stride + idx; _f16Vec temp = residual_v[id]; temp *= s_variance; temp *= weight_v[idx]; - input_v[id] = temp; + input_v[strided_id] = temp; } } @@ -103,7 +108,8 @@ fused_add_rms_norm_kernel( template __global__ std::enable_if_t<(width == 0) || !_typeConvert::exists> fused_add_rms_norm_kernel( - scalar_t* __restrict__ input, // [..., hidden_size] + scalar_t* __restrict__ input, // [..., hidden_size] + const int64_t input_stride, scalar_t* __restrict__ residual, // [..., hidden_size] const scalar_t* __restrict__ weight, // [hidden_size] const float epsilon, const int num_tokens, const int hidden_size) { @@ -111,7 +117,7 @@ fused_add_rms_norm_kernel( float variance = 0.0f; for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { - scalar_t z = input[blockIdx.x * hidden_size + idx]; + scalar_t z = input[blockIdx.x * input_stride + idx]; z += residual[blockIdx.x * hidden_size + idx]; float x = (float)z; variance += x * x; @@ -129,7 +135,7 @@ fused_add_rms_norm_kernel( for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { float x = (float)residual[blockIdx.x * hidden_size + idx]; - input[blockIdx.x * 
hidden_size + idx] = + input[blockIdx.x * input_stride + idx] = ((scalar_t)(x * s_variance)) * weight[idx]; } } @@ -141,11 +147,12 @@ void rms_norm(torch::Tensor& out, // [..., hidden_size] torch::Tensor& weight, // [hidden_size] double epsilon) { TORCH_CHECK(out.is_contiguous()); - TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(input.stride(-1) == 1); TORCH_CHECK(weight.is_contiguous()); int hidden_size = input.size(-1); int num_tokens = input.numel() / hidden_size; + int64_t input_stride = input.stride(-2); dim3 grid(num_tokens); dim3 block(std::min(hidden_size, 1024)); @@ -153,26 +160,29 @@ void rms_norm(torch::Tensor& out, // [..., hidden_size] const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] { vllm::rms_norm_kernel<<>>( - out.data_ptr(), input.data_ptr(), + out.data_ptr(), input.data_ptr(), input_stride, weight.data_ptr(), epsilon, num_tokens, hidden_size); }); } -#define LAUNCH_FUSED_ADD_RMS_NORM(width) \ - VLLM_DISPATCH_FLOATING_TYPES( \ - input.scalar_type(), "fused_add_rms_norm_kernel", [&] { \ - vllm::fused_add_rms_norm_kernel \ - <<>>(input.data_ptr(), \ - residual.data_ptr(), \ - weight.data_ptr(), epsilon, \ - num_tokens, hidden_size); \ +#define LAUNCH_FUSED_ADD_RMS_NORM(width) \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), "fused_add_rms_norm_kernel", [&] { \ + vllm::fused_add_rms_norm_kernel \ + <<>>( \ + input.data_ptr(), input_stride, \ + residual.data_ptr(), weight.data_ptr(), \ + epsilon, num_tokens, hidden_size); \ }); void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size] torch::Tensor& residual, // [..., hidden_size] torch::Tensor& weight, // [hidden_size] double epsilon) { + TORCH_CHECK(residual.is_contiguous()); + TORCH_CHECK(weight.is_contiguous()); int hidden_size = input.size(-1); + int64_t input_stride = input.stride(-2); int num_tokens = input.numel() / hidden_size; dim3 grid(num_tokens); @@ -194,9 +204,16 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size] auto inp_ptr = reinterpret_cast(input.data_ptr()); auto res_ptr = reinterpret_cast(residual.data_ptr()); auto wt_ptr = reinterpret_cast(weight.data_ptr()); - bool ptrs_are_aligned = - inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0; - if (ptrs_are_aligned && hidden_size % 8 == 0) { + constexpr int vector_width = 8; + constexpr int req_alignment_bytes = + vector_width * 2; // vector_width * sizeof(bfloat16 or float16) (float32 + // falls back to non-vectorized version anyway) + bool ptrs_are_aligned = inp_ptr % req_alignment_bytes == 0 && + res_ptr % req_alignment_bytes == 0 && + wt_ptr % req_alignment_bytes == 0; + bool offsets_are_multiple_of_vector_width = + hidden_size % vector_width == 0 && input_stride % vector_width == 0; + if (ptrs_are_aligned && offsets_are_multiple_of_vector_width) { LAUNCH_FUSED_ADD_RMS_NORM(8); } else { LAUNCH_FUSED_ADD_RMS_NORM(0); diff --git a/csrc/layernorm_quant_kernels.cu b/csrc/layernorm_quant_kernels.cu index d595b9e889c8..0fd5849d9626 100644 --- a/csrc/layernorm_quant_kernels.cu +++ b/csrc/layernorm_quant_kernels.cu @@ -23,8 +23,9 @@ namespace vllm { // TODO(woosuk): Further optimize this kernel. 
template __global__ void rms_norm_static_fp8_quant_kernel( - fp8_type* __restrict__ out, // [..., hidden_size] - const scalar_t* __restrict__ input, // [..., hidden_size] + fp8_type* __restrict__ out, // [..., hidden_size] + const scalar_t* __restrict__ input, // [..., hidden_size] + const int input_stride, const scalar_t* __restrict__ weight, // [hidden_size] const float* __restrict__ scale, // [1] const float epsilon, const int num_tokens, const int hidden_size) { @@ -32,7 +33,7 @@ __global__ void rms_norm_static_fp8_quant_kernel( float variance = 0.0f; for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { - const float x = (float)input[blockIdx.x * hidden_size + idx]; + const float x = (float)input[blockIdx.x * input_stride + idx]; variance += x * x; } @@ -49,7 +50,7 @@ __global__ void rms_norm_static_fp8_quant_kernel( float const scale_inv = 1.0f / *scale; for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { - float x = (float)input[blockIdx.x * hidden_size + idx]; + float x = (float)input[blockIdx.x * input_stride + idx]; float const out_norm = ((scalar_t)(x * s_variance)) * weight[idx]; out[blockIdx.x * hidden_size + idx] = scaled_fp8_conversion(out_norm, scale_inv); @@ -63,8 +64,9 @@ __global__ void rms_norm_static_fp8_quant_kernel( template __global__ std::enable_if_t<(width > 0) && _typeConvert::exists> fused_add_rms_norm_static_fp8_quant_kernel( - fp8_type* __restrict__ out, // [..., hidden_size] - scalar_t* __restrict__ input, // [..., hidden_size] + fp8_type* __restrict__ out, // [..., hidden_size] + scalar_t* __restrict__ input, // [..., hidden_size] + const int input_stride, scalar_t* __restrict__ residual, // [..., hidden_size] const scalar_t* __restrict__ weight, // [hidden_size] const float* __restrict__ scale, // [1] @@ -74,6 +76,7 @@ fused_add_rms_norm_static_fp8_quant_kernel( static_assert(sizeof(_f16Vec) == sizeof(scalar_t) * width); const int vec_hidden_size = hidden_size / width; + const int vec_input_stride = input_stride / width; __shared__ float s_variance; float variance = 0.0f; /* These and the argument pointers are all declared `restrict` as they are @@ -87,8 +90,9 @@ fused_add_rms_norm_static_fp8_quant_kernel( reinterpret_cast*>(weight); for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { + int stride_id = blockIdx.x * vec_input_stride + idx; int id = blockIdx.x * vec_hidden_size + idx; - _f16Vec temp = input_v[id]; + _f16Vec temp = input_v[stride_id]; temp += residual_v[id]; variance += temp.sum_squares(); residual_v[id] = temp; @@ -125,8 +129,9 @@ fused_add_rms_norm_static_fp8_quant_kernel( template __global__ std::enable_if_t<(width == 0) || !_typeConvert::exists> fused_add_rms_norm_static_fp8_quant_kernel( - fp8_type* __restrict__ out, // [..., hidden_size] - scalar_t* __restrict__ input, // [..., hidden_size] + fp8_type* __restrict__ out, // [..., hidden_size] + scalar_t* __restrict__ input, // [..., hidden_size] + const int input_stride, scalar_t* __restrict__ residual, // [..., hidden_size] const scalar_t* __restrict__ weight, // [hidden_size] const float* __restrict__ scale, // [1] @@ -135,7 +140,7 @@ fused_add_rms_norm_static_fp8_quant_kernel( float variance = 0.0f; for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { - scalar_t z = input[blockIdx.x * hidden_size + idx]; + scalar_t z = input[blockIdx.x * input_stride + idx]; z += residual[blockIdx.x * hidden_size + idx]; float x = (float)z; variance += x * x; @@ -169,7 +174,9 @@ void rms_norm_static_fp8_quant(torch::Tensor& out, // [..., 
hidden_size] torch::Tensor& weight, // [hidden_size] torch::Tensor& scale, // [1] double epsilon) { + TORCH_CHECK(out.is_contiguous()); int hidden_size = input.size(-1); + int input_stride = input.stride(-2); int num_tokens = input.numel() / hidden_size; dim3 grid(num_tokens); @@ -183,8 +190,9 @@ void rms_norm_static_fp8_quant(torch::Tensor& out, // [..., hidden_size] vllm::rms_norm_static_fp8_quant_kernel <<>>( out.data_ptr(), input.data_ptr(), - weight.data_ptr(), scale.data_ptr(), - epsilon, num_tokens, hidden_size); + input_stride, weight.data_ptr(), + scale.data_ptr(), epsilon, num_tokens, + hidden_size); }); }); } @@ -198,7 +206,7 @@ void rms_norm_static_fp8_quant(torch::Tensor& out, // [..., hidden_size] width, fp8_t> \ <<>>( \ out.data_ptr(), input.data_ptr(), \ - residual.data_ptr(), \ + input_stride, residual.data_ptr(), \ weight.data_ptr(), scale.data_ptr(), \ epsilon, num_tokens, hidden_size); \ }); \ @@ -210,7 +218,10 @@ void fused_add_rms_norm_static_fp8_quant( torch::Tensor& weight, // [hidden_size] torch::Tensor& scale, // [1] double epsilon) { + TORCH_CHECK(out.is_contiguous()); + TORCH_CHECK(residual.is_contiguous()); int hidden_size = input.size(-1); + int input_stride = input.stride(-2); int num_tokens = input.numel() / hidden_size; dim3 grid(num_tokens); @@ -234,7 +245,7 @@ void fused_add_rms_norm_static_fp8_quant( auto wt_ptr = reinterpret_cast(weight.data_ptr()); bool ptrs_are_aligned = inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0; - if (ptrs_are_aligned && hidden_size % 8 == 0) { + if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0) { LAUNCH_FUSED_ADD_RMS_NORM(8); } else { LAUNCH_FUSED_ADD_RMS_NORM(0); diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu index f3f9f669e00a..0e1eab66f0b9 100644 --- a/csrc/quantization/fp8/common.cu +++ b/csrc/quantization/fp8/common.cu @@ -88,6 +88,8 @@ void static_scaled_fp8_quant(torch::Tensor& out, // [..., d] torch::Tensor const& input, // [..., d] torch::Tensor const& scale) // [1] { + TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(out.is_contiguous()); int const block_size = 256; int const num_tokens = input.numel() / input.size(-1); int const num_elems = input.numel(); @@ -111,6 +113,8 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out, // [..., d] torch::Tensor const& input, // [..., d] torch::Tensor& scale) // [1] { + TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(out.is_contiguous()); int const block_size = 256; int const num_tokens = input.numel() / input.size(-1); int const num_elems = input.numel(); diff --git a/tests/kernels/core/test_layernorm.py b/tests/kernels/core/test_layernorm.py index 3eac062738f8..02316ceaac73 100644 --- a/tests/kernels/core/test_layernorm.py +++ b/tests/kernels/core/test_layernorm.py @@ -26,6 +26,7 @@ @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("strided_input", [False, True]) @torch.inference_mode() def test_rms_norm( num_tokens: int, @@ -34,13 +35,17 @@ def test_rms_norm( dtype: torch.dtype, seed: int, device: str, + strided_input: bool, ) -> None: current_platform.seed_everything(seed) torch.set_default_device(device) layer = RMSNorm(hidden_size).to(dtype=dtype) layer.weight.data.normal_(mean=1.0, std=0.1) scale = 1 / (2 * hidden_size) - x = torch.randn(num_tokens, hidden_size, dtype=dtype) + last_dim = 2 * hidden_size if strided_input else hidden_size + x = torch.randn(num_tokens, last_dim, dtype=dtype) + x = x[..., 
:hidden_size] + assert x.is_contiguous() != strided_input x *= scale residual = torch.randn_like(x) * scale if add_residual else None @@ -72,6 +77,7 @@ def test_rms_norm( @pytest.mark.parametrize("quant_scale", [1.0, 0.01, 10.0]) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("strided_input", [False, True]) def test_fused_rms_norm_quant( num_tokens: int, hidden_size: int, @@ -80,13 +86,18 @@ def test_fused_rms_norm_quant( quant_scale: float, seed: int, device: str, + strided_input: bool, ) -> None: current_platform.seed_everything(seed) torch.set_default_device(device) weight = torch.empty(hidden_size, dtype=dtype).normal_(mean=1.0, std=0.1) scale = 1 / (2 * hidden_size) - x = torch.randn(num_tokens, hidden_size, dtype=dtype) + last_dim = 2 * hidden_size if strided_input else hidden_size + x_base = torch.randn(num_tokens, last_dim, dtype=dtype) + x = x_base[..., :hidden_size] + assert x.is_contiguous() != strided_input + x *= scale if add_residual: residual = torch.randn_like(x) * scale @@ -106,9 +117,11 @@ def test_fused_rms_norm_quant( # Unfused kernel is in-place so it goes second # Also use a separate clone of x to avoid modifying the input - x_unfused = x.clone() + x_unfused_base = x_base.clone() + x_unfused = x_unfused_base[..., :hidden_size] + assert x_unfused.is_contiguous() != strided_input torch.ops._C.fused_add_rms_norm(x_unfused, residual, weight, 1e-6) - torch.ops._C.static_scaled_fp8_quant(out_quant, x_unfused, + torch.ops._C.static_scaled_fp8_quant(out_quant, x_unfused.contiguous(), quant_scale_t) torch.cuda.synchronize() @@ -116,7 +129,6 @@ def test_fused_rms_norm_quant( residual, atol=1e-2, rtol=1e-2) - opcheck( torch.ops._C.fused_add_rms_norm_static_fp8_quant, (out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6)) @@ -131,7 +143,7 @@ def test_fused_rms_norm_quant( opcheck(torch.ops._C.rms_norm_static_fp8_quant, (out_quant_fused, x, weight, quant_scale_t, 1e-6)) - torch.testing.assert_close(out_quant_fused.to(dtype=torch.float32), - out_quant.to(dtype=torch.float32), + torch.testing.assert_close(out_quant.to(dtype=torch.float32), + out_quant_fused.to(dtype=torch.float32), atol=1e-3, rtol=1e-3) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 366dfd97d816..bb81a663d454 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -259,6 +259,8 @@ def __init__( if params_dtype is None: params_dtype = torch.get_default_dtype() self.params_dtype = params_dtype + self.quant_config = quant_config + self.prefix = prefix if quant_config is None: self.quant_method: Optional[ QuantizeMethodBase] = UnquantizedLinearMethod() @@ -300,6 +302,12 @@ def __init__( *, return_bias: bool = True, ): + # If MergedReplicatedLinear, use output size of each partition. + if hasattr(self, "output_sizes"): + self.output_partition_sizes = self.output_sizes + else: + self.output_partition_sizes = [output_size] + super().__init__(input_size, output_size, skip_bias_add, @@ -311,7 +319,8 @@ def __init__( # All the linear layer supports quant method. assert self.quant_method is not None self.quant_method.create_weights(self, - self.input_size, [self.output_size], + self.input_size, + self.output_partition_sizes, self.input_size, self.output_size, self.params_dtype, @@ -367,6 +376,73 @@ def extra_repr(self) -> str: return s +class MergedReplicatedLinear(ReplicatedLinear): + """Replicated linear layer. 
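+
+    Fuses several output projections (one per entry in output_sizes) into a
+    single replicated weight and matmul, analogous to
+    MergedColumnParallelLinear but without tensor parallelism.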
+ + Args: + input_size: input dimension of the linear layer. + output_size: output dimension of the linear layer. + bias: If true, add bias. + skip_bias_add: If true, skip adding bias but instead return it. + params_dtype: Data type for the parameters. + quant_config: Quantization configure. + prefix: The name of the layer in the state dict, including all parents + (e.g. model.layers.0.qkv_proj) + """ + + def __init__( + self, + input_size: int, + output_sizes: list[int], + bias: bool = True, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + *, + return_bias: bool = True, + ): + self.output_sizes = output_sizes + super().__init__(input_size, + sum(output_sizes), + bias, + skip_bias_add, + params_dtype, + quant_config, + prefix=prefix, + return_bias=return_bias) + + def weight_loader(self, + param: Union[Parameter, BasevLLMParameter], + loaded_weight: torch.Tensor, + loaded_shard_id: Optional[int] = None): + assert loaded_shard_id is not None + assert loaded_shard_id < len(self.output_sizes) + + if isinstance(param, BlockQuantScaleParameter): + from vllm.model_executor.layers.quantization.fp8 import ( + Fp8LinearMethod, Fp8MoEMethod) + assert self.quant_method is not None + assert isinstance(self.quant_method, + (Fp8LinearMethod, Fp8MoEMethod)) + weight_block_size = self.quant_method.quant_config.weight_block_size + assert weight_block_size is not None + block_n, _ = weight_block_size[0], weight_block_size[1] + shard_offset = ( + (sum(self.output_sizes[:loaded_shard_id]) + block_n - 1) // + block_n) + shard_size = ((self.output_sizes[loaded_shard_id] + block_n - 1) // + block_n) + elif isinstance(param, PerTensorScaleParameter): + shard_offset = loaded_shard_id + shard_size = 1 + else: + shard_offset = sum(self.output_sizes[:loaded_shard_id]) + shard_size = self.output_sizes[loaded_shard_id] + + param[shard_offset:shard_offset + shard_size] = loaded_weight + + class ColumnParallelLinear(LinearBase): """Linear layer with column parallelism. 
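The shard bookkeeping in MergedReplicatedLinear.weight_loader is easiest to see
with concrete numbers. A minimal standalone sketch (illustrative only, not part
of the patch; the sizes are DeepSeek-style values for
[q_lora_rank, kv_lora_rank + qk_rope_head_dim]):

    # Each sub-projection i occupies rows
    # [sum(output_sizes[:i]), sum(output_sizes[:i]) + output_sizes[i])
    # of the fused replicated weight (plain, unquantized parameters).
    output_sizes = [1536, 576]

    def shard_range(shard_id: int) -> tuple[int, int]:
        offset = sum(output_sizes[:shard_id])
        size = output_sizes[shard_id]
        return offset, size

    assert shard_range(0) == (0, 1536)    # rows loaded from q_a_proj
    assert shard_range(1) == (1536, 576)  # rows loaded from kv_a_proj_with_mqa

For BlockQuantScaleParameter the same offsets and sizes are divided by the
block size block_n (rounded up), as done in the weight_loader above.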
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 35d7545d8c6a..75f8adf34f7d 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -257,9 +257,16 @@ def create_weights( f"{input_size_per_partition} is not divisible by " f"weight quantization block_k = {block_k}.") # Required by column parallel or enabling merged weights - if (tp_size > 1 and output_size // output_size_per_partition - == tp_size) or len(output_partition_sizes) > 1: - for output_partition_size in output_partition_sizes: + is_tp_split = (tp_size > 1 and + output_size // output_size_per_partition == tp_size) + is_merged_gemm = len(output_partition_sizes) > 1 + if is_tp_split or is_merged_gemm: + sizes_to_check = output_partition_sizes + if not is_tp_split and is_merged_gemm: + # In case of merged matrices, we allow the last + # matrix to not be a multiple of block size + sizes_to_check = output_partition_sizes[:-1] + for output_partition_size in sizes_to_check: if output_partition_size % block_n != 0: raise ValueError( f"Weight output_partition_size = " diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 5106b9914b5e..649109777b3f 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -42,6 +42,7 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, + MergedReplicatedLinear, ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -336,7 +337,7 @@ def forward( kv_a, _ = latent_cache.split( [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) latent_cache = latent_cache.unsqueeze(1) - kv_a = self.kv_a_layernorm(kv_a.contiguous()) + kv_a = self.kv_a_layernorm(kv_a) kv = self.kv_b_proj(kv_a)[0] kv = kv.view(-1, self.num_local_heads, self.qk_nope_head_dim + self.v_head_dim) @@ -407,14 +408,24 @@ def __init__( self.max_position_embeddings = max_position_embeddings if self.q_lora_rank is not None: - self.q_a_proj = ReplicatedLinear(self.hidden_size, - self.q_lora_rank, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.q_a_proj") + self.fused_qkv_a_proj = MergedReplicatedLinear( + self.hidden_size, + [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim], + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.fused_qkv_a_proj") + else: + self.kv_a_proj_with_mqa = ReplicatedLinear( + self.hidden_size, + self.kv_lora_rank + self.qk_rope_head_dim, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.kv_a_proj_with_mqa") + + if self.q_lora_rank is not None: self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps) - self.q_b_proj = ColumnParallelLinear(q_lora_rank, + self.q_b_proj = ColumnParallelLinear(self.q_lora_rank, self.num_heads * self.qk_head_dim, bias=False, @@ -427,13 +438,6 @@ def __init__( bias=False, quant_config=quant_config, prefix=f"{prefix}.q_proj") - - self.kv_a_proj_with_mqa = ReplicatedLinear( - self.hidden_size, - self.kv_lora_rank + self.qk_rope_head_dim, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.kv_a_proj_with_mqa") self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps) self.kv_b_proj = ColumnParallelLinear( @@ -495,15 +499,24 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, ) -> torch.Tensor: + q_c = None + kv_lora = None 
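+        # When q_lora_rank is set, a single fused GEMM (fused_qkv_a_proj)
+        # produces both the q latent and the kv latent + rope slice;
+        # otherwise kv_a_proj_with_mqa runs as a separate projection.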
+ if self.q_lora_rank is not None: - q_c = self.q_a_proj(hidden_states)[0] + qkv_lora = self.fused_qkv_a_proj(hidden_states)[0] + q_c, kv_lora = qkv_lora.split( + [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim], + dim=-1, + ) q_c = self.q_a_layernorm(q_c) q = self.q_b_proj(q_c)[0] else: + kv_lora = self.kv_a_proj_with_mqa(hidden_states)[0] q = self.q_proj(hidden_states)[0] - kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split( - [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) - kv_c_normed = self.kv_a_layernorm(kv_c.contiguous()) + + kv_c, k_pe = kv_lora.split([self.kv_lora_rank, self.qk_rope_head_dim], + dim=-1) + kv_c_normed = self.kv_a_layernorm(kv_c) q = q.view(-1, self.num_local_heads, self.qk_head_dim) # Add head dim of 1 to k_pe @@ -837,6 +850,8 @@ def load_weights(self, weights: Iterable[tuple[str, # (param_name, shard_name, shard_id) ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), + ("fused_qkv_a_proj", "q_a_proj", 0), + ("fused_qkv_a_proj", "kv_a_proj_with_mqa", 1), ] # Params for weights, fp8 weight scales, fp8 activation scales @@ -871,6 +886,12 @@ def load_weights(self, weights: Iterable[tuple[str, if (("mlp.experts." in name) and name not in params_dict): continue name = name.replace(weight_name, param_name) + + # QKV fusion is optional, fall back to normal + # weight loading if it's not enabled + if ((param_name == "fused_qkv_a_proj") + and name not in params_dict): + continue # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue From 29646b5b7a6bdad4b7db0c40bc067b05a9f58f7b Mon Sep 17 00:00:00 2001 From: Duncan Moss Date: Tue, 22 Jul 2025 07:27:12 -0700 Subject: [PATCH 27/63] [feat]: add SM100 support for cutlass FP8 groupGEMM (#20447) Signed-off-by: Duncan Moss Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com> Co-authored-by: jiahanc <173873397+jiahanc@users.noreply.github.com> Co-authored-by: mgoin Signed-off-by: qizixi --- CMakeLists.txt | 22 ++- .../cutlass_w8a8/moe/grouped_mm_c3x.cuh | 13 +- .../cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu | 140 ++++++++++++++++++ ...ouped_mm_c3x.cu => grouped_mm_c3x_sm90.cu} | 30 ++-- .../quantization/cutlass_w8a8/moe/moe_data.cu | 2 +- .../cutlass_w8a8/scaled_mm_entry.cu | 45 ++++-- .../compressed_tensors/compressed_tensors.py | 6 + .../compressed_tensors_moe.py | 29 +++- 8 files changed, 255 insertions(+), 32 deletions(-) create mode 100644 csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu rename csrc/quantization/cutlass_w8a8/moe/{grouped_mm_c3x.cu => grouped_mm_c3x_sm90.cu} (88%) diff --git a/CMakeLists.txt b/CMakeLists.txt index edc64f87730a..10f8667db649 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -577,7 +577,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # if it's possible to compile MoE kernels that use its output. 
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) - set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu") + set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_ARCHS}") @@ -595,6 +595,26 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() endif() + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) + set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${SCALED_MM_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1") + message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) + message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is " + "not >= 12.8, we recommend upgrading to CUDA 12.8 or later " + "if you intend on running FP8 quantized MoE models on Blackwell.") + else() + message(STATUS "Not building grouped_mm_c3x as no compatible archs found " + "in CUDA target architectures.") + endif() + endif() + # moe_data.cu is used by all CUTLASS MoE kernels. cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS) diff --git a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh index 3225378a6ca0..659941de182e 100644 --- a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh +++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh @@ -18,7 +18,6 @@ using ProblemShape = cutlass::gemm::GroupProblemShape>; using ElementAccumulator = float; -using ArchTag = cutlass::arch::Sm90; using OperatorClass = cutlass::arch::OpClassTensorOp; using LayoutA = cutlass::layout::RowMajor; @@ -33,7 +32,7 @@ using LayoutD_Transpose = using LayoutC = LayoutD; using LayoutC_Transpose = LayoutD_Transpose; -template typename Epilogue_, typename TileShape, typename ClusterShape, typename KernelSchedule, typename EpilogueSchedule, bool swap_ab_ = false> @@ -43,6 +42,7 @@ struct cutlass_3x_group_gemm { using ElementC = void; using ElementD = ElementC_; using ElementAccumulator = float; + using ArchTag = ArchTag_; using Epilogue = Epilogue_; @@ -77,7 +77,7 @@ struct cutlass_3x_group_gemm { LayoutB*, AlignmentAB, ElementAccumulator, TileShape, ClusterShape, Stages, KernelSchedule>::CollectiveOp>; - using KernelType = enable_sm90_only>; struct GemmKernel : public KernelType {}; @@ -156,9 +156,14 @@ void cutlass_group_gemm_caller( static_cast(out_ptrs.data_ptr()), static_cast(c_strides.data_ptr())}; + int device_id = a_tensors.device().index(); + static const cutlass::KernelHardwareInfo hw_info{ + device_id, cutlass::KernelHardwareInfo::query_device_multiprocessor_count( + device_id)}; + typename GemmKernel::Arguments args{ cutlass::gemm::GemmUniversalMode::kGrouped, prob_shape, mainloop_args, - epilogue_args}; + epilogue_args, hw_info}; using GemmOp = cutlass::gemm::device::GemmUniversalAdapter; GemmOp gemm_op; diff --git a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu new file mode 100644 index 000000000000..641e5997f0fd --- 
/dev/null +++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu @@ -0,0 +1,140 @@ +#include + +#include +#include + +#include "cutlass/cutlass.h" +#include "grouped_mm_c3x.cuh" + +using namespace cute; + +namespace { + +template typename Epilogue> +struct sm100_fp8_config_default { + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; + using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; + using TileShape = cute::Shape; + using ClusterShape = cute::Shape; + using ArchTag = cutlass::arch::Sm100; + + using Cutlass3xGemm = + cutlass_3x_group_gemm; +}; + +template typename Epilogue> +struct sm100_fp8_config_M64 { + // M in [1,64] + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; + using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; + using TileShape = cute::Shape; + using ClusterShape = cute::Shape; + using ArchTag = cutlass::arch::Sm100; + + using Cutlass3xGemm = + cutlass_3x_group_gemm; +}; + +template typename Epilogue> +struct sm100_fp8_config_N8192 { + // N in [8192, inf) + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; + using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; + using TileShape = cute::Shape; + using ClusterShape = cute::Shape; + using ArchTag = cutlass::arch::Sm100; + + using Cutlass3xGemm = + cutlass_3x_group_gemm; +}; + +template +void run_cutlass_moe_mm_sm100( + torch::Tensor& out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides, + bool per_act_token, bool per_out_ch) { + TORCH_CHECK(a_tensors.size(0) > 0, "No input A tensors provided."); + TORCH_CHECK(b_tensors.size(0) > 0, "No input B tensors provided."); + TORCH_CHECK(out_tensors.size(0) > 0, "No output tensors provided."); + + TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn, + "A tensors must be of type float8_e4m3fn."); + TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn, + "B tensors must be of type float8_e4m3fn."); + + using Cutlass3xGemmDefault = typename sm100_fp8_config_default< + InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm; + using Cutlass3xGemmN8192 = typename sm100_fp8_config_N8192< + InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm; + using Cutlass3xGemmM64 = typename sm100_fp8_config_M64< + InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm; + + uint32_t const m = a_tensors.size(0); + uint32_t const n = out_tensors.size(1); + + if (m <= 64) { + cutlass_group_gemm_caller( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, + problem_sizes, a_strides, b_strides, c_strides, per_act_token, + per_out_ch); + } else if (n >= 8192) { + cutlass_group_gemm_caller( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, + problem_sizes, a_strides, b_strides, c_strides, per_act_token, + per_out_ch); + } else { + cutlass_group_gemm_caller( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, + problem_sizes, a_strides, b_strides, c_strides, per_act_token, + per_out_ch); + } +} +} // namespace + +void dispatch_moe_mm_sm100( + torch::Tensor& out_tensors, torch::Tensor const& 
a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides, + bool per_act_token, bool per_out_ch) { + if (out_tensors.dtype() == torch::kBFloat16) { + run_cutlass_moe_mm_sm100( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, + problem_sizes, a_strides, b_strides, c_strides, per_act_token, + per_out_ch); + } else { + run_cutlass_moe_mm_sm100( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, + problem_sizes, a_strides, b_strides, c_strides, per_act_token, + per_out_ch); + } +} + +void cutlass_moe_mm_sm100( + torch::Tensor& out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides, + bool per_act_token, bool per_out_ch) { + dispatch_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales, + expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, per_act_token, per_out_ch); +} diff --git a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu similarity index 88% rename from csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu rename to csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu index b024482208d3..8f21623b52fa 100644 --- a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu +++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu @@ -21,10 +21,11 @@ struct sm90_fp8_config_default { cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong; using TileShape = cute::Shape; using ClusterShape = cute::Shape; + using ArchTag = cutlass::arch::Sm90; using Cutlass3xGemm = - cutlass_3x_group_gemm; + cutlass_3x_group_gemm; }; template ; using ClusterShape = cute::Shape; + using ArchTag = cutlass::arch::Sm90; using Cutlass3xGemm = - cutlass_3x_group_gemm; + cutlass_3x_group_gemm; }; template ; using ClusterShape = cute::Shape; + using ArchTag = cutlass::arch::Sm90; using Cutlass3xGemm = - cutlass_3x_group_gemm; + cutlass_3x_group_gemm; }; template ; using ClusterShape = cute::Shape; + using ArchTag = cutlass::arch::Sm90; using Cutlass3xGemm = - cutlass_3x_group_gemm; + cutlass_3x_group_gemm; }; template ; using ClusterShape = cute::Shape; + using ArchTag = cutlass::arch::Sm90; using Cutlass3xGemm = - cutlass_3x_group_gemm; + cutlass_3x_group_gemm; }; template @@ -112,9 +119,6 @@ void run_cutlass_moe_mm_sm90( TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn, "B tensors must be of type float8_e4m3fn."); - TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn); - TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn); - using Cutlass3xGemmN8192 = typename sm90_fp8_config_N8192< InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm; using Cutlass3xGemmK8192 = typename sm90_fp8_config_K8192< diff --git a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu index 623c9a2f096b..993c30c48c84 100644 --- a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu +++ b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu @@ -190,4 +190,4 @@ void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets, static_cast(problem_sizes2.data_ptr()), 
static_cast(expert_num_tokens.data_ptr()), padded_m, n, k); -} +} \ No newline at end of file diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu index 31b60488dfb7..106bacb4883c 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu @@ -41,6 +41,16 @@ void cutlass_moe_mm_sm90( #endif +#if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100 +void cutlass_moe_mm_sm100( + torch::Tensor& out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides, + bool per_act_token, bool per_out_ch); +#endif + #if defined ENABLE_SCALED_MM_SM120 && ENABLE_SCALED_MM_SM120 void cutlass_scaled_mm_sm120(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, @@ -130,10 +140,10 @@ bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability) { // and at least SM90 (Hopper) #if defined CUDA_VERSION - if (cuda_device_capability >= 90 && cuda_device_capability < 100) { - return CUDA_VERSION >= 12000; - } else if (cuda_device_capability >= 100) { + if (cuda_device_capability >= 100) { return CUDA_VERSION >= 12080; + } else if (cuda_device_capability >= 90) { + return CUDA_VERSION >= 12000; } #endif @@ -141,11 +151,14 @@ bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability) { } bool cutlass_group_gemm_supported(int64_t cuda_device_capability) { - // CUTLASS grouped FP8 kernels need at least CUDA 12.3 - // and SM90 (Hopper) + // CUTLASS grouped FP8 kernels need at least CUDA 12.3 and SM90 (Hopper) + // or CUDA 12.8 and SM100 (Blackwell) #if defined CUDA_VERSION - if (cuda_device_capability == 90) { + if (cuda_device_capability >= 100) { + return CUDA_VERSION >= 12080; + } + if (cuda_device_capability >= 90) { return CUDA_VERSION >= 12030; } #endif @@ -234,16 +247,26 @@ void cutlass_moe_mm( torch::Tensor const& b_strides, torch::Tensor const& c_strides, bool per_act_token, bool per_out_ch) { int32_t version_num = get_sm_version_num(); +#if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100 + if (version_num >= 100) { + cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales, + expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, per_act_token, per_out_ch); + return; + } +#endif #if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90 - cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales, - expert_offsets, problem_sizes, a_strides, b_strides, - c_strides, per_act_token, per_out_ch); - return; + if (version_num >= 90) { + cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales, + expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, per_act_token, per_out_ch); + return; + } #endif TORCH_CHECK_NOT_IMPLEMENTED( false, "No compiled cutlass_scaled_mm for CUDA device capability: ", version_num, - ". Required capability: 90"); + ". 
Required capability: 90 or 100"); } void get_cutlass_moe_mm_data( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index e7f65d13181d..90b45e32a688 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -332,6 +332,12 @@ def _is_fp8_w8a8_sm90(self, weight_quant: BaseModel, return (self._check_scheme_supported(90, error=False, match_exact=True) and self._is_fp8_w8a8(weight_quant, input_quant)) + def _is_fp8_w8a8_sm100(self, weight_quant: BaseModel, + input_quant: BaseModel) -> bool: + return (self._check_scheme_supported( + 100, error=False, match_exact=True) + and self._is_fp8_w8a8(weight_quant, input_quant)) + def _is_fp8_w8a16(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool: # Confirm weights quantized. diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 2c93977beede..7da52ce6ff8c 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -83,7 +83,8 @@ def get_moe_method( return CompressedTensorsWNA16MarlinMoEMethod(quant_config) elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant): return CompressedTensorsW4A4MoeMethod() - elif quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant): + elif (quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant) + or quant_config._is_fp8_w8a8_sm100(weight_quant, input_quant)): return CompressedTensorsW8A8Fp8MoECutlassMethod(quant_config) elif quant_config._is_fp8_w8a8(weight_quant, input_quant): return CompressedTensorsW8A8Fp8MoEMethod(quant_config) @@ -740,6 +741,8 @@ def __init__( self.topk_indices_dtype = None self.fused_experts = None # type: ignore self.disable_expert_map = False + self.is_fp8_w8a8_sm100 = self.quant_config._is_fp8_w8a8_sm100( + self.weight_quant, self.input_quant) def create_weights(self, layer: torch.nn.Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -931,7 +934,29 @@ def apply( per_act_token = ( self.input_quant.strategy == QuantizationStrategy.TOKEN) - + per_channel_quant = ( + self.weight_quant.strategy == QuantizationStrategy.CHANNEL) + # Triton fused_experts is faster in small batch sizes on SM100. + # Fall back to fused_experts in small batch sizes. 
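+        # (topk_ids has shape [num_tokens, top_k], so topk_ids.shape[0] is
+        #  the number of tokens in the batch; the <= 8 cutoff below is a
+        #  heuristic threshold.)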
+ if self.is_fp8_w8a8_sm100 and topk_ids.shape[0] <= 8: + from vllm.model_executor.layers.fused_moe import fused_experts + return fused_experts( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights, + topk_ids, + inplace=True, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + use_fp8_w8a8=True, + per_channel_quant=per_channel_quant, + global_num_experts=global_num_experts, + expert_map=None if self.disable_expert_map else expert_map, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale) if self.fused_experts is None: # If no modular kernel is provided, use cutlass_moe_fp8 from vllm.model_executor.layers.fused_moe.cutlass_moe import ( From a7cae7c69be2298c4b01eff3f1fa590b5c0fc498 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 22 Jul 2025 10:27:15 -0400 Subject: [PATCH 28/63] [Perf] Cuda Kernel for Per Token Group Quant (#21083) Signed-off-by: yewentao256 Signed-off-by: qizixi --- CMakeLists.txt | 1 + csrc/ops.h | 5 + .../quantization/fp8/per_token_group_quant.cu | 213 ++++++++++++++++++ csrc/torch_bindings.cpp | 9 + .../test_per_token_group_quant.py | 44 ++++ .../layers/quantization/utils/fp8_utils.py | 17 +- 6 files changed, 285 insertions(+), 4 deletions(-) create mode 100644 csrc/quantization/fp8/per_token_group_quant.cu create mode 100644 tests/kernels/quantization/test_per_token_group_quant.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 10f8667db649..767e9ad7541b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -245,6 +245,7 @@ set(VLLM_EXT_SRC "csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/fp8/common.cu" + "csrc/quantization/fp8/per_token_group_quant.cu" "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu" "csrc/quantization/gguf/gguf_kernel.cu" "csrc/quantization/activation_kernels.cu" diff --git a/csrc/ops.h b/csrc/ops.h index 7f3e6b6923a3..fdd3071c56ef 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -297,6 +297,11 @@ void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scales, std::optional const& azp); +void per_token_group_quant_fp8(const torch::Tensor& input, + torch::Tensor& output_q, torch::Tensor& output_s, + int64_t group_size, double eps, double fp8_min, + double fp8_max, bool scale_ue8m0); + torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight, torch::Tensor b_gptq_qzeros, torch::Tensor b_gptq_scales, torch::Tensor b_g_idx, diff --git a/csrc/quantization/fp8/per_token_group_quant.cu b/csrc/quantization/fp8/per_token_group_quant.cu new file mode 100644 index 000000000000..afc41faeca90 --- /dev/null +++ b/csrc/quantization/fp8/per_token_group_quant.cu @@ -0,0 +1,213 @@ +#include +#include + +#include + +#include +#include + +#include + +#include "../vectorization.cuh" +#include "../vectorization_utils.cuh" +#include "../../dispatch_utils.h" + +__device__ __forceinline__ float GroupReduceMax(float val, const int tid) { + unsigned mask = 0xffff; + + val = fmaxf(val, __shfl_xor_sync(mask, val, 8)); + val = fmaxf(val, __shfl_xor_sync(mask, val, 4)); + val = fmaxf(val, __shfl_xor_sync(mask, val, 2)); + val = fmaxf(val, __shfl_xor_sync(mask, val, 1)); + return val; +} + +template +__global__ void per_token_group_quant_8bit_kernel( + const T* __restrict__ input, void* __restrict__ output_q, + scale_packed_t* __restrict__ output_s, const int group_size, + 
const int num_groups, const int groups_per_block, const float eps, + const float min_8bit, const float max_8bit, const int scale_num_rows = 0, + const int scale_stride = 0) { + const int threads_per_group = 16; + const int64_t local_group_id = threadIdx.x / threads_per_group; + const int lane_id = threadIdx.x % threads_per_group; + + const int64_t block_group_id = blockIdx.x * groups_per_block; + const int64_t global_group_id = block_group_id + local_group_id; + const int64_t block_group_offset = global_group_id * group_size; + + float local_absmax = eps; + + using scale_element_t = float; + static_assert(sizeof(scale_packed_t) % sizeof(scale_element_t) == 0); + + const T* group_input = input + block_group_offset; + DST_DTYPE* group_output = + static_cast(output_q) + block_group_offset; + scale_element_t* scale_output; + + if constexpr (IS_COLUMN_MAJOR) { + const int num_elems_per_pack = + static_cast(sizeof(scale_packed_t) / sizeof(scale_element_t)); + const int scale_num_rows_element = scale_num_rows * num_elems_per_pack; + const int row_idx = global_group_id / scale_num_rows_element; + const int col_idx_raw = global_group_id % scale_num_rows_element; + const int col_idx = col_idx_raw / num_elems_per_pack; + const int pack_idx = col_idx_raw % num_elems_per_pack; + scale_output = reinterpret_cast(output_s) + + (col_idx * scale_stride * num_elems_per_pack + + row_idx * num_elems_per_pack + pack_idx); + } else { + scale_output = output_s + global_group_id; + } + + // shared memory to cache each group's data to avoid double DRAM reads. + extern __shared__ __align__(16) char smem_raw[]; + T* smem = reinterpret_cast(smem_raw); + T* smem_group = smem + local_group_id * group_size; + + constexpr int vec_size = 16 / sizeof(T); + using vec_t = vllm::vec_n_t; + + // copy global -> shared & compute absmax + auto scalar_op_cache = [&] __device__(T & dst, const T& src) { + float abs_v = fabsf(static_cast(src)); + local_absmax = fmaxf(local_absmax, abs_v); + dst = src; + }; + + vllm::vectorize_with_alignment( + group_input, // in + smem_group, // out (shared) + group_size, // elements per group + lane_id, // thread id + threads_per_group, // stride in group + scalar_op_cache); // scalar handler + + local_absmax = GroupReduceMax(local_absmax, lane_id); + + float y_s = local_absmax / max_8bit; + if constexpr (SCALE_UE8M0) { + y_s = exp2f(ceilf(log2f(fmaxf(fabsf(y_s), 1e-10f)))); + } + + scale_element_t y_s_quant = y_s; + + if (lane_id == 0) { + *scale_output = y_s_quant; + } + + __syncthreads(); + + // quantize shared -> global 8-bit + auto scalar_op_quant = [&] __device__(DST_DTYPE & dst, const T& src) { + float q = fminf(fmaxf(static_cast(src) / y_s, min_8bit), max_8bit); + dst = DST_DTYPE(q); + }; + + vllm::vectorize_with_alignment( + smem_group, // in (shared) + group_output, // out (global quant tensor) + group_size, // elements + lane_id, // tid + threads_per_group, // stride + scalar_op_quant); // scalar handler +} + +void per_token_group_quant_8bit(const torch::Tensor& input, + torch::Tensor& output_q, + torch::Tensor& output_s, int64_t group_size, + double eps, double min_8bit, double max_8bit, + bool scale_ue8m0 = false) { + TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(output_q.is_contiguous()); + + const int num_groups = input.numel() / group_size; + + TORCH_CHECK(input.numel() % group_size == 0); + TORCH_CHECK(output_s.dim() == 2); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + constexpr int THREADS_PER_GROUP = 16; + + int groups_per_block = 1; + + if (num_groups % 16 == 
0) { + groups_per_block = 16; + } else if (num_groups % 8 == 0) { + groups_per_block = 8; + } else if (num_groups % 4 == 0) { + groups_per_block = 4; + } else if (num_groups % 2 == 0) { + groups_per_block = 2; + } + + auto dst_type = output_q.scalar_type(); + const int num_blocks = num_groups / groups_per_block; + const int num_threads = groups_per_block * THREADS_PER_GROUP; + + const bool is_column_major = output_s.stride(0) < output_s.stride(1); + const int scale_num_rows = output_s.size(1); + const int scale_stride = output_s.stride(1); + +#define LAUNCH_KERNEL(T, DST_DTYPE) \ + do { \ + dim3 grid(num_blocks); \ + dim3 block(num_threads); \ + size_t smem_bytes = \ + static_cast(groups_per_block) * group_size * sizeof(T); \ + if (is_column_major) { \ + if (scale_ue8m0) { \ + per_token_group_quant_8bit_kernel \ + <<>>( \ + static_cast(input.data_ptr()), output_q.data_ptr(), \ + static_cast(output_s.data_ptr()), group_size, \ + num_groups, groups_per_block, (float)eps, (float)min_8bit, \ + (float)max_8bit, scale_num_rows, scale_stride); \ + } else { \ + per_token_group_quant_8bit_kernel \ + <<>>( \ + static_cast(input.data_ptr()), output_q.data_ptr(), \ + static_cast(output_s.data_ptr()), group_size, \ + num_groups, groups_per_block, (float)eps, (float)min_8bit, \ + (float)max_8bit, scale_num_rows, scale_stride); \ + } \ + } else { \ + if (scale_ue8m0) { \ + per_token_group_quant_8bit_kernel \ + <<>>( \ + static_cast(input.data_ptr()), output_q.data_ptr(), \ + static_cast(output_s.data_ptr()), group_size, \ + num_groups, groups_per_block, (float)eps, (float)min_8bit, \ + (float)max_8bit); \ + } else { \ + per_token_group_quant_8bit_kernel \ + <<>>( \ + static_cast(input.data_ptr()), output_q.data_ptr(), \ + static_cast(output_s.data_ptr()), group_size, \ + num_groups, groups_per_block, (float)eps, (float)min_8bit, \ + (float)max_8bit); \ + } \ + } \ + } while (0) + + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "per_token_group_quant_8bit", ([&] { + if (dst_type == at::ScalarType::Float8_e4m3fn) { + LAUNCH_KERNEL(scalar_t, c10::Float8_e4m3fn); + } + })); + +#undef LAUNCH_KERNEL +} + +void per_token_group_quant_fp8(const torch::Tensor& input, + torch::Tensor& output_q, torch::Tensor& output_s, + int64_t group_size, double eps, double fp8_min, + double fp8_max, bool scale_ue8m0) { + per_token_group_quant_8bit(input, output_q, output_s, group_size, eps, + fp8_min, fp8_max, scale_ue8m0); +} diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 79e2575974b5..d310211afe43 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -601,6 +601,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.impl("dynamic_scaled_int8_quant", torch::kCUDA, &dynamic_scaled_int8_quant); + // Compute per-token-group FP8 quantized tensor and scaling factor. + ops.def( + "per_token_group_fp8_quant(Tensor input, Tensor! output_q, Tensor! " + "output_s, " + "int group_size, float eps, float fp8_min, float fp8_max, bool " + "scale_ue8m0) -> ()"); + ops.impl("per_token_group_fp8_quant", torch::kCUDA, + &per_token_group_quant_fp8); + // Mamba selective scan kernel ops.def( "selective_scan_fwd(Tensor! u, Tensor! 
delta," diff --git a/tests/kernels/quantization/test_per_token_group_quant.py b/tests/kernels/quantization/test_per_token_group_quant.py new file mode 100644 index 000000000000..f826983fe94e --- /dev/null +++ b/tests/kernels/quantization/test_per_token_group_quant.py @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from unittest.mock import patch + +import pytest +import torch + +from vllm.model_executor.layers.quantization.utils import fp8_utils + + +@pytest.mark.parametrize("shape", [(32, 128), (64, 256), (16, 512)]) +@pytest.mark.parametrize("column_major", [False, True]) +@pytest.mark.parametrize("scale_ue8m0", [False, True]) +@pytest.mark.parametrize("group_size", [64, 128]) +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") +def test_per_token_group_quant_fp8(shape, column_major: bool, + scale_ue8m0: bool, group_size: int): + device = "cuda" + + torch.manual_seed(42) + num_tokens, hidden_dim = shape + + x = (torch.randn( + (num_tokens, hidden_dim), device=device, dtype=torch.bfloat16) * 8) + + # cuda path + out_q, scale = fp8_utils.per_token_group_quant_fp8( + x, + group_size, + column_major_scales=column_major, + use_ue8m0=scale_ue8m0, + ) + + # triton ref + with patch("vllm.platforms.current_platform.is_cuda", return_value=False): + ref_q, ref_s = fp8_utils.per_token_group_quant_fp8( + x, + group_size, + column_major_scales=column_major, + use_ue8m0=scale_ue8m0, + ) + + assert torch.allclose(out_q.float(), ref_q.float(), atol=0.15, rtol=0.15) + assert torch.allclose(scale, ref_s, atol=0.01, rtol=0.01) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 20e7b444856e..ee5f2b51564d 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -366,6 +366,7 @@ def per_token_group_quant_fp8( dtype: Optional[torch.dtype] = None, column_major_scales: bool = False, out_q: Optional[torch.Tensor] = None, + use_ue8m0: bool = is_blackwell_deep_gemm_used(), ) -> tuple[torch.Tensor, torch.Tensor]: """Function to perform per-token-group quantization on an input tensor `x`. It converts the tensor values into signed float8 values and returns the @@ -397,8 +398,7 @@ def per_token_group_quant_fp8( if x_q is None: x_q = torch.empty_like(x, device=x.device, dtype=dtype) - M = x.numel() // group_size - N = group_size + # Allocate the scale tensor in either row- or column-major format. 
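+    # (the CUDA op added below infers the column-major case from the scale
+    #  tensor's strides, so both layouts dispatch to the same kernel)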
if column_major_scales: shape = (x.shape[-1] // group_size, ) + x.shape[:-1] x_s = torch.empty(shape, device=x.device, @@ -407,6 +407,15 @@ def per_token_group_quant_fp8( shape = x.shape[:-1] + (x.shape[-1] // group_size, ) x_s = torch.empty(shape, device=x.device, dtype=torch.float32) + # prefer CUDA kernel if available + if current_platform.is_cuda() and x.is_contiguous(): + torch.ops._C.per_token_group_fp8_quant(x, x_q, x_s, group_size, eps, + fp8_min, fp8_max, use_ue8m0) + return x_q, x_s + + # TRITON FALLBACK + M = x.numel() // group_size + N = group_size BLOCK = triton.next_power_of_2(N) # heuristics for number of warps num_warps = min(max(BLOCK // 256, 1), 8) @@ -423,7 +432,7 @@ def per_token_group_quant_fp8( eps, fp8_min=fp8_min, fp8_max=fp8_max, - use_ue8m0=is_blackwell_deep_gemm_used(), + use_ue8m0=use_ue8m0, BLOCK=BLOCK, num_warps=num_warps, num_stages=num_stages, @@ -439,7 +448,7 @@ def per_token_group_quant_fp8( eps, fp8_min=fp8_min, fp8_max=fp8_max, - use_ue8m0=is_blackwell_deep_gemm_used(), + use_ue8m0=use_ue8m0, BLOCK=BLOCK, num_warps=num_warps, num_stages=num_stages, From db98d04ae3a18678628ad94996e6a7fc9320acc4 Mon Sep 17 00:00:00 2001 From: Benjamin Bartels Date: Tue, 22 Jul 2025 16:15:53 +0100 Subject: [PATCH 29/63] Adds parallel model weight loading for runai_streamer (#21330) Signed-off-by: bbartels Co-authored-by: Cyrus Leung Signed-off-by: qizixi --- setup.py | 3 ++- .../model_loader/weight_utils.py | 22 ++++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index 9a5ca3456a0f..d46e678e7aa4 100644 --- a/setup.py +++ b/setup.py @@ -659,7 +659,8 @@ def _read_requirements(filename: str) -> list[str]: "bench": ["pandas", "datasets"], "tensorizer": ["tensorizer==2.10.1"], "fastsafetensors": ["fastsafetensors >= 0.1.10"], - "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"], + "runai": + ["runai-model-streamer >= 0.13.3", "runai-model-streamer-s3", "boto3"], "audio": ["librosa", "soundfile", "mistral_common[audio]"], # Required for audio processing "video": [] # Kept for backwards compatibility diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 64a2089921ee..074126fa669e 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -482,14 +482,20 @@ def runai_safetensors_weights_iterator( ) -> Generator[tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model safetensor files.""" with SafetensorsStreamer() as streamer: - for st_file in tqdm( - hf_weights_files, - desc="Loading safetensors using Runai Model Streamer", - disable=not enable_tqdm(use_tqdm_on_load), - bar_format=_BAR_FORMAT, - ): - streamer.stream_file(st_file) - yield from streamer.get_tensors() + streamer.stream_files(hf_weights_files) + total_tensors = sum( + len(tensors_meta) + for tensors_meta in streamer.files_to_tensors_metadata.values()) + + tensor_iter = tqdm( + streamer.get_tensors(), + total=total_tensors, + desc="Loading safetensors using Runai Model Streamer", + bar_format=_BAR_FORMAT, + disable=not enable_tqdm(use_tqdm_on_load), + ) + + yield from tensor_iter def fastsafetensors_weights_iterator( From 6666593b7688854b8a50e8d22ffd197eb260bc39 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 22 Jul 2025 17:18:46 +0200 Subject: [PATCH 30/63] [feat] Enable mm caching for transformers backend (#21358) Signed-off-by: raushan Signed-off-by: qizixi --- 
docs/models/supported_models.md | 2 +- tests/models/multimodal/generation/test_common.py | 8 -------- vllm/model_executor/models/transformers.py | 9 +++------ vllm/v1/core/kv_cache_utils.py | 6 +++--- 4 files changed, 7 insertions(+), 18 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 13ebb03e787e..bbb52f035c72 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -18,7 +18,7 @@ These models are what we list in [supported-text-models][supported-text-models] ### Transformers -vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models and common vision language models are supported! Vision-language models currently accept only image inputs, and require setting `--disable_mm_preprocessor_cache` when running. Support for video inputs and caching of multi-modal preprocessors will be added in future releases. +vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models and common vision language models are supported! Vision-language models currently accept only image inputs. Support for video inputs will be added in future releases. To check if the modeling backend is Transformers, you can simply do this: diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 9859ac5a89dd..e2e35e9b2721 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -186,8 +186,6 @@ image_size_factors=[(0.25, 0.5, 1.0)], vllm_runner_kwargs={ "model_impl": "transformers", - "disable_mm_preprocessor_cache": True, - "enable_prefix_caching": False, }, marks=[pytest.mark.core_model], ), @@ -205,8 +203,6 @@ # image_size_factors=[(0.25, 0.5, 1.0)], # vllm_runner_kwargs={ # "model_impl": "transformers", - # "disable_mm_preprocessor_cache": True, - # "enable_prefix_caching": False, # }, # marks=[pytest.mark.core_model], # ), @@ -223,8 +219,6 @@ image_size_factors=[(0.25, 0.2, 0.15)], vllm_runner_kwargs={ "model_impl": "transformers", - "disable_mm_preprocessor_cache": True, - "enable_prefix_caching": False, }, marks=[large_gpu_mark(min_gb=32)], ), @@ -239,8 +233,6 @@ image_size_factors=[(0.25, 0.5, 1.0)], vllm_runner_kwargs={ "model_impl": "auto", - "disable_mm_preprocessor_cache": True, - "enable_prefix_caching": False, }, auto_cls=AutoModelForImageTextToText, marks=[pytest.mark.core_model], diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 47cff29caab0..eea03afcd8a7 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -315,11 +315,6 @@ def apply( Apply HF Processor on prompt text and multi-modal data together, outputting token IDs and processed tensors. """ - if return_mm_hashes: - raise ValueError( - "TransformersForMultimodalLM doesn't support mm hashing yet! 
" - "Probably you didn't set `disable_mm_preprocessor_cache=True`") - if tokenization_kwargs is None: tokenization_kwargs = {} @@ -375,12 +370,14 @@ def apply( num_image_patches), ) + mm_hashes = self._hash_mm_items(mm_items, hf_processor_mm_kwargs, + tokenization_kwargs) return MultiModalInputs( type="multimodal", prompt=prompt, prompt_token_ids=prompt_ids, mm_kwargs=mm_kwargs, - mm_hashes=None, + mm_hashes=mm_hashes, mm_placeholders=mm_placeholders, ) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 198d79cfb420..5b0218640a8c 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -406,9 +406,9 @@ def need_extra_keys(request: Request) -> bool: # Multimodal requests need to include the MM hash. # LoRA requests need to include the LoRA ID. # Request with provided cache salt need to include the salt. - return bool(request.mm_positions) or (request.lora_request - is not None) or (request.cache_salt - is not None) + return bool(request.mm_hashes) or (request.lora_request + is not None) or (request.cache_salt + is not None) def _gen_mm_extra_hash_keys(request: Request, start_token_idx: int, From 3eb125c879589d872928c6d4e735352638dfd2d8 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 22 Jul 2025 11:22:10 -0400 Subject: [PATCH 31/63] Revert "[Refactor] Fix Compile Warning #1444-D (#21208)" (#21384) Signed-off-by: yewentao256 Signed-off-by: qizixi --- csrc/moe/topk_softmax_kernels.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index ea4ff67ef3e4..064b76c9cd42 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -20,7 +20,6 @@ #include #include #include "../cuda_compat.h" -#include #ifndef USE_ROCM #include @@ -63,7 +62,7 @@ __launch_bounds__(TPB) __global__ const int thread_row_offset = blockIdx.x * num_cols; - cuda::std::plus sum; + cub::Sum sum; float threadData(-FLT_MAX); // Don't touch finished rows. 
From 672837705813e6f2d7ba80633f693357bf8a0032 Mon Sep 17 00:00:00 2001 From: Wang Yijun Date: Tue, 22 Jul 2025 23:24:00 +0800 Subject: [PATCH 32/63] Add tokenization_kwargs to encode for embedding model truncation (#21033) Signed-off-by: qizixi --- vllm/engine/async_llm_engine.py | 6 ++++++ vllm/entrypoints/llm.py | 15 ++++++++++++--- vllm/v1/engine/async_llm.py | 2 ++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 3d7d28055dd0..06ae2a2f18f2 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -438,6 +438,7 @@ async def add_request_async( prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, data_parallel_rank: Optional[int] = None, + tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> None: """ Async version of @@ -468,6 +469,7 @@ async def add_request_async( prompt, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, + tokenization_kwargs=tokenization_kwargs, ) if isinstance(params, SamplingParams) and \ @@ -862,6 +864,7 @@ async def add_request( prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, data_parallel_rank: Optional[int] = None, + tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: if not self.is_running: if self.start_engine_loop: @@ -889,6 +892,7 @@ async def add_request( prompt_adapter_request=prompt_adapter_request, priority=priority, data_parallel_rank=data_parallel_rank, + tokenization_kwargs=tokenization_kwargs, ) return stream.generator() @@ -996,6 +1000,7 @@ async def encode( lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, + tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> AsyncGenerator[PoolingRequestOutput, None]: """Generate outputs for a request from a pooling model. @@ -1070,6 +1075,7 @@ async def encode( lora_request=lora_request, trace_headers=trace_headers, priority=priority, + tokenization_kwargs=tokenization_kwargs, ): yield LLMEngine.validate_output(output, PoolingRequestOutput) except asyncio.CancelledError: diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 78f9d32d811d..c4f1b3b86619 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -965,6 +965,7 @@ def encode( lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, pooling_task: PoolingTask = "encode", + tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> list[PoolingRequestOutput]: ... @@ -981,6 +982,7 @@ def encode( lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, pooling_task: PoolingTask = "encode", + tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> list[PoolingRequestOutput]: ... @@ -997,6 +999,7 @@ def encode( lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, pooling_task: PoolingTask = "encode", + tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> list[PoolingRequestOutput]: ... 
@@ -1014,6 +1017,7 @@ def encode( lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, pooling_task: PoolingTask = "encode", + tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> list[PoolingRequestOutput]: ... @@ -1031,6 +1035,7 @@ def encode( lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, pooling_task: PoolingTask = "encode", + tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> list[PoolingRequestOutput]: ... @@ -1046,6 +1051,7 @@ def encode( lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, pooling_task: PoolingTask = "encode", + tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> list[PoolingRequestOutput]: ... @@ -1066,6 +1072,7 @@ def encode( lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, pooling_task: PoolingTask = "encode", + tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> list[PoolingRequestOutput]: """Apply pooling to the hidden states corresponding to the input prompts. @@ -1131,9 +1138,11 @@ def encode( for pooling_param in pooling_params: pooling_param.verify(pooling_task, model_config) - tokenization_kwargs = dict[str, Any]() - _validate_truncation_size(model_config.max_model_len, - truncate_prompt_tokens, tokenization_kwargs) + if tokenization_kwargs is None: + tokenization_kwargs = dict[str, Any]() + _validate_truncation_size(model_config.max_model_len, + truncate_prompt_tokens, + tokenization_kwargs) self._validate_and_add_requests( prompts=parsed_prompts, diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index b8ba36f3502f..79b5d5ae4a23 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -437,6 +437,7 @@ async def encode( lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, + tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> AsyncGenerator[PoolingRequestOutput, None]: """ Main function called by the API server to kick off a request @@ -465,6 +466,7 @@ async def encode( lora_request=lora_request, trace_headers=trace_headers, priority=priority, + tokenization_kwargs=tokenization_kwargs, ) # The output_handler task pushes items into the queue. 
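The `tokenization_kwargs` parameter added in this patch is threaded from `LLM.encode()` (and the async engines) down to input preprocessing, so a caller can control tokenization per request instead of relying on an engine-wide truncation setting. A minimal usage sketch follows; the model name, the `task`/`pooling_task` choice, and the specific kwarg values are illustrative assumptions (standard Hugging Face tokenizer arguments), not part of the patch:

    from vllm import LLM

    # Hypothetical embedding model; any pooling/embedding model would do.
    llm = LLM(model="BAAI/bge-m3", task="embed")

    long_doc = "a very long document " * 1000

    # The new keyword flows through encode() -> add_request() -> preprocessing,
    # so the prompt is truncated at tokenization time rather than rejected later.
    outputs = llm.encode(
        [long_doc],
        pooling_task="embed",
        tokenization_kwargs={"truncation": True, "max_length": 128},
    )

    for out in outputs:
        # Pooled output tensor for the truncated prompt.
        print(out.outputs.data.shape)
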
From 5aafc16f1017d79d7ddafbbe8256dfad0d67c4d2 Mon Sep 17 00:00:00 2001 From: Aritra Roy Gosthipaty Date: Tue, 22 Jul 2025 20:57:28 +0530 Subject: [PATCH 33/63] [Bugfix] Decode Tokenized IDs to Strings for `hf_processor` in `llm.chat()` with `model_impl=transformers` (#21353) Signed-off-by: ariG23498 Signed-off-by: qizixi --- .../processing/test_transformers.py | 40 +++++++++++++++++++ vllm/model_executor/models/transformers.py | 5 +++ 2 files changed, 45 insertions(+) create mode 100644 tests/models/multimodal/processing/test_transformers.py diff --git a/tests/models/multimodal/processing/test_transformers.py b/tests/models/multimodal/processing/test_transformers.py new file mode 100644 index 000000000000..c7d1b5271ff7 --- /dev/null +++ b/tests/models/multimodal/processing/test_transformers.py @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest + +from vllm.assets.image import ImageAsset +from vllm.config import ModelConfig +from vllm.multimodal import MULTIMODAL_REGISTRY + + +# yapf: disable +@pytest.mark.parametrize("model_id", + ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]) +def test_multimodal_processor(model_id): + model_config = ModelConfig( + model=model_id, + model_impl="transformers", + ) + + mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config, ) + + image_pil = ImageAsset('cherry_blossom').pil_image + mm_data = {"image": image_pil} + str_prompt = "<|im_start|>user \nWhat is the content of this image?<|im_end|><|im_start|>assistant\n" # noqa: E501 + str_processed_inputs = mm_processor.apply( + prompt=str_prompt, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + + ids_prompt = [ + 151644, 872, 220, 151646, 198, 3838, 374, 279, 2213, 315, 419, 2168, + 30, 151645, 151644, 77091, 198 + ] + ids_processed_inputs = mm_processor.apply( + prompt=ids_prompt, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + + assert str_processed_inputs["prompt"] == ids_processed_inputs["prompt"] diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index eea03afcd8a7..cb9d28b10672 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -320,6 +320,11 @@ def apply( mm_items = self._to_mm_items(mm_data) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + if not isinstance(prompt, str): + # the prompt is the tokenized ids which is not supported + # by the hf_processor, which is why we would need to decode the ids + # into string + prompt = hf_processor.decode(prompt) (prompt_ids, processed_data, mm_token_type_ids) = self._apply_hf_processor_text_mm( From 8fcfe3698e944008e2d67f82da6e8e55d50094e8 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 22 Jul 2025 23:39:35 +0800 Subject: [PATCH 34/63] [CI/Build] Fix test failure due to updated model repo (#21375) Signed-off-by: DarkLight1337 Signed-off-by: qizixi --- tests/models/registry.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 8e3285aebbe7..776b4c033564 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -167,9 +167,9 @@ def check_available_online( "DeepseekV3ForCausalLM": _HfExamplesInfo("deepseek-ai/DeepSeek-V3", # noqa: E501 trust_remote_code=True), "Ernie4_5_ForCausalLM": _HfExamplesInfo("baidu/ERNIE-4.5-0.3B-PT", - trust_remote_code=True), + min_transformers_version="4.54"), "Ernie4_5_MoeForCausalLM": 
_HfExamplesInfo("baidu/ERNIE-4.5-21B-A3B-PT", - trust_remote_code=True), + min_transformers_version="4.54"), "ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), # noqa: E501 "Exaone4ForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-4.0-32B"), # noqa: E501 "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501 From faf8b1a898b9992d1816c0545d4021356cfb55f5 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 22 Jul 2025 15:42:31 -0400 Subject: [PATCH 35/63] Fix Flashinfer Allreduce+Norm enable disable calculation based on `fi_allreduce_fusion_max_token_num` (#21325) Signed-off-by: XIn Li Signed-off-by: qizixi --- vllm/compilation/collective_fusion.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py index a8b00aaf0842..0e7961841bd3 100644 --- a/vllm/compilation/collective_fusion.py +++ b/vllm/compilation/collective_fusion.py @@ -159,6 +159,9 @@ def __call__(self, graph: fx.Graph): 6: MiB // 2, # 512KB 8: MiB // 2, # 512KB } + # opt for a more conservative default value + # when world size is not in _FI_MAX_SIZES + _DEFAULT_FI_MAX_SIZE = MiB // 2 def call_trtllm_fused_allreduce_norm( allreduce_in: torch.Tensor, @@ -173,12 +176,16 @@ def call_trtllm_fused_allreduce_norm( max_token_num: int, norm_out: Optional[torch.Tensor] = None, ) -> None: - use_flashinfer = allreduce_in.shape[0] * allreduce_in.shape[ - 1] * allreduce_in.element_size() <= min( - _FI_MAX_SIZES[world_size], - max_token_num * allreduce_in.shape[0] * - allreduce_in.element_size(), - ) + + num_tokens, hidden_size = allreduce_in.shape + element_size = allreduce_in.element_size() + current_tensor_size = num_tokens * hidden_size * element_size + max_fusion_size = max_token_num * hidden_size * element_size + use_flashinfer = current_tensor_size <= min( + _FI_MAX_SIZES.get(world_size, _DEFAULT_FI_MAX_SIZE), + max_fusion_size, + ) + if use_flashinfer: assert (_FI_WORKSPACE_TENSOR is not None ), "Flashinfer must be enabled when using flashinfer" From b3dead94cbfaab30a6e15fdf31e41b053994e031 Mon Sep 17 00:00:00 2001 From: Yiheng Xu Date: Tue, 22 Jul 2025 15:05:57 -0700 Subject: [PATCH 36/63] [Model] Add Qwen3CoderToolParser (#21396) Signed-off-by: simon-mo Co-authored-by: simon-mo Signed-off-by: qizixi --- tests/tool_use/test_qwen3coder_tool_parser.py | 618 ++++++++++++++++ .../openai/tool_parsers/__init__.py | 2 + .../tool_parsers/qwen3coder_tool_parser.py | 669 ++++++++++++++++++ 3 files changed, 1289 insertions(+) create mode 100644 tests/tool_use/test_qwen3coder_tool_parser.py create mode 100644 vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py diff --git a/tests/tool_use/test_qwen3coder_tool_parser.py b/tests/tool_use/test_qwen3coder_tool_parser.py new file mode 100644 index 000000000000..40c3158e9e68 --- /dev/null +++ b/tests/tool_use/test_qwen3coder_tool_parser.py @@ -0,0 +1,618 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json +from collections.abc import Generator +from typing import Optional + +import pytest + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + ChatCompletionToolsParam, + DeltaMessage, FunctionCall, + ToolCall) +from vllm.entrypoints.openai.tool_parsers.qwen3coder_tool_parser import ( + Qwen3CoderToolParser) +from vllm.transformers_utils.detokenizer import detokenize_incrementally +from vllm.transformers_utils.tokenizer import 
AnyTokenizer, get_tokenizer + +MODEL = "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8" + + +@pytest.fixture(scope="module") +def qwen3_tokenizer(): + return get_tokenizer(tokenizer_name=MODEL) + + +@pytest.fixture +def qwen3_tool_parser(qwen3_tokenizer): + return Qwen3CoderToolParser(qwen3_tokenizer) + + +@pytest.fixture +def sample_tools(): + return [ + ChatCompletionToolsParam(type="function", + function={ + "name": "get_current_weather", + "description": "Get the current weather", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "The city name" + }, + "state": { + "type": "string", + "description": + "The state code" + }, + "unit": { + "type": "string", + "enum": + ["fahrenheit", "celsius"] + } + }, + "required": ["city", "state"] + } + }), + ChatCompletionToolsParam(type="function", + function={ + "name": "calculate_area", + "description": + "Calculate area of a shape", + "parameters": { + "type": "object", + "properties": { + "shape": { + "type": "string" + }, + "dimensions": { + "type": "object" + }, + "precision": { + "type": "integer" + } + } + } + }) + ] + + +def assert_tool_calls(actual_tool_calls: list[ToolCall], + expected_tool_calls: list[ToolCall]): + assert len(actual_tool_calls) == len(expected_tool_calls) + + for actual_tool_call, expected_tool_call in zip(actual_tool_calls, + expected_tool_calls): + # Qwen3 parser doesn't generate IDs during extraction + assert actual_tool_call.type == "function" + assert ( + actual_tool_call.function.name == expected_tool_call.function.name) + assert (json.loads(actual_tool_call.function.arguments) == json.loads( + expected_tool_call.function.arguments)) + + +def stream_delta_message_generator( + qwen3_tool_parser: Qwen3CoderToolParser, + qwen3_tokenizer: AnyTokenizer, + model_output: str, + request: Optional[ChatCompletionRequest] = None +) -> Generator[DeltaMessage, None, None]: + all_token_ids = qwen3_tokenizer.encode(model_output, + add_special_tokens=False) + + previous_text = "" + previous_tokens = None + prefix_offset = 0 + read_offset = 0 + for i, delta_token in enumerate(all_token_ids): + delta_token_ids = [delta_token] + previous_token_ids = all_token_ids[:i] + current_token_ids = all_token_ids[:i + 1] + + (new_tokens, delta_text, new_prefix_offset, + new_read_offset) = detokenize_incrementally( + tokenizer=qwen3_tokenizer, + all_input_ids=current_token_ids, + prev_tokens=previous_tokens, + prefix_offset=prefix_offset, + read_offset=read_offset, + skip_special_tokens=False, + spaces_between_special_tokens=True, + ) + + current_text = previous_text + delta_text + + delta_message = qwen3_tool_parser.extract_tool_calls_streaming( + previous_text, + current_text, + delta_text, + previous_token_ids, + current_token_ids, + delta_token_ids, + request=request, + ) + if delta_message: + yield delta_message + + previous_text = current_text + previous_tokens = (previous_tokens + + new_tokens if previous_tokens else new_tokens) + prefix_offset = new_prefix_offset + read_offset = new_read_offset + + +def test_extract_tool_calls_no_tools(qwen3_tool_parser): + model_output = "This is a test response without any tool calls" + extracted_tool_calls = qwen3_tool_parser.extract_tool_calls( + model_output, request=None) # type: ignore[arg-type] + assert not extracted_tool_calls.tools_called + assert extracted_tool_calls.tool_calls == [] + assert extracted_tool_calls.content == model_output + + +@pytest.mark.parametrize( + ids=[ + "single_tool", + "single_tool_with_content", + 
"single_tool_multiline_param", + "parallel_tools", + "tool_with_typed_params", + ], + argnames=["model_output", "expected_tool_calls", "expected_content"], + argvalues=[ + (''' + + +Dallas + + +TX + + +fahrenheit + + +''', [ + ToolCall( + function=FunctionCall(name="get_current_weather", + arguments=json.dumps({ + "city": "Dallas", + "state": "TX", + "unit": "fahrenheit" + }))) + ], None), + ('''Sure! Let me check the weather for you. + + +Dallas + + +TX + + +fahrenheit + + +''', [ + ToolCall( + function=FunctionCall(name="get_current_weather", + arguments=json.dumps({ + "city": "Dallas", + "state": "TX", + "unit": "fahrenheit" + }))) + ], "Sure! Let me check the weather for you."), + (''' + + +rectangle + + +{"width": 10, + "height": 20} + + +2 + + +''', [ + ToolCall(function=FunctionCall(name="calculate_area", + arguments=json.dumps({ + "shape": "rectangle", + "dimensions": { + "width": 10, + "height": 20 + }, + "precision": 2 + }))) + ], None), + (''' + + +Dallas + + +TX + + +fahrenheit + + + + + + +Orlando + + +FL + + +fahrenheit + + +''', [ + ToolCall( + function=FunctionCall(name="get_current_weather", + arguments=json.dumps({ + "city": "Dallas", + "state": "TX", + "unit": "fahrenheit" + }))), + ToolCall( + function=FunctionCall(name="get_current_weather", + arguments=json.dumps({ + "city": "Orlando", + "state": "FL", + "unit": "fahrenheit" + }))) + ], None), + ('''Let me calculate that area for you. + + +circle + + +{"radius": 15.5} + + +3 + + +''', [ + ToolCall(function=FunctionCall(name="calculate_area", + arguments=json.dumps({ + "shape": "circle", + "dimensions": { + "radius": 15.5 + }, + "precision": 3 + }))) + ], "Let me calculate that area for you."), + ], +) +def test_extract_tool_calls(qwen3_tool_parser, sample_tools, model_output, + expected_tool_calls, expected_content): + request = ChatCompletionRequest(model=MODEL, + messages=[], + tools=sample_tools) + extracted_tool_calls = qwen3_tool_parser.extract_tool_calls( + model_output, request=request) + assert extracted_tool_calls.tools_called + + assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls) + + assert extracted_tool_calls.content == expected_content + + +def test_extract_tool_calls_fallback_no_tags(qwen3_tool_parser, sample_tools): + """Test fallback parsing when XML tags are missing""" + model_output = ''' + +Dallas + + +TX + +''' + + request = ChatCompletionRequest(model=MODEL, + messages=[], + tools=sample_tools) + extracted_tool_calls = qwen3_tool_parser.extract_tool_calls( + model_output, request=request) + + assert extracted_tool_calls.tools_called + assert len(extracted_tool_calls.tool_calls) == 1 + assert (extracted_tool_calls.tool_calls[0].function.name == + "get_current_weather") + + +def test_extract_tool_calls_type_conversion(qwen3_tool_parser): + """Test parameter type conversion based on tool schema""" + tools = [ + ChatCompletionToolsParam(type="function", + function={ + "name": "test_types", + "parameters": { + "type": "object", + "properties": { + "int_param": { + "type": "integer" + }, + "float_param": { + "type": "float" + }, + "bool_param": { + "type": "boolean" + }, + "str_param": { + "type": "string" + }, + "obj_param": { + "type": "object" + } + } + } + }) + ] + + model_output = ''' + + +42 + + +3.14 + + +true + + +hello world + + +{"key": "value"} + + +''' + + request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) + extracted_tool_calls = qwen3_tool_parser.extract_tool_calls( + model_output, request=request) + + args = 
json.loads(extracted_tool_calls.tool_calls[0].function.arguments) + assert args["int_param"] == 42 + assert args["float_param"] == 3.14 + assert args["bool_param"] is True + assert args["str_param"] == "hello world" + assert args["obj_param"] == {"key": "value"} + + +@pytest.mark.parametrize( + ids=[ + "no_tools", + "single_tool", + "single_tool_with_content", + "parallel_tools", + ], + argnames=["model_output", "expected_tool_calls", "expected_content"], + argvalues=[ + ("This is a test without tools", [], "This is a test without tools"), + (''' + + +Dallas + + +TX + + +fahrenheit + + +''', [ + ToolCall( + function=FunctionCall(name="get_current_weather", + arguments=json.dumps({ + "city": "Dallas", + "state": "TX", + "unit": "fahrenheit" + }))) + ], ""), + ('''Sure! Let me check the weather for you. + + +Dallas + + +TX + + +fahrenheit + + +''', [ + ToolCall( + function=FunctionCall(name="get_current_weather", + arguments=json.dumps({ + "city": "Dallas", + "state": "TX", + "unit": "fahrenheit" + }))) + ], "Sure! Let me check the weather for you."), + (''' + + +Dallas + + +TX + + +fahrenheit + + + + + + +Orlando + + +FL + + +celsius + + +''', [ + ToolCall( + function=FunctionCall(name="get_current_weather", + arguments=json.dumps({ + "city": "Dallas", + "state": "TX", + "unit": "fahrenheit" + }))), + ToolCall( + function=FunctionCall(name="get_current_weather", + arguments=json.dumps({ + "city": "Orlando", + "state": "FL", + "unit": "celsius" + }))) + ], ""), + ], +) +def test_extract_tool_calls_streaming(qwen3_tool_parser, qwen3_tokenizer, + sample_tools, model_output, + expected_tool_calls, expected_content): + """Test incremental streaming behavior""" + request = ChatCompletionRequest(model=MODEL, + messages=[], + tools=sample_tools) + + other_content = '' + tool_states = {} # Track state per tool index + + for delta_message in stream_delta_message_generator( + qwen3_tool_parser, qwen3_tokenizer, model_output, request): + # role should never be streamed from tool parser + assert not delta_message.role + + if delta_message.content: + other_content += delta_message.content + + if delta_message.tool_calls: + for tool_call in delta_message.tool_calls: + idx = tool_call.index + + # Initialize state for new tool + if idx not in tool_states: + tool_states[idx] = { + "id": None, + "name": None, + "arguments": "", + "type": None + } + + # First chunk should have id, name, and type + if tool_call.id: + tool_states[idx]["id"] = tool_call.id + + if tool_call.type: + assert tool_call.type == "function" + tool_states[idx]["type"] = tool_call.type + + if tool_call.function: + if tool_call.function.name: + # Should only be set once + assert tool_states[idx]["name"] is None + tool_states[idx]["name"] = tool_call.function.name + + if tool_call.function.arguments is not None: + # Accumulate arguments incrementally + tool_states[idx][ + "arguments"] += tool_call.function.arguments + + # Verify final content + assert other_content == expected_content + + # Verify we got all expected tool calls + assert len(tool_states) == len(expected_tool_calls) + + # Verify each tool call + for idx, expected_tool in enumerate(expected_tool_calls): + state = tool_states[idx] + assert state["id"] is not None + assert state["type"] == "function" + assert state["name"] == expected_tool.function.name + + # Parse accumulated arguments + arguments_str = state["arguments"] + assert arguments_str is not None + actual_args = json.loads(arguments_str) + expected_args = json.loads(expected_tool.function.arguments) + assert 
actual_args == expected_args + + +def test_extract_tool_calls_streaming_incremental(qwen3_tool_parser, + qwen3_tokenizer, + sample_tools): + """Test that streaming is truly incremental""" + model_output = '''I'll check the weather. + + +Dallas + + +TX + + +''' + + request = ChatCompletionRequest(model=MODEL, + messages=[], + tools=sample_tools) + + chunks = [] + for delta_message in stream_delta_message_generator( + qwen3_tool_parser, qwen3_tokenizer, model_output, request): + chunks.append(delta_message) + + # Should have multiple chunks + assert len(chunks) > 3 + + # First chunk(s) should be content + assert chunks[0].content is not None + assert chunks[0].tool_calls is None or chunks[0].tool_calls == [] + + # Should have a chunk with tool header (id, name, type) + header_found = False + for chunk in chunks: + if chunk.tool_calls and chunk.tool_calls[0].id: + header_found = True + assert (chunk.tool_calls[0].function.name == "get_current_weather") + assert chunk.tool_calls[0].type == "function" + # Empty initially + assert chunk.tool_calls[0].function.arguments == "" + break + assert header_found + + # Should have chunks with incremental arguments + arg_chunks = [] + for chunk in chunks: + if chunk.tool_calls and chunk.tool_calls[0].function.arguments: + arg_chunks.append(chunk.tool_calls[0].function.arguments) + + # Arguments should be streamed incrementally + assert len(arg_chunks) > 1 + + # Concatenated arguments should form valid JSON + full_args = "".join(arg_chunks) + parsed_args = json.loads(full_args) + assert parsed_args["city"] == "Dallas" + assert parsed_args["state"] == "TX" diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index 9eda7155f01f..88c8aa929b78 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -17,6 +17,7 @@ from .mistral_tool_parser import MistralToolParser from .phi4mini_tool_parser import Phi4MiniJsonToolParser from .pythonic_tool_parser import PythonicToolParser +from .qwen3coder_tool_parser import Qwen3CoderToolParser from .xlam_tool_parser import xLAMToolParser __all__ = [ @@ -38,4 +39,5 @@ "KimiK2ToolParser", "HunyuanA13BToolParser", "Glm4MoeModelToolParser", + "Qwen3CoderToolParser", ] diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py new file mode 100644 index 000000000000..cf4d0b231aee --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py @@ -0,0 +1,669 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json +import uuid +from collections.abc import Sequence +from typing import Any, Optional, Union + +import regex as re + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + ChatCompletionToolsParam, + DeltaFunctionCall, DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser, ToolParserManager) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer + +logger = init_logger(__name__) + + +@ToolParserManager.register_module(["qwen3_coder"]) +class Qwen3CoderToolParser(ToolParser): + + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + + self.current_tool_name_sent: bool = False + self.prev_tool_call_arr: list[dict] = [] + 
self.streamed_args_for_tool: list[str] = [] + + # Sentinel tokens for streaming mode + self.tool_call_start_token: str = "" + self.tool_call_end_token: str = "" + self.tool_call_prefix: str = "(.*?)", re.DOTALL) + self.tool_call_regex = re.compile( + r"(.*?)|(.*?)$", re.DOTALL) + self.tool_call_function_regex = re.compile( + r"|| str: + """Generate a unique tool call ID.""" + return f"call_{uuid.uuid4().hex[:24]}" + + def _reset_streaming_state(self): + """Reset all streaming state.""" + self.current_tool_index = 0 + self.is_tool_call_started = False + self.header_sent = False + self.current_tool_string_id = None + self.current_function_name = None + self.current_param_name = None + self.current_param_value = "" + self.param_count = 0 + self.in_param = False + self.in_function = False + self.accumulated_text = "" + self.json_started = False + self.json_closed = False + + def _parse_xml_function_call( + self, function_call_str: str, + tools: Optional[list[ChatCompletionToolsParam]] + ) -> Optional[ToolCall]: + + def get_arguments_config(func_name: str) -> dict: + if tools is None: + return {} + for config in tools: + if not hasattr(config, "type") or not ( + hasattr(config, "function") + and hasattr(config.function, "name")): + continue + if (config.type == "function" + and config.function.name == func_name): + if not hasattr(config.function, "parameters"): + return {} + params = config.function.parameters + if isinstance(params, dict) and "properties" in params: + return params["properties"] + elif isinstance(params, dict): + return params + else: + return {} + logger.warning("Tool '%s' is not defined in the tools list.", + func_name) + return {} + + def convert_param_value(param_value: str, param_name: str, + param_config: dict, func_name: str) -> Any: + # Handle null value for any type + if param_value.lower() == "null": + return None + + converted_value: Any + + if param_name not in param_config: + if param_config != {}: + logger.warning( + "Parsed parameter '%s' is not defined in the tool " + "parameters for tool '%s', directly returning the " + "string value.", param_name, func_name) + return param_value + + if (isinstance(param_config[param_name], dict) + and "type" in param_config[param_name]): + param_type = str( + param_config[param_name]["type"]).strip().lower() + else: + param_type = "string" + if param_type in [ + "string", "str", "text", "varchar", "char", "enum" + ]: + return param_value + elif (param_type.startswith("int") or param_type.startswith("uint") + or param_type.startswith("long") + or param_type.startswith("short") + or param_type.startswith("unsigned")): + try: + converted_value = int(param_value) + return converted_value + except ValueError: + logger.warning( + "Parsed value '%s' of parameter '%s' is not an " + "integer in tool '%s', degenerating to string.", + param_value, param_name, func_name) + return param_value + elif (param_type.startswith("num") + or param_type.startswith("float")): + try: + float_param_value = float(param_value) + converted_value = (float_param_value if float_param_value - + int(float_param_value) != 0 else + int(float_param_value)) + return converted_value + except ValueError: + logger.warning( + "Parsed value '%s' of parameter '%s' is not a float " + "in tool '%s', degenerating to string.", param_value, + param_name, func_name) + return param_value + elif param_type in ["boolean", "bool", "binary"]: + param_value = param_value.lower() + if param_value not in ["true", "false"]: + logger.warning( + "Parsed value '%s' of parameter '%s' is 
not a " + "boolean (`true` of `false`) in tool '%s', " + "degenerating to false.", param_value, param_name, + func_name) + return param_value == "true" + else: + if param_type == "object" or param_type.startswith("dict"): + try: + converted_value = json.loads(param_value) + return converted_value + except json.JSONDecodeError: + logger.warning( + "Parsed value '%s' of parameter '%s' is not a " + "valid JSON object in tool '%s', will try other " + "methods to parse it.", param_value, param_name, + func_name) + try: + converted_value = eval(param_value) + return converted_value + except Exception: + logger.warning( + "Parsed value '%s' of parameter '%s' cannot be " + "converted via Python `eval()` in tool '%s', " + "degenerating to string.", param_value, param_name, + func_name) + return param_value + + # Extract function name + end_index = function_call_str.index(">") + function_name = function_call_str[:end_index] + param_config = get_arguments_config(function_name) + parameters = function_call_str[end_index + 1:] + param_dict = {} + for match in self.tool_call_parameter_regex.findall(parameters): + match_text = match[0] if match[0] else match[1] + idx = match_text.index(">") + param_name = match_text[:idx] + param_value = str(match_text[idx + 1:]) + # Remove prefix and trailing \n + if param_value.startswith("\n"): + param_value = param_value[1:] + if param_value.endswith("\n"): + param_value = param_value[:-1] + + param_dict[param_name] = convert_param_value( + param_value, param_name, param_config, function_name) + return ToolCall( + type="function", + function=FunctionCall(name=function_name, + arguments=json.dumps(param_dict, + ensure_ascii=False)), + ) + + def _get_function_calls(self, model_output: str) -> list[str]: + # Find all tool calls + matched_ranges = self.tool_call_regex.findall(model_output) + raw_tool_calls = [ + match[0] if match[0] else match[1] for match in matched_ranges + ] + + # Back-off strategy if no tool_call tags found + if len(raw_tool_calls) == 0: + raw_tool_calls = [model_output] + + raw_function_calls = [] + for tool_call in raw_tool_calls: + raw_function_calls.extend( + self.tool_call_function_regex.findall(tool_call)) + + function_calls = [ + match[0] if match[0] else match[1] for match in raw_function_calls + ] + return function_calls + + def extract_tool_calls( + self, + model_output: str, + request: ChatCompletionRequest, + ) -> ExtractedToolCallInformation: + # Quick check to avoid unnecessary processing + if self.tool_call_prefix not in model_output: + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + try: + function_calls = self._get_function_calls(model_output) + if len(function_calls) == 0: + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + tool_calls = [ + self._parse_xml_function_call(function_call_str, request.tools) + for function_call_str in function_calls + ] + + # Populate prev_tool_call_arr for serving layer to set + # finish_reason + self.prev_tool_call_arr.clear() # Clear previous calls + for tool_call in tool_calls: + if tool_call: + self.prev_tool_call_arr.append({ + "name": + tool_call.function.name, + "arguments": + tool_call.function.arguments, + }) + + # Extract content before tool calls + content_index = model_output.find(self.tool_call_start_token) + content_index = (content_index if content_index >= 0 else + model_output.find(self.tool_call_prefix)) + content = model_output[:content_index] # .rstrip() + + return 
ExtractedToolCallInformation( + tools_called=(len(tool_calls) > 0), + tool_calls=tool_calls, + content=content if content else None, + ) + + except Exception: + logger.exception("Error in extracting tool call from response.") + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, + ) -> Union[DeltaMessage, None]: + # If no delta text, return None unless it's an EOS token after tool + # calls + if not delta_text: + # Check if this is an EOS token after all tool calls are complete + # We check for tool calls in the text even if is_tool_call_started + # is False because it might have been reset after processing all + # tools + if (delta_token_ids + and self.tool_call_end_token_id not in delta_token_ids): + # Count complete tool calls + complete_calls = len( + self.tool_call_complete_regex.findall(current_text)) + + # If we have completed tool calls and populated + # prev_tool_call_arr + if (complete_calls > 0 and len(self.prev_tool_call_arr) > 0): + # Check if all tool calls are closed + open_calls = ( + current_text.count(self.tool_call_start_token) - + current_text.count(self.tool_call_end_token)) + if open_calls == 0: + # Return empty delta message to allow finish_reason + # processing + return DeltaMessage(content="") + elif not self.is_tool_call_started and current_text: + # This is a regular content response that's now complete + return DeltaMessage(content="") + return None + + # Check if this is the first call (reset state if needed) + if not previous_text: + self._reset_streaming_state() + + # Update accumulated text + self.accumulated_text = current_text + + # Check if we need to advance to next tool + if self.json_closed and not self.in_function: + # Check if this tool call has ended + tool_ends = current_text.count(self.tool_call_end_token) + if tool_ends > self.current_tool_index: + # This tool has ended, advance to next + self.current_tool_index += 1 + self.header_sent = False + self.param_count = 0 + self.json_started = False + self.json_closed = False + + # Check if there are more tool calls + tool_starts_count = current_text.count( + self.tool_call_start_token) + if self.current_tool_index >= tool_starts_count: + # No more tool calls + self.is_tool_call_started = False + # Continue processing next tool + return None + + # Handle normal content before tool calls + if not self.is_tool_call_started: + # Check if tool call is starting + if (self.tool_call_start_token_id in delta_token_ids + or self.tool_call_start_token in delta_text): + self.is_tool_call_started = True + # Return any content before the tool call + if self.tool_call_start_token in delta_text: + content_before = delta_text[:delta_text.index( + self.tool_call_start_token)] + if content_before: + return DeltaMessage(content=content_before) + return None + else: + # Check if we're between tool calls - skip whitespace + if (current_text.rstrip().endswith(self.tool_call_end_token) + and delta_text.strip() == ""): + # We just ended a tool call, skip whitespace + return None + # Normal content, no tool call + return DeltaMessage(content=delta_text) + + # Check if we're between tool calls (waiting for next one) + # Count tool calls we've seen vs processed + tool_starts_count = current_text.count(self.tool_call_start_token) + if 
self.current_tool_index >= tool_starts_count: + # We're past all tool calls, shouldn't be here + return None + + # We're in a tool call, find the current tool call portion + # Need to find the correct tool call based on current_tool_index + tool_starts: list[int] = [] + idx = 0 + while True: + idx = current_text.find(self.tool_call_start_token, idx) + if idx == -1: + break + tool_starts.append(idx) + idx += len(self.tool_call_start_token) + + if self.current_tool_index >= len(tool_starts): + # No more tool calls to process yet + return None + + tool_start_idx = tool_starts[self.current_tool_index] + # Find where this tool call ends (or current position if not ended yet) + tool_end_idx = current_text.find(self.tool_call_end_token, + tool_start_idx) + if tool_end_idx == -1: + tool_text = current_text[tool_start_idx:] + else: + tool_text = current_text[tool_start_idx:tool_end_idx + + len(self.tool_call_end_token)] + + # Looking for function header + if not self.header_sent: + if self.tool_call_prefix in tool_text: + func_start = (tool_text.find(self.tool_call_prefix) + + len(self.tool_call_prefix)) + func_end = tool_text.find(">", func_start) + + if func_end != -1: + # Found complete function name + self.current_function_name = tool_text[func_start:func_end] + self.current_tool_string_id = self._generate_tool_call_id() + self.header_sent = True + self.in_function = True + + # IMPORTANT: Add to prev_tool_call_arr immediately when we + # detect a tool call. This ensures + # finish_reason="tool_calls" even if parsing isn't complete + already_added = any( + tool.get("name") == self.current_function_name + for tool in self.prev_tool_call_arr) + if not already_added: + self.prev_tool_call_arr.append({ + "name": self.current_function_name, + "arguments": + "{}", # Placeholder, will be updated later + }) + + # Send header with function info + return DeltaMessage(tool_calls=[ + DeltaToolCall( + index=self.current_tool_index, + id=self.current_tool_string_id, + function=DeltaFunctionCall( + name=self.current_function_name, arguments=""), + type="function", + ) + ]) + return None + + # We've sent header, now handle function body + if self.in_function: + # Send opening brace if not sent yet + if (not self.json_started + and self.parameter_prefix not in delta_text): + self.json_started = True + return DeltaMessage(tool_calls=[ + DeltaToolCall( + index=self.current_tool_index, + function=DeltaFunctionCall(arguments="{"), + ) + ]) + + # Make sure json_started is set if we're processing parameters + if not self.json_started: + self.json_started = True + + # Check for function end in accumulated text + if not self.json_closed and self.function_end_token in tool_text: + # Close JSON + self.json_closed = True + + # Extract the complete tool call to update prev_tool_call_arr + # with final arguments. 
Find the function content + func_start = (tool_text.find(self.tool_call_prefix) + + len(self.tool_call_prefix)) + func_content_end = tool_text.find(self.function_end_token, + func_start) + if func_content_end != -1: + func_content = tool_text[func_start:func_content_end] + # Parse to get the complete arguments + try: + parsed_tool = self._parse_xml_function_call( + func_content, request.tools if request else None) + if parsed_tool: + # Update existing entry in prev_tool_call_arr with + # complete arguments + for i, tool in enumerate(self.prev_tool_call_arr): + if (tool.get("name") == + parsed_tool.function.name): + self.prev_tool_call_arr[i]["arguments"] = ( + parsed_tool.function.arguments) + break + except Exception: + pass # Ignore parsing errors during streaming + + result = DeltaMessage(tool_calls=[ + DeltaToolCall( + index=self.current_tool_index, + function=DeltaFunctionCall(arguments="}"), + ) + ]) + + # Reset state for next tool + self.in_function = False + self.json_closed = True + + return result + + # Look for parameters + # Count how many complete parameters we have processed + complete_params = tool_text.count(self.parameter_end_token) + + # Check if we should start a new parameter + if not self.in_param and self.param_count < complete_params: + # Find the unprocessed parameter + # Count parameter starts + param_starts = [] + idx = 0 + while True: + idx = tool_text.find(self.parameter_prefix, idx) + if idx == -1: + break + param_starts.append(idx) + idx += len(self.parameter_prefix) + + if len(param_starts) > self.param_count: + # Process the next parameter + param_idx = param_starts[self.param_count] + param_start = param_idx + len(self.parameter_prefix) + remaining = tool_text[param_start:] + + if ">" in remaining: + # We have the complete parameter name + name_end = remaining.find(">") + self.current_param_name = remaining[:name_end] + + # Find the parameter value + value_start = param_start + name_end + 1 + value_text = tool_text[value_start:] + if value_text.startswith("\n"): + value_text = value_text[1:] + + # Find where this parameter ends + param_end_idx = value_text.find( + self.parameter_end_token) + if param_end_idx != -1: + # Complete parameter found + param_value = value_text[:param_end_idx] + if param_value.endswith("\n"): + param_value = param_value[:-1] + + # Build complete JSON fragment for this parameter + if self.param_count == 0: + json_fragment = ( + '"' + self.current_param_name + '": "' + + json.dumps(param_value)[1:-1] + '"') + else: + json_fragment = ( + ', "' + self.current_param_name + '": "' + + json.dumps(param_value)[1:-1] + '"') + + self.param_count += 1 + + return DeltaMessage(tool_calls=[ + DeltaToolCall( + index=self.current_tool_index, + function=DeltaFunctionCall( + arguments=json_fragment), + ) + ]) + + # Continue parameter value + if self.in_param: + if self.parameter_end_token in delta_text: + # End of parameter + end_idx = delta_text.find(self.parameter_end_token) + value_chunk = delta_text[:end_idx] + + # Skip past > if at start + if not self.current_param_value and ">" in value_chunk: + gt_idx = value_chunk.find(">") + value_chunk = value_chunk[gt_idx + 1:] + + if (not self.current_param_value + and value_chunk.startswith("\n")): + value_chunk = value_chunk[1:] + + # Calculate incremental JSON + full_value = self.current_param_value + value_chunk + prev_escaped = (json.dumps(self.current_param_value)[1:-1] + if self.current_param_value else "") + full_escaped = json.dumps(full_value)[1:-1] + delta_escaped = 
full_escaped[len(prev_escaped):] + + self.in_param = False + self.current_param_value = "" + + return DeltaMessage(tool_calls=[ + DeltaToolCall( + index=self.current_tool_index, + function=DeltaFunctionCall( + arguments=delta_escaped + '"'), + ) + ]) + else: + # Continue accumulating value + value_chunk = delta_text + + # Handle first chunk after param name + if not self.current_param_value and ">" in value_chunk: + gt_idx = value_chunk.find(">") + value_chunk = value_chunk[gt_idx + 1:] + + if (not self.current_param_value + and value_chunk.startswith("\n")): + value_chunk = value_chunk[1:] + + if value_chunk: + # Stream the escaped delta + prev_escaped = (json.dumps( + self.current_param_value)[1:-1] + if self.current_param_value else "") + self.current_param_value += value_chunk + full_escaped = json.dumps( + self.current_param_value)[1:-1] + delta_escaped = full_escaped[len(prev_escaped):] + + if delta_escaped: + return DeltaMessage(tool_calls=[ + DeltaToolCall( + index=self.current_tool_index, + function=DeltaFunctionCall( + arguments=delta_escaped), + ) + ]) + + return None From d64c0ff78d482c347e595b3957a3cac1387afd3b Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Tue, 22 Jul 2025 16:18:42 -0700 Subject: [PATCH 37/63] [Misc] Copy HF_TOKEN env var to Ray workers (#21406) Signed-off-by: Rui Qiao Signed-off-by: qizixi --- vllm/executor/ray_distributed_executor.py | 6 +++++- vllm/ray/ray_env.py | 5 +++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index 417750a08c69..e9ad62aeb99a 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -58,6 +58,9 @@ class RayDistributedExecutor(DistributedExecutorBase): "VLLM_HOST_IP", "VLLM_HOST_PORT", "LOCAL_RANK", "CUDA_VISIBLE_DEVICES" } + # These non-vLLM env vars are copied from the driver to workers + ADDITIONAL_ENV_VARS = {"HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"} + uses_ray: bool = True def _init_executor(self) -> None: @@ -326,7 +329,8 @@ def sort_by_driver_then_worker_ip(item: RayWorkerMetaData): # Environment variables to copy from driver to workers env_vars_to_copy = get_env_vars_to_copy( exclude_vars=self.WORKER_SPECIFIC_ENV_VARS, - additional_vars=set(current_platform.additional_env_vars), + additional_vars=set(current_platform.additional_env_vars).union( + self.ADDITIONAL_ENV_VARS), destination="workers") # Copy existing env vars to each worker's args diff --git a/vllm/ray/ray_env.py b/vllm/ray/ray_env.py index 716d0bfafae5..f6a994bb3c22 100644 --- a/vllm/ray/ray_env.py +++ b/vllm/ray/ray_env.py @@ -43,6 +43,8 @@ def get_env_vars_to_copy(exclude_vars: Optional[set[str]] = None, exclude_vars: A set of vllm defined environment variables to exclude from copying. additional_vars: A set of additional environment variables to copy. + If a variable is in both exclude_vars and additional_vars, it will + be excluded. destination: The destination of the environment variables. Returns: A set of environment variables to copy. 
@@ -52,10 +54,9 @@ def get_env_vars_to_copy(exclude_vars: Optional[set[str]] = None, env_vars_to_copy = { v - for v in envs.environment_variables + for v in set(envs.environment_variables).union(additional_vars) if v not in exclude_vars and v not in RAY_NON_CARRY_OVER_ENV_VARS } - env_vars_to_copy.update(additional_vars) to_destination = " to " + destination if destination is not None else "" From b2f76139db543f825b859b7527764ee00849e20f Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Tue, 22 Jul 2025 17:19:55 -0600 Subject: [PATCH 38/63] [BugFix] Fix ray import error mem cleanup bug (#21381) Signed-off-by: Travis Johnson Signed-off-by: Joe Runde Co-authored-by: Travis Johnson Signed-off-by: qizixi --- vllm/config.py | 5 +++-- vllm/executor/ray_utils.py | 8 +++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index d649eb75033f..6623a48f839a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2137,10 +2137,11 @@ def __post_init__(self) -> None: elif (current_platform.is_cuda() and cuda_device_count_stateless() < self.world_size): if not ray_found: - raise ValueError("Unable to load Ray which is " + raise ValueError("Unable to load Ray: " + f"{ray_utils.ray_import_err}. Ray is " "required for multi-node inference, " "please install Ray with `pip install " - "ray`.") from ray_utils.ray_import_err + "ray`.") backend = "ray" elif self.data_parallel_backend == "ray": logger.info("Using ray distributed inference because " diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index c222f1609096..033ecc00853b 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -145,7 +145,9 @@ def override_env_vars(self, vars: Dict[str, str]): except ImportError as e: ray = None # type: ignore - ray_import_err = e + # only capture string to avoid variable references in the traceback that can + # prevent garbage collection in some cases + ray_import_err = str(e) RayWorkerWrapper = None # type: ignore @@ -157,8 +159,8 @@ def ray_is_available() -> bool: def assert_ray_available(): """Raise an exception if Ray is not available.""" if ray is None: - raise ValueError("Failed to import Ray, please install Ray with " - "`pip install ray`.") from ray_import_err + raise ValueError(f"Failed to import Ray: {ray_import_err}." 
+ "Please install Ray with `pip install ray`.") def _verify_bundles(placement_group: "PlacementGroup", From 74d8cbc5bc7481dc55513d4c7e87ec206ca4c08b Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 23 Jul 2025 11:25:37 +0800 Subject: [PATCH 39/63] [CI/Build] Fix model executor tests (#21387) Signed-off-by: DarkLight1337 Signed-off-by: qizixi --- .buildkite/test-pipeline.yaml | 1 - tests/model_executor/test_model_load_with_params.py | 13 +++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c476f71c6637..f4b69fa21ec4 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -434,7 +434,6 @@ steps: - label: Model Executor Test mirror_hardwares: [amdexperimental, amdproduction] - soft_fail: true source_file_dependencies: - vllm/model_executor - tests/model_executor diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 273747630215..aae9a4d1ef11 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -5,7 +5,8 @@ import pytest -from vllm.model_executor.layers.pooler import CLSPool, MeanPool, PoolingType +from vllm.model_executor.layers.pooler import (CLSPool, DispatchPooler, + MeanPool, PoolingType) from vllm.model_executor.models.bert import BertEmbeddingModel from vllm.model_executor.models.roberta import RobertaEmbeddingModel from vllm.platforms import current_platform @@ -49,7 +50,8 @@ def test_model_loading_with_params(vllm_runner): def check_model(model): assert isinstance(model, BertEmbeddingModel) - assert isinstance(model.pooler.pooling, CLSPool) + assert isinstance(pooler := model.pooler, DispatchPooler) + assert isinstance(pooler.poolers_by_task["embed"].pooling, CLSPool) vllm_model.apply_model(check_model) @@ -87,7 +89,9 @@ def test_roberta_model_loading_with_params(vllm_runner): def check_model(model): assert isinstance(model, RobertaEmbeddingModel) - assert isinstance(model.pooler.pooling, MeanPool) + assert isinstance(pooler := model.pooler, DispatchPooler) + assert isinstance(pooler.poolers_by_task["embed"].pooling, + MeanPool) vllm_model.apply_model(check_model) @@ -114,7 +118,8 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner): def check_model(model): assert isinstance(model, RobertaEmbeddingModel) assert not hasattr(model, "lm_head") - assert isinstance(model.pooler.pooling, CLSPool) + assert isinstance(pooler := model.pooler, DispatchPooler) + assert isinstance(pooler.poolers_by_task["embed"].pooling, CLSPool) vllm_model.apply_model(check_model) From 701a33156ea7716028368a04f52651a933080f29 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Tue, 22 Jul 2025 23:27:41 -0400 Subject: [PATCH 40/63] [Bugfix][ROCm][Build] Fix build regression on ROCm (#21393) Signed-off-by: Gregory Shtrasberg Signed-off-by: qizixi --- CMakeLists.txt | 4 ++-- csrc/ops.h | 10 +++++----- csrc/torch_bindings.cpp | 18 +++++++++--------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 767e9ad7541b..98ed682fee7d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -245,7 +245,6 @@ set(VLLM_EXT_SRC "csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/fp8/common.cu" - "csrc/quantization/fp8/per_token_group_quant.cu" 
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu" "csrc/quantization/gguf/gguf_kernel.cu" "csrc/quantization/activation_kernels.cu" @@ -297,7 +296,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu" "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu" "csrc/cutlass_extensions/common.cpp" - "csrc/attention/mla/cutlass_mla_entry.cu") + "csrc/attention/mla/cutlass_mla_entry.cu" + "csrc/quantization/fp8/per_token_group_quant.cu") set_gencode_flags_for_srcs( SRCS "${VLLM_EXT_SRC}" diff --git a/csrc/ops.h b/csrc/ops.h index fdd3071c56ef..97a247d9d628 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -287,6 +287,11 @@ void scaled_fp4_experts_quant( torch::Tensor const& input, torch::Tensor const& input_global_scale, torch::Tensor const& input_offset_by_experts, torch::Tensor const& output_scale_offset_by_experts); + +void per_token_group_quant_fp8(const torch::Tensor& input, + torch::Tensor& output_q, torch::Tensor& output_s, + int64_t group_size, double eps, double fp8_min, + double fp8_max, bool scale_ue8m0); #endif void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, @@ -297,11 +302,6 @@ void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scales, std::optional const& azp); -void per_token_group_quant_fp8(const torch::Tensor& input, - torch::Tensor& output_q, torch::Tensor& output_s, - int64_t group_size, double eps, double fp8_min, - double fp8_max, bool scale_ue8m0); - torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight, torch::Tensor b_gptq_qzeros, torch::Tensor b_gptq_scales, torch::Tensor b_g_idx, diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index d310211afe43..95f8541bc9e2 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -601,15 +601,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.impl("dynamic_scaled_int8_quant", torch::kCUDA, &dynamic_scaled_int8_quant); - // Compute per-token-group FP8 quantized tensor and scaling factor. - ops.def( - "per_token_group_fp8_quant(Tensor input, Tensor! output_q, Tensor! " - "output_s, " - "int group_size, float eps, float fp8_min, float fp8_max, bool " - "scale_ue8m0) -> ()"); - ops.impl("per_token_group_fp8_quant", torch::kCUDA, - &per_token_group_quant_fp8); - // Mamba selective scan kernel ops.def( "selective_scan_fwd(Tensor! u, Tensor! delta," @@ -624,6 +615,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd); #ifndef USE_ROCM + // Compute per-token-group FP8 quantized tensor and scaling factor. + ops.def( + "per_token_group_fp8_quant(Tensor input, Tensor! output_q, Tensor! 
" + "output_s, " + "int group_size, float eps, float fp8_min, float fp8_max, bool " + "scale_ue8m0) -> ()"); + ops.impl("per_token_group_fp8_quant", torch::kCUDA, + &per_token_group_quant_fp8); + // reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel ops.def( "rearrange_kn_weight_as_n32k16_order(Tensor b_qweight, Tensor b_scales, " From 7c61321e1f0b9bc5a614078a10ca868ed56f5f10 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 23 Jul 2025 04:29:43 +0100 Subject: [PATCH 41/63] Simplify weight loading in Transformers backend (#21382) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: qizixi --- tests/distributed/test_pipeline_parallel.py | 4 +- tests/lora/test_transformers_model.py | 2 +- tests/models/registry.py | 2 +- tests/models/test_transformers.py | 2 +- vllm/model_executor/models/interfaces.py | 10 +- vllm/model_executor/models/transformers.py | 107 ++++++++------------ vllm/test_utils.py | 2 +- 7 files changed, 53 insertions(+), 76 deletions(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 926a33c949eb..2391430a083a 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -177,7 +177,7 @@ def iter_params(self, model_id: str): "ai21labs/Jamba-tiny-dev": PPTestSettings.fast(), "meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(), # Tests TransformersForCausalLM - "ArthurZ/Ilama-3.2-1B": PPTestSettings.fast(), + "hmellor/Ilama-3.2-1B": PPTestSettings.fast(), "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(), "openbmb/MiniCPM3-4B": PPTestSettings.fast(), # Uses Llama @@ -249,7 +249,7 @@ def iter_params(self, model_id: str): # [LANGUAGE GENERATION] "microsoft/Phi-3.5-MoE-instruct", "meta-llama/Llama-3.2-1B-Instruct", - "ArthurZ/Ilama-3.2-1B", + "hmellor/Ilama-3.2-1B", "ibm/PowerLM-3b", "deepseek-ai/DeepSeek-V2-Lite-Chat", # [LANGUAGE EMBEDDING] diff --git a/tests/lora/test_transformers_model.py b/tests/lora/test_transformers_model.py index 5065a2fb7164..723f7a54778f 100644 --- a/tests/lora/test_transformers_model.py +++ b/tests/lora/test_transformers_model.py @@ -9,7 +9,7 @@ from ..utils import create_new_process_for_each_test, multi_gpu_test -MODEL_PATH = "ArthurZ/ilama-3.2-1B" +MODEL_PATH = "hmellor/Ilama-3.2-1B" PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. 
concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 diff --git a/tests/models/registry.py b/tests/models/registry.py index 776b4c033564..257ca36db3a0 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -500,7 +500,7 @@ def check_available_online( } _TRANSFORMERS_MODELS = { - "TransformersForCausalLM": _HfExamplesInfo("ArthurZ/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501 + "TransformersForCausalLM": _HfExamplesInfo("hmellor/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501 "TransformersForMultimodalLM": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"), } diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index 16b9bcffd265..cd5b6193d001 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -56,7 +56,7 @@ def check_implementation( "model,model_impl", [ ("meta-llama/Llama-3.2-1B-Instruct", "transformers"), - ("ArthurZ/Ilama-3.2-1B", "auto"), # CUSTOM CODE + ("hmellor/Ilama-3.2-1B", "auto"), # CUSTOM CODE ]) # trust_remote_code=True by default def test_models( hf_runner: type[HfRunner], diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 7f3efde43474..8f6a7db7aa8d 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -624,13 +624,9 @@ def __new__(cls, *args, **kwargs) -> Self: instance.quant_config = quant_config # apply model mappings to config for proper config-model matching - # NOTE: `TransformersForCausalLM` is not supported due to how this - # class defines `hf_to_vllm_mapper` as a post-init `@property`. - # After this is fixed, get `instance.hf_to_vllm_mapper` directly - if getattr(instance, "hf_to_vllm_mapper", None) is not None: - instance.quant_config.apply_vllm_mapper( - instance.hf_to_vllm_mapper) - if getattr(instance, "packed_modules_mapping", None) is not None: + if (hf_to_vllm_mapper := instance.hf_to_vllm_mapper) is not None: + instance.quant_config.apply_vllm_mapper(hf_to_vllm_mapper) + if instance.packed_modules_mapping is not None: instance.quant_config.packed_modules_mapping.update( instance.packed_modules_mapping) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index cb9d28b10672..610f8e752dbd 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -414,7 +414,7 @@ def __exit__(self, exc_type, exc_value, traceback): setattr(self.config, key, value) -class TransformersModel(nn.Module): +class TransformersModel: def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -454,9 +454,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # method after v4.54.0 is released self.text_config._attn_implementation = "vllm" with init_on_device_without_buffers("meta"), config_override: - # FIXME(Isotr0py): We need to refactor this part in the future to - # avoid registering an extra model layer, otherwise we will need a - # weights mapper to rename weights. 
self.model: PreTrainedModel = AutoModel.from_config( config, torch_dtype=model_config.dtype, @@ -620,9 +617,6 @@ def init_parameters(self, module: nn.Module): for child in module.children(): self.init_parameters(child) - def get_input_embeddings(self) -> nn.Module: - return self.model.get_input_embeddings() - def forward( self, input_ids: Optional[torch.Tensor], @@ -694,7 +688,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.config = config - self.model = TransformersModel(vllm_config=vllm_config, prefix=prefix) + self.transformers_model = TransformersModel(vllm_config=vllm_config, + prefix=prefix) + self.model = self.transformers_model.model if get_pp_group().is_last_rank: self.unpadded_vocab_size = config.vocab_size @@ -716,22 +712,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.lm_head = PPMissingLayer() self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors) - - # FIXME(Isotr0py): Don't use any weights mapper for Transformers backend, - # this makes thing complicated. We need to remove this mapper after refactor - # `TransformersModel` in the future. - # NOTE: `SupportsQuant` can be updated after property decorator is removed - @property - def hf_to_vllm_mapper(self): - prefix_mapper = { - name: "model." + name - for name, _ in self.model.model.named_children() - } - return WeightsMapper( - orig_to_new_substr={"model.": "model.model."}, - orig_to_new_prefix=prefix_mapper, - ) + self.transformers_model.make_empty_intermediate_tensors) def forward( self, @@ -740,8 +721,9 @@ def forward( intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: - model_output = self.model(input_ids, positions, intermediate_tensors, - inputs_embeds) + model_output = self.transformers_model.forward(input_ids, positions, + intermediate_tensors, + inputs_embeds) return model_output def compute_logits( @@ -755,12 +737,10 @@ def compute_logits( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader( - self, - skip_prefixes=(["lm_head."] - if self.config.tie_word_embeddings else None), - ) - return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + skip_prefixes = ["lm_head." + ] if self.config.tie_word_embeddings else None + loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) + return loader.load_weights(weights) @MULTIMODAL_REGISTRY.register_processor( @@ -772,6 +752,29 @@ class TransformersForMultimodalLM(nn.Module, SupportsQuant, SupportsLoRA, embedding_padding_modules = ["lm_head"] embedding_modules = ["embed_tokens"] + # Backwards compatibility for prev released models. State dicts back then + # had different formats and cannot be loaded with `AutoModel` mapping as is + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "language_model.model": "model.language_model", + "text_model.model": "model.text_model", + "vision_tower": "model.vision_tower", + "vqmodel": "model.vqmodel", + "visual": "model.visual", + "vision_model": "model.vision_model", + "vision_embed_tokens": "model.vision_embed_tokens", + "image_newline": "model.image_newline", + "multi_modal_projector": "model.multi_modal_projector", + "text_model.lm_head": "lm_head", + "language_model.lm_head": "lm_head", + # Qwen models used "model" as the name for the language model. 
+ # Therefore, we must map each of submodule explicitly to avoid + # conflicts with newer models that use "model.language_model". + "model.embed_tokens": "model.language_model.embed_tokens", + "model.layers": "model.language_model.layers", + "model.norm": "model.language_model.norm", + }) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config: PretrainedConfig = vllm_config.model_config.hf_config @@ -780,7 +783,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.config = config self.dtype = vllm_config.model_config.dtype - self.model = TransformersModel(vllm_config=vllm_config, prefix=prefix) + self.transformers_model = TransformersModel(vllm_config=vllm_config, + prefix=prefix) + self.model = self.transformers_model.model text_config = config.get_text_config() if get_pp_group().is_last_rank: @@ -803,32 +808,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.lm_head = PPMissingLayer() self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors) - - @property - def hf_to_vllm_mapper(self): - # Backwards compatibility for prev released models - # State dicts back then had different formats - # and cannot be loaded with `AutoModel` mapping - # as is - prefix_mapper = { - "language_model.model": "model.language_model", - "text_model.model": "model.text_model", - "vision_tower": "model.vision_tower", - "vqmodel": "model.vqmodel", - "vision_model": "model.vision_model", - "vision_embed_tokens": "model.vision_embed_tokens", - "image_newline": "model.image_newline", - "multi_modal_projector": "model.multi_modal_projector", - "text_model.lm_head": "lm_head", - "language_model.lm_head": "lm_head", - } - # Don't change the order for QwenVL - if 'Qwen2' in self.config.__class__.__name__: - prefix_mapper["model"] = "model.language_model" - prefix_mapper["visual"] = "model.visual" - - return WeightsMapper(orig_to_new_prefix=prefix_mapper, ) + self.transformers_model.make_empty_intermediate_tensors) def forward( self, @@ -848,8 +828,9 @@ def forward( input_ids, multimodal_embeds) input_ids = None - model_output = self.model(input_ids, positions, intermediate_tensors, - inputs_embeds) + model_output = self.transformers_model.forward(input_ids, positions, + intermediate_tensors, + inputs_embeds) return model_output def compute_logits( @@ -898,7 +879,7 @@ def get_multimodal_embeddings(self, **kwargs): if isinstance(num_image_patches, list): num_image_patches = torch.cat(num_image_patches) - vision_embeddings = self.model.model.get_image_features( + vision_embeddings = self.model.get_image_features( pixel_values, **{ k: v.flatten(0, 1) @@ -928,7 +909,7 @@ def get_input_embeddings( input_ids: torch.Tensor, multimodal_embeddings=None, ) -> torch.Tensor: - inputs_embeds = self.model.model.get_input_embeddings()(input_ids) + inputs_embeds = self.model.get_input_embeddings()(input_ids) if (multimodal_embeddings is not None and len(multimodal_embeddings) != 0): mask = (input_ids == self.config.image_token_id) diff --git a/vllm/test_utils.py b/vllm/test_utils.py index c6b126d002b2..1e61ca6b3dea 100644 --- a/vllm/test_utils.py +++ b/vllm/test_utils.py @@ -10,7 +10,7 @@ "allenai/OLMoE-1B-7B-0924-Instruct", "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test", "AMead10/Llama-3.2-1B-Instruct-AWQ", - "ArthurZ/Ilama-3.2-1B", + "hmellor/Ilama-3.2-1B", "BAAI/bge-base-en-v1.5", "BAAI/bge-multilingual-gemma2", "BAAI/bge-reranker-v2-m3", From f8e8456406cbe3230da1f8b82c44b6086b1a3593 Mon Sep 17 00:00:00 2001 From: 
ericehanley Date: Tue, 22 Jul 2025 22:33:00 -0500 Subject: [PATCH 42/63] [BugFix] Update python to python3 calls for image; fix prefix & input calculations. (#21391) Signed-off-by: Eric Hanley Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Signed-off-by: qizixi --- benchmarks/auto_tune/auto_tune.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh index 159ee1421475..eaa28ea5c92b 100644 --- a/benchmarks/auto_tune/auto_tune.sh +++ b/benchmarks/auto_tune/auto_tune.sh @@ -126,11 +126,12 @@ run_benchmark() { # get a basic qps by using request-rate inf bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt" prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 )) - python benchmarks/benchmark_serving.py \ +adjusted_input_len=$(( INPUT_LEN - prefix_len )) + python3 benchmarks/benchmark_serving.py \ --backend vllm \ --model $MODEL \ --dataset-name random \ - --random-input-len $INPUT_LEN \ + --random-input-len $adjusted_input_len \ --random-output-len $OUTPUT_LEN \ --ignore-eos \ --disable-tqdm \ @@ -159,11 +160,11 @@ run_benchmark() { curl -X POST http://0.0.0.0:8004/reset_prefix_cache sleep 5 bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt" - python benchmarks/benchmark_serving.py \ + python3 benchmarks/benchmark_serving.py \ --backend vllm \ --model $MODEL \ --dataset-name random \ - --random-input-len $INPUT_LEN \ + --random-input-len $adjusted_input_len \ --random-output-len $OUTPUT_LEN \ --ignore-eos \ --disable-tqdm \ From 062ac7122b90093c3ea4429d9e5e12afde349bec Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Tue, 22 Jul 2025 22:33:57 -0500 Subject: [PATCH 43/63] [BUGFIX] deepseek-v2-lite failed due to fused_qkv_a_proj name update (#21414) Signed-off-by: Chendi.Xue Signed-off-by: qizixi --- vllm/model_executor/models/deepseek_v2.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 649109777b3f..79ddd3d0f627 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -885,13 +885,16 @@ def load_weights(self, weights: Iterable[tuple[str, # for mlp.experts[0].gate_gate_up_proj, which breaks load. if (("mlp.experts." in name) and name not in params_dict): continue - name = name.replace(weight_name, param_name) + name_mapped = name.replace(weight_name, param_name) # QKV fusion is optional, fall back to normal # weight loading if it's not enabled + # if go with fusion option, then update name if ((param_name == "fused_qkv_a_proj") - and name not in params_dict): + and name_mapped not in params_dict): continue + else: + name = name_mapped # Skip loading extra bias for GPTQ models. 
if name.endswith(".bias") and name not in params_dict: continue From 44653f8ffe4bba9d893a300751ddf560ac73531d Mon Sep 17 00:00:00 2001 From: elvischenv <219235043+elvischenv@users.noreply.github.com> Date: Wed, 23 Jul 2025 11:34:50 +0800 Subject: [PATCH 44/63] [Bugfix][CUDA] fixes CUDA FP8 kv cache dtype supported (#21420) Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com> Signed-off-by: qizixi --- vllm/platforms/cuda.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index cc2543538d0d..9a8941e3cdd1 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -456,6 +456,19 @@ def stateless_init_device_torch_dist_pg( def device_count(cls) -> int: return cuda_device_count_stateless() + @classmethod + def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str) -> bool: + fp8_attention = kv_cache_dtype.startswith("fp8") + will_use_fa = (not envs.is_set("VLLM_ATTENTION_BACKEND") + ) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1" + supported = False + if cls.is_device_capability(100): + supported = True + elif fp8_attention and will_use_fa: + from vllm.attention.utils.fa_utils import flash_attn_supports_fp8 + supported = flash_attn_supports_fp8() + return supported + # NVML utils # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, @@ -583,19 +596,6 @@ def is_fully_connected(cls, physical_device_ids: list[int]) -> bool: " not found. Assuming no NVLink available.") return False - @classmethod - def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str) -> bool: - fp8_attention = kv_cache_dtype.startswith("fp8") - will_use_fa = (not envs.is_set("VLLM_ATTENTION_BACKEND") - ) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1" - supported = False - if cls.is_device_capability(100): - supported = True - elif fp8_attention and will_use_fa: - from vllm.attention.utils.fa_utils import flash_attn_supports_fp8 - supported = flash_attn_supports_fp8() - return supported - # Autodetect either NVML-enabled or non-NVML platform # based on whether NVML is available. From 071801e4b33c1bdf4b55924cb351ba9194d22d19 Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Tue, 22 Jul 2025 22:48:31 -0500 Subject: [PATCH 45/63] Changing "amdproduction" allocation. (#21409) Signed-off-by: Alexei V. 
Ivanov Signed-off-by: qizixi --- .buildkite/test-pipeline.yaml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index f4b69fa21ec4..00608229b95e 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -225,7 +225,7 @@ steps: ##### 1 GPU test ##### - label: Regression Test # 5min - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] source_file_dependencies: - vllm/ - tests/test_regression @@ -277,7 +277,7 @@ steps: - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - label: Examples Test # 25min - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] working_dir: "/vllm-workspace/examples" source_file_dependencies: - vllm/entrypoints @@ -311,7 +311,7 @@ steps: - label: Platform Tests (CUDA) - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] source_file_dependencies: - vllm/ - tests/cuda @@ -330,7 +330,7 @@ steps: - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - label: LoRA Test %N # 15min each - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/lora - tests/lora @@ -382,7 +382,7 @@ steps: - pytest -v -s kernels/core - label: Kernels Attention Test %N - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/attention/ - vllm/attention @@ -393,7 +393,7 @@ steps: parallelism: 2 - label: Kernels Quantization Test %N - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/quantization/ - vllm/model_executor/layers/quantization @@ -412,7 +412,7 @@ steps: - pytest -v -s kernels/moe - label: Kernels Mamba Test - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] source_file_dependencies: - csrc/mamba/ - tests/kernels/mamba @@ -420,7 +420,7 @@ steps: - pytest -v -s kernels/mamba - label: Tensorizer Test # 11min - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] soft_fail: true source_file_dependencies: - vllm/model_executor/model_loader @@ -490,7 +490,7 @@ steps: - pytest -s entrypoints/openai/correctness/ - label: Encoder Decoder tests # 5min - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] source_file_dependencies: - vllm/ - tests/encoder_decoder @@ -498,7 +498,7 @@ steps: - pytest -v -s encoder_decoder - label: OpenAI-Compatible Tool Use # 20 min - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] fast_check: false source_file_dependencies: - vllm/ @@ -610,7 +610,7 @@ steps: - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' - label: Quantized Models Test - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/model_executor/layers/quantization - tests/models/quantization From a7f791dfeed00912231f40bc7ec592ac7494c90b Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Wed, 23 Jul 2025 15:01:01 +0800 Subject: [PATCH 46/63] [Bugfix] Fix nightly transformers CI failure (#21427) Signed-off-by: Isotr0py <2037008807@qq.com> Signed-off-by: qizixi --- tests/models/registry.py | 12 ++-- vllm/model_executor/models/tarsier.py | 6 +- 
vllm/transformers_utils/config.py | 2 + vllm/transformers_utils/configs/__init__.py | 2 + .../transformers_utils/configs/nemotron_vl.py | 56 +++++++++++++++++++ 5 files changed, 67 insertions(+), 11 deletions(-) create mode 100644 vllm/transformers_utils/configs/nemotron_vl.py diff --git a/tests/models/registry.py b/tests/models/registry.py index 257ca36db3a0..1eb7f7b9d829 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -443,6 +443,12 @@ def check_available_online( hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}), # noqa: E501 "Tarsier2ForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier2-Recap-7b", # noqa: E501 hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}), # noqa: E501 + "VoxtralForConditionalGeneration": _HfExamplesInfo( + "mistralai/Voxtral-Mini-3B-2507", + min_transformers_version="4.54", + # disable this temporarily until we support HF format + is_available_online=False, + ), # [Encoder-decoder] # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer # Therefore, we borrow the BartTokenizer from the original Bart model @@ -450,13 +456,7 @@ def check_available_online( tokenizer="Isotr0py/Florence-2-tokenizer", # noqa: E501 trust_remote_code=True), # noqa: E501 "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501 - "VoxtralForConditionalGeneration": _HfExamplesInfo( - "mistralai/Voxtral-Mini-3B-2507", - tokenizer_mode="mistral", - min_transformers_version="4.54" - ), "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501 - # [Cross-encoder] "JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"), # noqa: E501 } diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py index 25f026e9bef8..979d789b330c 100644 --- a/vllm/model_executor/models/tarsier.py +++ b/vllm/model_executor/models/tarsier.py @@ -13,8 +13,7 @@ from transformers import PretrainedConfig, SiglipVisionConfig from transformers.image_utils import ImageInput, get_image_size, to_numpy_array from transformers.models.llava import LlavaProcessor -from transformers.processing_utils import (ProcessingKwargs, Unpack, - _validate_images_text_input_order) +from transformers.processing_utils import ProcessingKwargs, Unpack from transformers.tokenization_utils_base import PreTokenizedInput, TextInput from vllm.config import VllmConfig @@ -94,9 +93,6 @@ def __call__( raise ValueError( "You have to specify at least one of `images` or `text`.") - # check if images and text inputs are reversed for BC - images, text = _validate_images_text_input_order(images, text) - output_kwargs = self._merge_kwargs( TarsierProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 2e66dc16b47a..8d1f59e6eadf 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -37,6 +37,7 @@ MiniMaxText01Config, MiniMaxVL01Config, MllamaConfig, MLPSpeculatorConfig, MPTConfig, + Nemotron_Nano_VL_Config, NemotronConfig, NVLM_D_Config, OvisConfig, RWConfig, SkyworkR1VChatConfig, SolarConfig, @@ -80,6 +81,7 @@ def _get_hf_token() -> Optional[str]: "dbrx": DbrxConfig, "deepseek_vl_v2": DeepseekVLV2Config, "kimi_vl": KimiVLConfig, + "Llama_Nemotron_Nano_VL": Nemotron_Nano_VL_Config, "mpt": MPTConfig, "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) "RefinedWebModel": RWConfig, # For 
tiiuae/falcon-7b(-instruct) diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 5d84d648f1c5..89303213a27e 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -23,6 +23,7 @@ from vllm.transformers_utils.configs.mpt import MPTConfig from vllm.transformers_utils.configs.nemotron import NemotronConfig from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig +from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config from vllm.transformers_utils.configs.ovis import OvisConfig from vllm.transformers_utils.configs.skyworkr1v import SkyworkR1VChatConfig @@ -50,6 +51,7 @@ "KimiVLConfig", "NemotronConfig", "NemotronHConfig", + "Nemotron_Nano_VL_Config", "NVLM_D_Config", "OvisConfig", "SkyworkR1VChatConfig", diff --git a/vllm/transformers_utils/configs/nemotron_vl.py b/vllm/transformers_utils/configs/nemotron_vl.py new file mode 100644 index 000000000000..6a642f26b82a --- /dev/null +++ b/vllm/transformers_utils/configs/nemotron_vl.py @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# yapf: disable +# ruff: noqa: E501 +# Adapted from +# https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/configuration.py +# -------------------------------------------------------- +# Adapted from https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B under MIT License +# LICENSE is in incl_licenses directory. +# -------------------------------------------------------- + +from transformers import LlamaConfig +from transformers.configuration_utils import PretrainedConfig +from transformers.dynamic_module_utils import get_class_from_dynamic_module + + +class Nemotron_Nano_VL_Config(PretrainedConfig): + model_type = 'Llama_Nemotron_Nano_VL' + is_composition = True + + def __init__( + self, + vision_config=None, + llm_config=None, + force_image_size=None, + downsample_ratio=0.5, + template=None, + ps_version='v1', + image_tag_type="internvl", + projector_hidden_size=4096, + vit_hidden_size=1280, + **kwargs + ): + super().__init__(**kwargs) + + if vision_config is not None: + assert "auto_map" in vision_config and "AutoConfig" in vision_config["auto_map"] + vision_auto_config = get_class_from_dynamic_module(*vision_config["auto_map"]["AutoConfig"].split("--")[::-1]) + self.vision_config = vision_auto_config(**vision_config) + else: + self.vision_config = PretrainedConfig() + + if llm_config is None: + self.text_config = LlamaConfig() + else: + self.text_config = LlamaConfig(**llm_config) + + # Assign configuration values + self.force_image_size = force_image_size + self.downsample_ratio = downsample_ratio + self.template = template # TODO move out of here and into the tokenizer + self.ps_version = ps_version # Pixel shuffle version + self.image_tag_type = image_tag_type # TODO: into the tokenizer too? 
+ self.projector_hidden_size = projector_hidden_size + self.vit_hidden_size = vit_hidden_size From c6e12ff697572140a2d9d5291f78895d1a591fbd Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Wed, 23 Jul 2025 00:02:02 -0700 Subject: [PATCH 47/63] [Core] Add basic unit test for maybe_evict_cached_block (#21400) Signed-off-by: Jialin Ouyang Signed-off-by: qizixi --- tests/v1/core/test_prefix_caching.py | 67 ++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index b7f583de1f63..085616303d85 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -1097,6 +1097,73 @@ def test_prefix_cache_stats_disabled(): assert manager.prefix_cache_stats is None +def test_maybe_evict_cached_block(): + pool = BlockPool(num_gpu_blocks=4, enable_caching=True) + block_hash0 = BlockHashWithGroupId(block_hash=BlockHash(hash_value=10, + token_ids=(100, )), + group_id=1000) + block_hash1 = BlockHashWithGroupId(block_hash=BlockHash(hash_value=20, + token_ids=(200, )), + group_id=2000) + block_hash2 = BlockHashWithGroupId(block_hash=BlockHash(hash_value=30, + token_ids=(300, )), + group_id=3000) + block_hashes = [ + block_hash0, + block_hash1, + block_hash2, + # block3 had the exact same block_hash as the first block + block_hash0, + ] + assert len(pool.blocks) == len(block_hashes) + # Manually add all blocks to cached_blocks + for block, block_hash in zip(pool.blocks, block_hashes): + block.block_hash = block_hash + pool.cached_block_hash_to_block[block_hash][block.block_id] = block + + block0, block1, block2, block3 = pool.blocks + assert pool.cached_block_hash_to_block == { + block_hash0: { + block0.block_id: block0, + block3.block_id: block3 + }, + block_hash1: { + block1.block_id: block1 + }, + block_hash2: { + block2.block_id: block2 + } + } + # Evict block1 + pool._maybe_evict_cached_block(block1) + assert pool.cached_block_hash_to_block == { + block_hash0: { + block0.block_id: block0, + block3.block_id: block3 + }, + block_hash2: { + block2.block_id: block2 + } + } + # Evict block0: block_hash0 entry should NOT be removed, as block3 + # also use the same hash + pool._maybe_evict_cached_block(block0) + assert pool.cached_block_hash_to_block == { + block_hash0: { + block3.block_id: block3 + }, + block_hash2: { + block2.block_id: block2 + } + } + # Evict block2 + pool._maybe_evict_cached_block(block2) + assert pool.cached_block_hash_to_block == {block_hash0: {3: block3}} + # Evict block3 + pool._maybe_evict_cached_block(block3) + assert pool.cached_block_hash_to_block == {} + + @pytest.mark.parametrize("blocks_to_cache", [2, 3, 10]) def test_kv_cache_events(blocks_to_cache: int): block_size = 16 From 97c24f6713c31bc0e04b6ed60b02c22e8ead4e09 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 23 Jul 2025 03:02:48 -0400 Subject: [PATCH 48/63] [Cleanup] Only log MoE DP setup warning if DP is enabled (#21315) Signed-off-by: mgoin Signed-off-by: qizixi --- vllm/model_executor/layers/fused_moe/config.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 51c421bd228f..f5ed2861b8fc 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -464,10 +464,11 @@ def make( ) else: _quant_config = FusedMoEQuantConfig() - logger.warning_once("MoE DP setup unable to determine " - "quantization scheme or 
unsupported " - "quantization type. This model will " - "not run with DP enabled.") + if moe_parallel_config.dp_size > 1: + logger.warning_once("MoE DP setup unable to determine " + "quantization scheme or unsupported " + "quantization type. This model will " + "not run with DP enabled.") else: _quant_config = quant_config From d840c8a2b061c9b799f5ce9e14719e5557452a11 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 23 Jul 2025 15:03:16 +0800 Subject: [PATCH 49/63] add clear messages for deprecated models (#21424) Signed-off-by: youkaichao Signed-off-by: qizixi --- vllm/model_executor/model_loader/utils.py | 11 ++++++++++- vllm/model_executor/models/registry.py | 2 ++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 42c5512905f2..4b30336f0132 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -25,7 +25,8 @@ as_reward_model, as_seq_cls_model) from vllm.model_executor.models.interfaces import SupportsQuant -from vllm.model_executor.models.registry import _TRANSFORMERS_MODELS +from vllm.model_executor.models.registry import (_PREVIOUSLY_SUPPORTED_MODELS, + _TRANSFORMERS_MODELS) from vllm.utils import is_pin_memory_available logger = init_logger(__name__) @@ -261,6 +262,14 @@ def get_model_architecture( vllm_not_supported = False break + if any(arch in _PREVIOUSLY_SUPPORTED_MODELS for arch in architectures): + previous_version = _PREVIOUSLY_SUPPORTED_MODELS[architectures[0]] + raise ValueError( + f"Model architecture {architectures[0]} was supported" + f" in vLLM until version {previous_version}, and is " + "not supported anymore. Please use an older version" + " of vLLM if you want to use this model architecture.") + if (model_config.model_impl == ModelImpl.TRANSFORMERS or model_config.model_impl == ModelImpl.AUTO and vllm_not_supported): architectures = resolve_transformers_arch(model_config, architectures) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 9d88b5fe82cf..100532943c2b 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -276,6 +276,8 @@ sys.executable, "-m", "vllm.model_executor.models.registry" ] +_PREVIOUSLY_SUPPORTED_MODELS = {"Phi3SmallForCausalLM": "0.9.2"} + @dataclass(frozen=True) class _ModelInfo: From 0790c5e5820f17d1ec921ea00a805d37a10707fd Mon Sep 17 00:00:00 2001 From: Guillaume Calmettes Date: Wed, 23 Jul 2025 09:30:05 +0200 Subject: [PATCH 50/63] [Bugfix] ensure tool_choice is popped when `tool_choice:null` is passed in json payload (#19679) Signed-off-by: Guillaume Calmettes Signed-off-by: qizixi --- vllm/entrypoints/openai/protocol.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 95e5bcd3bae1..6c6ec207a3ca 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -841,7 +841,7 @@ def check_tool_usage(cls, data): return data # if "tool_choice" is specified -- validation - if "tool_choice" in data: + if "tool_choice" in data and data["tool_choice"] is not None: # ensure that if "tool choice" is specified, tools are present if "tools" not in data or data["tools"] is None: @@ -853,7 +853,7 @@ def check_tool_usage(cls, data): if data["tool_choice"] not in [ "auto", "required" ] and not isinstance(data["tool_choice"], dict): - raise NotImplementedError( + raise ValueError( 
f'Invalid value for `tool_choice`: {data["tool_choice"]}! '\ 'Only named tools, "none", "auto" or "required" '\ 'are supported.' From 8c5ed35102a969717dbd329c698368316da2440e Mon Sep 17 00:00:00 2001 From: Sergio Paniego Blanco Date: Wed, 23 Jul 2025 10:18:54 +0200 Subject: [PATCH 51/63] Fixed typo in profiling logs (#21441) Signed-off-by: qizixi --- vllm/multimodal/profiling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index cdec783ef9cf..7f6fb47a21fa 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -275,7 +275,7 @@ def get_mm_max_tokens( if total_mm_tokens > seq_len: logger.warning_once( "The sequence length (%d) is smaller than the pre-defined" - " wosrt-case total number of multimodal tokens (%d). " + " worst-case total number of multimodal tokens (%d). " "This may cause certain multi-modal inputs to fail during " "inference. To avoid this, you should increase " "`max_model_len` or reduce `mm_counts`.", From 5d860d963aa9bc644ed96a553474b2b39c2eed88 Mon Sep 17 00:00:00 2001 From: Michael Yao Date: Wed, 23 Jul 2025 16:23:20 +0800 Subject: [PATCH 52/63] [Docs] Fix bullets and grammars in tool_calling.md (#21440) Signed-off-by: windsonsea Signed-off-by: qizixi --- docs/features/tool_calling.md | 66 +++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 8d89dc4c8d8e..ce74683a1620 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -1,10 +1,10 @@ # Tool Calling -vLLM currently supports named function calling, as well as the `auto`, `required` (as of `vllm>=0.8.3`) and `none` options for the `tool_choice` field in the chat completion API. +vLLM currently supports named function calling, as well as the `auto`, `required` (as of `vllm>=0.8.3`), and `none` options for the `tool_choice` field in the chat completion API. ## Quickstart -Start the server with tool calling enabled. This example uses Meta's Llama 3.1 8B model, so we need to use the llama3 tool calling chat template from the vLLM examples directory: +Start the server with tool calling enabled. This example uses Meta's Llama 3.1 8B model, so we need to use the `llama3_json` tool calling chat template from the vLLM examples directory: ```bash vllm serve meta-llama/Llama-3.1-8B-Instruct \ @@ -13,7 +13,7 @@ vllm serve meta-llama/Llama-3.1-8B-Instruct \ --chat-template examples/tool_chat_template_llama3.1_json.jinja ``` -Next, make a request to the model that should result in it using the available tools: +Next, make a request that triggers the model to use the available tools: ??? code @@ -73,7 +73,7 @@ This example demonstrates: You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the guided decoding backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests. -Remember that it's the callers responsibility to: +Remember that it's the caller's responsibility to: 1. Define appropriate tools in the request 2. 
Include relevant context in the chat messages @@ -84,7 +84,7 @@ For more advanced usage, including parallel tool calls and different model-speci ## Named Function Calling vLLM supports named function calling in the chat completion API by default. It does so using Outlines through guided decoding, so this is -enabled by default, and will work with any supported model. You are guaranteed a validly-parsable function call - not a +enabled by default and will work with any supported model. You are guaranteed a validly-parsable function call - not a high-quality one. vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. @@ -95,7 +95,7 @@ specify the `name` of one of the tools in the `tool_choice` parameter of the cha ## Required Function Calling -vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The required guided decoding features (JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](../usage/v1_guide.md#features) for the V1 engine. +vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The guided decoding features for `tool_choice='required'` (such as JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](../usage/v1_guide.md#features) for the V1 engine. When tool_choice='required' is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter. @@ -109,16 +109,16 @@ However, when `tool_choice='none'` is specified, vLLM includes tool definitions To enable this feature, you should set the following flags: -* `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. tells vLLM that you want to enable the model to generate its own tool calls when it +* `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. It tells vLLM that you want to enable the model to generate its own tool calls when it deems appropriate. * `--tool-call-parser` -- select the tool parser to use (listed below). Additional tool parsers -will continue to be added in the future, and also can register your own tool parsers in the `--tool-parser-plugin`. +will continue to be added in the future. You can also register your own tool parsers in the `--tool-parser-plugin`. * `--tool-parser-plugin` -- **optional** tool parser plugin used to register user defined tool parsers into vllm, the registered tool parser name can be specified in `--tool-call-parser`. -* `--chat-template` -- **optional** for auto tool choice. the path to the chat template which handles `tool`-role messages and `assistant`-role messages +* `--chat-template` -- **optional** for auto tool choice. It's the path to the chat template which handles `tool`-role messages and `assistant`-role messages that contain previously generated tool calls. 
Hermes, Mistral and Llama models have tool-compatible chat templates in their `tokenizer_config.json` files, but you can specify a custom template. This argument can be set to `tool_use` if your model has a tool use-specific chat template configured in the `tokenizer_config.json`. In this case, it will be used per the `transformers` specification. More on this [here](https://huggingface.co/docs/transformers/en/chat_templating#why-do-some-models-have-multiple-templates) -from HuggingFace; and you can find an example of this in a `tokenizer_config.json` [here](https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B/blob/main/tokenizer_config.json) +from HuggingFace; and you can find an example of this in a `tokenizer_config.json` [here](https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B/blob/main/tokenizer_config.json). If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template! @@ -130,7 +130,7 @@ All Nous Research Hermes-series models newer than Hermes 2 Pro should be support * `NousResearch/Hermes-2-Theta-*` * `NousResearch/Hermes-3-*` -_Note that the Hermes 2 **Theta** models are known to have degraded tool call quality & capabilities due to the merge +_Note that the Hermes 2 **Theta** models are known to have degraded tool call quality and capabilities due to the merge step in their creation_. Flags: `--tool-call-parser hermes` @@ -146,13 +146,13 @@ Known issues: 1. Mistral 7B struggles to generate parallel tool calls correctly. 2. Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is -much shorter than what vLLM generates. Since an exception is thrown when this condition -is not met, the following additional chat templates are provided: + much shorter than what vLLM generates. Since an exception is thrown when this condition + is not met, the following additional chat templates are provided: -* - this is the "official" Mistral chat template, but tweaked so that -it works with vLLM's tool call IDs (provided `tool_call_id` fields are truncated to the last 9 digits) -* - this is a "better" version that adds a tool-use system prompt -when tools are provided, that results in much better reliability when working with parallel tool calling. + * - this is the "official" Mistral chat template, but tweaked so that + it works with vLLM's tool call IDs (provided `tool_call_id` fields are truncated to the last 9 digits) + * - this is a "better" version that adds a tool-use system prompt + when tools are provided, that results in much better reliability when working with parallel tool calling. Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja` @@ -166,17 +166,17 @@ All Llama 3.1, 3.2 and 4 models should be supported. * `meta-llama/Llama-3.2-*` * `meta-llama/Llama-4-*` -The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) introduced by the Llama-3.2 models, see the `pythonic` tool parser below. As for llama 4 models, it is recommended to use the `llama4_pythonic` tool parser. +The tool calling that is supported is the [JSON-based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). 
For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) introduced by the Llama-3.2 models, see the `pythonic` tool parser below. As for Llama 4 models, it is recommended to use the `llama4_pythonic` tool parser. Other tool calling formats like the built in python tool calling or custom tool calling are not supported. Known issues: -1. Parallel tool calls are not supported for llama 3, but it is supported in llama 4 models. -2. The model can generate parameters with a wrong format, such as generating +1. Parallel tool calls are not supported for Llama 3, but it is supported in Llama 4 models. +2. The model can generate parameters in an incorrect format, such as generating an array serialized as string instead of an array. -VLLM provides two JSON based chat templates for Llama 3.1 and 3.2: +VLLM provides two JSON-based chat templates for Llama 3.1 and 3.2: * - this is the "official" chat template for the Llama 3.1 models, but tweaked so that it works better with vLLM. @@ -185,7 +185,8 @@ images. Recommended flags: `--tool-call-parser llama3_json --chat-template {see_above}` -VLLM also provides a pythonic and JSON based chat template for Llama 4, but pythonic tool calling is recommended: +VLLM also provides a pythonic and JSON-based chat template for Llama 4, but pythonic tool calling is recommended: + * - this is based on the [official chat template](https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/) for the Llama 4 models. For Llama 4 model, use `--tool-call-parser llama4_pythonic --chat-template examples/tool_chat_template_llama4_pythonic.jinja`. @@ -196,21 +197,21 @@ Supported models: * `ibm-granite/granite-3.0-8b-instruct` -Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja` + Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja` -: this is a modified chat template from the original on Huggingface. Parallel function calls are supported. + : this is a modified chat template from the original on Hugging Face. Parallel function calls are supported. * `ibm-granite/granite-3.1-8b-instruct` -Recommended flags: `--tool-call-parser granite` + Recommended flags: `--tool-call-parser granite` -The chat template from Huggingface can be used directly. Parallel function calls are supported. + The chat template from Huggingface can be used directly. Parallel function calls are supported. * `ibm-granite/granite-20b-functioncalling` -Recommended flags: `--tool-call-parser granite-20b-fc --chat-template examples/tool_chat_template_granite_20b_fc.jinja` + Recommended flags: `--tool-call-parser granite-20b-fc --chat-template examples/tool_chat_template_granite_20b_fc.jinja` -: this is a modified chat template from the original on Huggingface, which is not vLLM compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported. + : this is a modified chat template from the original on Hugging Face, which is not vLLM-compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported. 
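For example, a Granite 3.0 server started with the recommended flags above can be exercised with the OpenAI Python client. This is a minimal sketch rather than part of the upstream docs: the base URL assumes the default `vllm serve` port, the API key is a placeholder, and the `get_weather` tool is illustrative.

```python
from openai import OpenAI

# vLLM's OpenAI-compatible server listens on port 8000 by default.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

response = client.chat.completions.create(
    model="ibm-granite/granite-3.0-8b-instruct",
    messages=[{"role": "user", "content": "What is the weather in Boston?"}],
    tools=tools,
    tool_choice="auto",  # let the tool parser decide whether to emit a call
)

# With --enable-auto-tool-choice and a matching --tool-call-parser, any call
# the model makes comes back as structured tool_calls instead of raw text.
print(response.choices[0].message.tool_calls)
```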
### InternLM Models (`internlm`) @@ -246,10 +247,12 @@ The xLAM tool parser is designed to support models that generate tool calls in v Parallel function calls are supported, and the parser can effectively separate text content from tool calls. Supported models: + * Salesforce Llama-xLAM models: `Salesforce/Llama-xLAM-2-8B-fc-r`, `Salesforce/Llama-xLAM-2-70B-fc-r` * Qwen-xLAM models: `Salesforce/xLAM-1B-fc-r`, `Salesforce/xLAM-3B-fc-r`, `Salesforce/Qwen-xLAM-32B-fc-r` Flags: + * For Llama-based xLAM models: `--tool-call-parser xlam --chat-template examples/tool_chat_template_xlam_llama.jinja` * For Qwen-based xLAM models: `--tool-call-parser xlam --chat-template examples/tool_chat_template_xlam_qwen.jinja` @@ -292,9 +295,10 @@ Flags: `--tool-call-parser kimi_k2` Supported models: -* `tencent/Hunyuan-A13B-Instruct` (chat template already included huggingface model file.) +* `tencent/Hunyuan-A13B-Instruct` (The chat template is already included in the Hugging Face model files.) Flags: + * For non-reasoning: `--tool-call-parser hunyuan_a13b` * For reasoning: `--tool-call-parser hunyuan_a13b --reasoning-parser hunyuan_a13b --enable_reasoning` @@ -325,9 +329,9 @@ Example supported models: Flags: `--tool-call-parser pythonic --chat-template {see_above}` !!! warning - Llama's smaller models frequently fail to emit tool calls in the correct format. Your mileage may vary. + Llama's smaller models frequently fail to emit tool calls in the correct format. Results may vary depending on the model. -## How to write a tool parser plugin +## How to Write a Tool Parser Plugin A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in . From c8ea28a5d8b089b92880a2107cfdd997204b013f Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Wed, 23 Jul 2025 01:39:25 -0700 Subject: [PATCH 53/63] [Sampler] Introduce logprobs mode for logging (#21398) Signed-off-by: Lu Fang Signed-off-by: qizixi --- tests/v1/sample/test_logprobs.py | 43 ++++++++++++++++++++++++++++++ vllm/config.py | 9 +++++++ vllm/engine/arg_utils.py | 18 ++++++++----- vllm/v1/sample/sampler.py | 17 ++++++++++-- vllm/v1/sample/tpu/sampler.py | 1 + vllm/v1/worker/gpu_input_batch.py | 4 +-- vllm/v1/worker/gpu_model_runner.py | 2 +- 7 files changed, 82 insertions(+), 12 deletions(-) diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 4f1f340a4ccb..680e2ce98bb2 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -12,6 +12,7 @@ assert_incr_detok_str_matches_non_incr_detok_str, compute_correct_cumulative_logprob, get_test_batch) from vllm import SamplingParams +from vllm.config import LogprobsMode from ...conftest import HfRunner, VllmRunner @@ -426,3 +427,45 @@ def test_zero_logprobs(vllm_model, example_prompts, # prompt token assert prompt_logprobs is not None assert len(prompt_token_ids) == len(prompt_logprobs) + + +@pytest.mark.parametrize( + "logprobs_mode", + ["raw_logprobs", "raw_logits", "processed_logprobs", "processed_logits"]) +def test_logprobs_mode(logprobs_mode: LogprobsMode, + monkeypatch: pytest.MonkeyPatch): + """Test with LLM engine with different logprobs_mode. + For logprobs, we should have non-positive values. + For logits, we should expect at least one positive values. 
+ """ + from vllm import LLM + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + llm = LLM( + "facebook/opt-125m", + max_logprobs=5, + enable_prefix_caching=False, + # 2 other llms alive during whole session + gpu_memory_utilization=0.05, + max_model_len=16, + logprobs_mode=logprobs_mode) + vllm_sampling_params = SamplingParams(logprobs=1) + results = llm.generate(["Hello world"], + sampling_params=vllm_sampling_params) + + total_token_with_logprobs = 0 + positive_values = 0 + for output in results[0].outputs: + for logprobs in output.logprobs: + for token_id in logprobs: + logprob = logprobs[token_id] + if "logprobs" in logprobs_mode: + assert logprob.logprob <= 0 + if logprob.logprob > 0: + positive_values = positive_values + 1 + total_token_with_logprobs = total_token_with_logprobs + 1 + assert total_token_with_logprobs >= len(results[0].outputs) + if "logits" in logprobs_mode: + assert positive_values > 0 + del llm diff --git a/vllm/config.py b/vllm/config.py index 6623a48f839a..223c1968c275 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -219,6 +219,8 @@ def is_init_field(cls: ConfigType, name: str) -> bool: TokenizerMode = Literal["auto", "slow", "mistral", "custom"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] +LogprobsMode = Literal["raw_logprobs", "raw_logits", "processed_logprobs", + "processed_logits"] @config @@ -316,6 +318,13 @@ class ModelConfig: """Maximum number of log probabilities to return when `logprobs` is specified in `SamplingParams`. The default value comes the default for the OpenAI Chat Completions API.""" + logprobs_mode: LogprobsMode = "raw_logprobs" + """Indicates the content returned in the logprobs and prompt_logprobs. + Supported mode: + 1) raw_logprobs, 2) processed_logprobs, 3) raw_logits, 4) processed_logits. + Raw means the values before applying logit processors, like bad words. + Processed means the values after applying such processors. + """ disable_sliding_window: bool = False """Whether to disable sliding window. If True, we will disable the sliding window functionality of the model, capping to sliding window size. 
If the diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 1e3d46a8d96e..4a5efd40241d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -26,13 +26,13 @@ DetailedTraceModules, Device, DeviceConfig, DistributedExecutorBackend, GuidedDecodingBackend, GuidedDecodingBackendV1, HfOverrides, KVEventsConfig, - KVTransferConfig, LoadConfig, LoadFormat, LoRAConfig, - ModelConfig, ModelDType, ModelImpl, MultiModalConfig, - ObservabilityConfig, ParallelConfig, PoolerConfig, - PrefixCachingHashAlgo, PromptAdapterConfig, - SchedulerConfig, SchedulerPolicy, SpeculativeConfig, - TaskOption, TokenizerMode, VllmConfig, get_attr_docs, - get_field) + KVTransferConfig, LoadConfig, LoadFormat, + LogprobsMode, LoRAConfig, ModelConfig, ModelDType, + ModelImpl, MultiModalConfig, ObservabilityConfig, + ParallelConfig, PoolerConfig, PrefixCachingHashAlgo, + PromptAdapterConfig, SchedulerConfig, SchedulerPolicy, + SpeculativeConfig, TaskOption, TokenizerMode, + VllmConfig, get_attr_docs, get_field) from vllm.logger import init_logger from vllm.platforms import CpuArchEnum, current_platform from vllm.plugins import load_general_plugins @@ -324,6 +324,7 @@ class EngineArgs: SchedulerConfig.long_prefill_token_threshold max_num_seqs: Optional[int] = SchedulerConfig.max_num_seqs max_logprobs: int = ModelConfig.max_logprobs + logprobs_mode: LogprobsMode = ModelConfig.logprobs_mode disable_log_stats: bool = False revision: Optional[str] = ModelConfig.revision code_revision: Optional[str] = ModelConfig.code_revision @@ -490,6 +491,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: **model_kwargs["max_seq_len_to_capture"]) model_group.add_argument("--max-logprobs", **model_kwargs["max_logprobs"]) + model_group.add_argument("--logprobs-mode", + **model_kwargs["logprobs_mode"]) model_group.add_argument("--disable-sliding-window", **model_kwargs["disable_sliding_window"]) model_group.add_argument("--disable-cascade-attn", @@ -892,6 +895,7 @@ def create_model_config(self) -> ModelConfig: enforce_eager=self.enforce_eager, max_seq_len_to_capture=self.max_seq_len_to_capture, max_logprobs=self.max_logprobs, + logprobs_mode=self.logprobs_mode, disable_sliding_window=self.disable_sliding_window, disable_cascade_attn=self.disable_cascade_attn, skip_tokenizer_init=self.skip_tokenizer_init, diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index fa078e628768..82f51298f1b5 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -5,6 +5,7 @@ import torch import torch.nn as nn +from vllm.config import LogprobsMode from vllm.utils import is_pin_memory_available from vllm.v1.outputs import LogprobsTensors, SamplerOutput from vllm.v1.sample.metadata import SamplingMetadata @@ -18,10 +19,11 @@ class Sampler(nn.Module): - def __init__(self): + def __init__(self, logprobs_mode: LogprobsMode = "raw_logprobs"): super().__init__() self.topk_topp_sampler = TopKTopPSampler() self.pin_memory = is_pin_memory_available() + self.logprobs_mode = logprobs_mode def forward( self, @@ -36,7 +38,10 @@ def forward( # See https://vllm-dev.slack.com/archives/C07UUL8E61Z/p1735907856007919 # noqa: E501 num_logprobs = sampling_metadata.max_num_logprobs if num_logprobs is not None: - raw_logprobs = self.compute_logprobs(logits) + if self.logprobs_mode == "raw_logprobs": + raw_logprobs = self.compute_logprobs(logits) + elif self.logprobs_mode == "raw_logits": + raw_logprobs = logits.clone() # Use float32 for the logits. 
logits = logits.to(torch.float32) @@ -51,6 +56,14 @@ def forward( # Apply penalties (e.g., min_tokens, freq_penalties). logits = self.apply_penalties(logits, sampling_metadata) + + # Get the process logprobs or logits. + if num_logprobs is not None: + if self.logprobs_mode == "processed_logprobs": + raw_logprobs = self.compute_logprobs(logits) + elif self.logprobs_mode == "processed_logits": + raw_logprobs = logits.clone() + # Sample the next token. sampled = self.sample(logits, sampling_metadata) # Convert sampled token ids to int64 (long) type to ensure compatibility diff --git a/vllm/v1/sample/tpu/sampler.py b/vllm/v1/sample/tpu/sampler.py index 1056eb1d7b7f..2c9f4892bc24 100644 --- a/vllm/v1/sample/tpu/sampler.py +++ b/vllm/v1/sample/tpu/sampler.py @@ -15,6 +15,7 @@ class Sampler(nn.Module): def __init__(self): + # TODO(houseroad): Add support for logprobs_mode. super().__init__() self.topk_topp_sampler = TopKTopPSampler() diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index a242c7fca5ef..c63041600f38 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -389,7 +389,7 @@ def add_request( def remove_request(self, req_id: str) -> Optional[int]: """This method must always be followed by a call to condense(). - + Args: req_id: request to remove @@ -590,7 +590,7 @@ def condense(self) -> None: def refresh_metadata(self): """Apply batch updates, reset input batch at end of step - + * Apply batch add/remove/permute to logits procs' states * If batch state is modified, update sampling metadata """ diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d98bcf06cc70..aa9e49eb90dc 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -151,7 +151,7 @@ def __init__( self.encoder_cache_size = encoder_cache_size # Sampler - self.sampler = Sampler() + self.sampler = Sampler(logprobs_mode=self.model_config.logprobs_mode) self.eplb_state: Optional[EplbState] = None """ From 54e6fce076e3525db7c345e1feb4f6769d09ba2a Mon Sep 17 00:00:00 2001 From: Yu Chin Fabian Lim Date: Wed, 23 Jul 2025 04:40:27 -0400 Subject: [PATCH 54/63] Mamba V2 Test not Asserting Failures. (#21379) Signed-off-by: Yu Chin Fabian Lim Signed-off-by: qizixi --- tests/kernels/mamba/test_mamba_mixer2.py | 9 ++++---- tests/kernels/mamba/test_mamba_ssm_ssd.py | 26 +++++++++++++++++------ 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/tests/kernels/mamba/test_mamba_mixer2.py b/tests/kernels/mamba/test_mamba_mixer2.py index f5c6a18614ff..16c310726ad1 100644 --- a/tests/kernels/mamba/test_mamba_mixer2.py +++ b/tests/kernels/mamba/test_mamba_mixer2.py @@ -119,7 +119,8 @@ def mixer2_gated_norm_tensor_parallel( gate_states[..., local_rank * N:(local_rank + 1) * N], ) ref_output = mixer_single_gpu(hidden_states, gate_states) - torch.allclose(output, - ref_output[..., local_rank * N:(local_rank + 1) * N], - atol=1e-3, - rtol=1e-3) + torch.testing.assert_close(output, + ref_output[..., + local_rank * N:(local_rank + 1) * N], + atol=5e-3, + rtol=1e-3) diff --git a/tests/kernels/mamba/test_mamba_ssm_ssd.py b/tests/kernels/mamba/test_mamba_ssm_ssd.py index 6a3f21ba543f..00c1a2911d7d 100644 --- a/tests/kernels/mamba/test_mamba_ssm_ssd.py +++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py @@ -193,6 +193,13 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, # this tests the kernels on a single example (no batching) + # TODO: the bfloat16 case requires higher thresholds. 
To be investigated + + if itype == torch.bfloat16: + atol, rtol = 5e-2, 5e-2 + else: + atol, rtol = 8e-3, 5e-3 + # set seed batch_size = 1 # batch_size # ssd_minimal_discrete requires chunk_size divide seqlen @@ -216,14 +223,14 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, return_final_states=True) # just test the last in sequence - torch.allclose(Y[:, -1], Y_min[:, -1], atol=1e-3, rtol=1e-3) + torch.testing.assert_close(Y[:, -1], Y_min[:, -1], atol=atol, rtol=rtol) # just test the last head # NOTE, in the kernel we always cast states to fp32 - torch.allclose(final_state[:, -1], - final_state_min[:, -1].to(torch.float32), - atol=1e-3, - rtol=1e-3) + torch.testing.assert_close(final_state[:, -1], + final_state_min[:, -1].to(torch.float32), + atol=atol, + rtol=rtol) @pytest.mark.parametrize("itype", [torch.float32, torch.float16]) @@ -263,6 +270,13 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, seqlen, chunk_size, num_examples, cases = seq_len_chunk_size_cases + # TODO: the irregular chunk size cases have some issues and require higher + # tolerance. This is to be invesigated + if chunk_size not in {8, 256}: + atol, rtol = 5e-1, 5e-1 + else: + atol, rtol = 5e-3, 5e-3 + # hold state during the cutting process so we know if an # example has been exhausted and needs to cycle last_taken: dict = {} # map: eg -> pointer to last taken sample @@ -300,7 +314,7 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, # just test one dim and dstate Y_eg = Y[0, cu_seqlens[i]:cu_seqlens[i + 1], 0, 0] Y_min_eg = Y_min[i][:, 0, 0] - torch.allclose(Y_eg, Y_min_eg, atol=1e-3, rtol=1e-3) + torch.testing.assert_close(Y_eg, Y_min_eg, atol=atol, rtol=rtol) # update states states = new_states From 507f651198a65dd83bb4d2ff01e1173ad44596e0 Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Wed, 23 Jul 2025 01:41:43 -0700 Subject: [PATCH 55/63] [Misc] fixed nvfp4_moe test failures due to invalid kwargs (#21246) Signed-off-by: Yang Chen Signed-off-by: qizixi --- tests/kernels/moe/test_nvfp4_moe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/kernels/moe/test_nvfp4_moe.py b/tests/kernels/moe/test_nvfp4_moe.py index 3f5412e75821..3ff385360299 100644 --- a/tests/kernels/moe/test_nvfp4_moe.py +++ b/tests/kernels/moe/test_nvfp4_moe.py @@ -93,11 +93,11 @@ def test_cutlass_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int, a1_gscale=a1_gs, w1_fp4=w1_q, w1_blockscale=w1_blockscale, - w1_alphas=(1 / w1_gs), + g1_alphas=(1 / w1_gs), a2_gscale=a2_gs, w2_fp4=w2_q, w2_blockscale=w2_blockscale, - w2_alphas=(1 / w2_gs), + g2_alphas=(1 / w2_gs), topk_weights=topk_weights, topk_ids=topk_ids, m=m, From d698fd2cf8937b61532b86863aa9cd1913f5da9a Mon Sep 17 00:00:00 2001 From: Michael Yao Date: Wed, 23 Jul 2025 18:37:25 +0800 Subject: [PATCH 56/63] [Docs] Clean up v1/metrics.md (#21449) Signed-off-by: windsonsea Signed-off-by: qizixi --- docs/design/v1/metrics.md | 165 +++++++++++++++++--------------------- 1 file changed, 73 insertions(+), 92 deletions(-) diff --git a/docs/design/v1/metrics.md b/docs/design/v1/metrics.md index e23308f2637c..52cd320dd4e1 100644 --- a/docs/design/v1/metrics.md +++ b/docs/design/v1/metrics.md @@ -5,17 +5,17 @@ Ensure the v1 LLM Engine exposes a superset of the metrics available in v0. ## Objectives - Achieve parity of metrics between v0 and v1. -- The priority use case is accessing these metrics via Prometheus as this is what we expect to be used in production environments. 
-- Logging support - i.e. printing metrics to the info log - is provided for more ad-hoc testing, debugging, development, and exploratory use cases. +- The priority use case is accessing these metrics via Prometheus, as this is what we expect to be used in production environments. +- Logging support (i.e. printing metrics to the info log) is provided for more ad-hoc testing, debugging, development, and exploratory use cases. ## Background Metrics in vLLM can be categorized as follows: -1. Server-level metrics: these are global metrics that track the state and performance of the LLM engine. These are typically exposed as Gauges or Counters in Prometheus. -2. Request-level metrics: these are metrics that track the characteristics - e.g. size and timing - of individual requests. These are typically exposed as Histograms in Prometheus, and are often the SLO that an SRE monitoring vLLM will be tracking. +1. Server-level metrics: Global metrics that track the state and performance of the LLM engine. These are typically exposed as Gauges or Counters in Prometheus. +2. Request-level metrics: Metrics that track the characteristics (e.g. size and timing) of individual requests. These are typically exposed as Histograms in Prometheus and are often the SLOs that an SRE monitoring vLLM will be tracking. -The mental model is that the "Server-level Metrics" explain why the "Request-level Metrics" are what they are. +The mental model is that server-level metrics help explain the values of request-level metrics. ### v0 Metrics @@ -65,20 +65,20 @@ vLLM also provides [a reference example](../../examples/online_serving/prometheu The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important: -- `vllm:e2e_request_latency_seconds_bucket` - End to end request latency measured in seconds -- `vllm:prompt_tokens_total` - Prompt Tokens -- `vllm:generation_tokens_total` - Generation Tokens -- `vllm:time_per_output_token_seconds` - Inter token latency (Time Per Output Token, TPOT) in second. +- `vllm:e2e_request_latency_seconds_bucket` - End to end request latency measured in seconds. +- `vllm:prompt_tokens_total` - Prompt tokens. +- `vllm:generation_tokens_total` - Generation tokens. +- `vllm:time_per_output_token_seconds` - Inter-token latency (Time Per Output Token, TPOT) in seconds. - `vllm:time_to_first_token_seconds` - Time to First Token (TTFT) latency in seconds. -- `vllm:num_requests_running` (also, `_swapped` and `_waiting`) - Number of requests in RUNNING, WAITING, and SWAPPED state +- `vllm:num_requests_running` (also, `_swapped` and `_waiting`) - Number of requests in the RUNNING, WAITING, and SWAPPED states. - `vllm:gpu_cache_usage_perc` - Percentage of used cache blocks by vLLM. -- `vllm:request_prompt_tokens` - Request prompt length -- `vllm:request_generation_tokens` - request generation length -- `vllm:request_success_total` - Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached -- `vllm:request_queue_time_seconds` - Queue Time -- `vllm:request_prefill_time_seconds` - Requests Prefill Time -- `vllm:request_decode_time_seconds` - Requests Decode Time -- `vllm:request_max_num_generation_tokens` - Max Generation Token in Sequence Group +- `vllm:request_prompt_tokens` - Request prompt length. +- `vllm:request_generation_tokens` - Request generation length. 
+- `vllm:request_success_total` - Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached. +- `vllm:request_queue_time_seconds` - Queue time. +- `vllm:request_prefill_time_seconds` - Requests prefill time. +- `vllm:request_decode_time_seconds` - Requests decode time. +- `vllm:request_max_num_generation_tokens` - Max generation tokens in a sequence group. See [the PR which added this Dashboard](gh-pr:2316) for interesting and useful background on the choices made here. @@ -103,7 +103,7 @@ In v0, metrics are collected in the engine core process and we use multi-process ### Built in Python/Process Metrics -The following metrics are supported by default by `prometheus_client`, but the are not exposed with multiprocess mode is used: +The following metrics are supported by default by `prometheus_client`, but they are not exposed when multi-process mode is used: - `python_gc_objects_collected_total` - `python_gc_objects_uncollectable_total` @@ -158,6 +158,7 @@ In v1, we wish to move computation and overhead out of the engine core process to minimize the time between each forward pass. The overall idea of V1 EngineCore design is: + - EngineCore is the inner loop. Performance is most critical here - AsyncLLM is the outer loop. This is overlapped with GPU execution (ideally), so this is where any "overheads" should be if @@ -178,7 +179,7 @@ time" (`time.time()`) to calculate intervals as the former is unaffected by system clock changes (e.g. from NTP). It's also important to note that monotonic clocks differ between -processes - each process has its own reference. point. So it is +processes - each process has its own reference point. So it is meaningless to compare monotonic timestamps from different processes. Therefore, in order to calculate an interval, we must compare two @@ -343,14 +344,15 @@ vllm:time_to_first_token_seconds_bucket{le="0.1",model_name="meta-llama/Llama-3. vllm:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 140.0 ``` -Note - the choice of histogram buckets to be most useful to users -across a broad set of use cases is not straightforward and will -require refinement over time. +!!! note + The choice of histogram buckets to be most useful to users + across a broad set of use cases is not straightforward and will + require refinement over time. ### Cache Config Info -`prometheus_client` has support for [Info -metrics](https://prometheus.github.io/client_python/instrumenting/info/) +`prometheus_client` has support for +[Info metrics](https://prometheus.github.io/client_python/instrumenting/info/) which are equivalent to a `Gauge` whose value is permanently set to 1, but exposes interesting key/value pair information via labels. This is used for information about an instance that does not change - so it @@ -363,14 +365,11 @@ We use this concept for the `vllm:cache_config_info` metric: # HELP vllm:cache_config_info Information of the LLMEngine CacheConfig # TYPE vllm:cache_config_info gauge vllm:cache_config_info{block_size="16",cache_dtype="auto",calculate_kv_scales="False",cpu_offload_gb="0",enable_prefix_caching="False",gpu_memory_utilization="0.9",...} 1.0 - ``` -However, `prometheus_client` has [never supported Info metrics in -multiprocessing -mode](https://github.com/prometheus/client_python/pull/300) - for -[unclear -reasons](gh-pr:7279#discussion_r1710417152). 
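
The workaround is sketched below (a rough example only; it assumes a `prometheus_client` release that accepts `multiprocess_mode="mostrecent"` and reuses the label values from the sample output above):

```python
# Illustrative only: expose static config as an "info"-style metric by
# pinning a labelled Gauge to 1; the information lives in the labels.
from prometheus_client import Gauge

cache_config_labels = {"block_size": "16", "cache_dtype": "auto"}

cache_config_info = Gauge(
    name="vllm:cache_config_info",
    documentation="Information of the LLMEngine CacheConfig",
    labelnames=list(cache_config_labels.keys()),
    # Take the most recently written value across processes.
    multiprocess_mode="mostrecent",
)
cache_config_info.labels(**cache_config_labels).set(1)
```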
We +However, `prometheus_client` has +[never supported Info metrics in multiprocessing mode](https://github.com/prometheus/client_python/pull/300) - +for [unclear reasons](gh-pr:7279#discussion_r1710417152). We simply use a `Gauge` metric set to 1 and `multiprocess_mode="mostrecent"` instead. @@ -395,11 +394,9 @@ distinguish between per-adapter counts. This should be revisited. Note that `multiprocess_mode="livemostrecent"` is used - the most recent metric is used, but only from currently running processes. -This was added in - and there is -[at least one known -user](https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/54). If -we revisit this design and deprecate the old metric, we should reduce +This was added in and there is +[at least one known user](https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/54). +If we revisit this design and deprecate the old metric, we should reduce the need for a significant deprecation period by making the change in v0 also and asking this project to move to the new metric. @@ -442,23 +439,20 @@ suddenly (from their perspective) when it is removed, even if there is an equivalent metric for them to use. As an example, see how `vllm:avg_prompt_throughput_toks_per_s` was -[deprecated](gh-pr:2764) (with a -comment in the code), -[removed](gh-pr:12383), and then -[noticed by a -user](gh-issue:13218). +[deprecated](gh-pr:2764) (with a comment in the code), +[removed](gh-pr:12383), and then [noticed by a user](gh-issue:13218). In general: -1) We should be cautious about deprecating metrics, especially since +1. We should be cautious about deprecating metrics, especially since it can be hard to predict the user impact. -2) We should include a prominent deprecation notice in the help string +2. We should include a prominent deprecation notice in the help string that is included in the `/metrics' output. -3) We should list deprecated metrics in user-facing documentation and +3. We should list deprecated metrics in user-facing documentation and release notes. -4) We should consider hiding deprecated metrics behind a CLI argument - in order to give administrators [an escape - hatch](https://kubernetes.io/docs/concepts/cluster-administration/system-metrics/#show-hidden-metrics) +4. We should consider hiding deprecated metrics behind a CLI argument + in order to give administrators + [an escape hatch](https://kubernetes.io/docs/concepts/cluster-administration/system-metrics/#show-hidden-metrics) for some time before deleting them. See the [deprecation policy](../../contributing/deprecation_policy.md) for @@ -474,7 +468,7 @@ removed. The `vllm:time_in_queue_requests` Histogram metric was added by and its calculation is: -``` +```python self.metrics.first_scheduled_time = now self.metrics.time_in_queue = now - self.metrics.arrival_time ``` @@ -482,7 +476,7 @@ The `vllm:time_in_queue_requests` Histogram metric was added by Two weeks later, added `vllm:request_queue_time_seconds` leaving us with: -``` +```python if seq_group.is_finished(): if (seq_group.metrics.first_scheduled_time is not None and seq_group.metrics.first_token_time is not None): @@ -517,8 +511,7 @@ cache to complete other requests), we swap kv cache blocks out to CPU memory. This is also known as "KV cache offloading" and is configured with `--swap-space` and `--preemption-mode`. -In v0, [vLLM has long supported beam -search](gh-issue:6226). The +In v0, [vLLM has long supported beam search](gh-issue:6226). 
The SequenceGroup encapsulated the idea of N Sequences which all shared the same prompt kv blocks. This enabled KV cache block sharing between requests, and copy-on-write to do branching. CPU @@ -530,9 +523,8 @@ option than CPU swapping since blocks can be evicted slowly on demand and the part of the prompt that was evicted can be recomputed. SequenceGroup was removed in V1, although a replacement will be -required for "parallel sampling" (`n>1`). [Beam search was moved out of -the core (in -V0)](gh-issue:8306). There was a +required for "parallel sampling" (`n>1`). +[Beam search was moved out of the core (in V0)](gh-issue:8306). There was a lot of complex code for a very uncommon feature. In V1, with prefix caching being better (zero over head) and therefore @@ -547,18 +539,18 @@ Some v0 metrics are only relevant in the context of "parallel sampling". This is where the `n` parameter in a request is used to request multiple completions from the same prompt. -As part of adding parallel sampling support in we should +As part of adding parallel sampling support in , we should also add these metrics. - `vllm:request_params_n` (Histogram) -Observes the value of the 'n' parameter of every finished request. + Observes the value of the 'n' parameter of every finished request. - `vllm:request_max_num_generation_tokens` (Histogram) -Observes the maximum output length of all sequences in every finished -sequence group. In the absence of parallel sampling, this is -equivalent to `vllm:request_generation_tokens`. + Observes the maximum output length of all sequences in every finished + sequence group. In the absence of parallel sampling, this is + equivalent to `vllm:request_generation_tokens`. ### Speculative Decoding @@ -576,26 +568,23 @@ There is a PR under review () to add "prompt lookup (ngram)" seculative decoding to v1. Other techniques will follow. We should revisit the v0 metrics in this context. -Note - we should probably expose acceptance rate as separate accepted -and draft counters, like we do for prefix caching hit rate. Efficiency -likely also needs similar treatment. +!!! note + We should probably expose acceptance rate as separate accepted + and draft counters, like we do for prefix caching hit rate. Efficiency + likely also needs similar treatment. ### Autoscaling and Load-balancing A common use case for our metrics is to support automated scaling of vLLM instances. -For related discussion from the [Kubernetes Serving Working -Group](https://github.com/kubernetes/community/tree/master/wg-serving), +For related discussion from the +[Kubernetes Serving Working Group](https://github.com/kubernetes/community/tree/master/wg-serving), see: -- [Standardizing Large Model Server Metrics in - Kubernetes](https://docs.google.com/document/d/1SpSp1E6moa4HSrJnS4x3NpLuj88sMXr2tbofKlzTZpk) -- [Benchmarking LLM Workloads for Performance Evaluation and - Autoscaling in - Kubernetes](https://docs.google.com/document/d/1k4Q4X14hW4vftElIuYGDu5KDe2LtV1XammoG-Xi3bbQ) -- [Inference - Perf](https://github.com/kubernetes-sigs/wg-serving/tree/main/proposals/013-inference-perf) +- [Standardizing Large Model Server Metrics in Kubernetes](https://docs.google.com/document/d/1SpSp1E6moa4HSrJnS4x3NpLuj88sMXr2tbofKlzTZpk) +- [Benchmarking LLM Workloads for Performance Evaluation and Autoscaling in Kubernetes](https://docs.google.com/document/d/1k4Q4X14hW4vftElIuYGDu5KDe2LtV1XammoG-Xi3bbQ) +- [Inference Perf](https://github.com/kubernetes-sigs/wg-serving/tree/main/proposals/013-inference-perf) - and . 
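
To make this concrete, the sketch below derives a simple saturation signal from the metrics listed earlier; the endpoint URL and the thresholds are illustrative assumptions, not recommended values:

```python
# Illustrative autoscaling signal: scrape a vLLM instance's Prometheus
# endpoint and flag saturation from queue depth and KV cache usage.
import requests
from prometheus_client.parser import text_string_to_metric_families


def is_saturated(metrics_url: str = "http://localhost:8000/metrics") -> bool:
    text = requests.get(metrics_url, timeout=5).text
    waiting = 0.0
    kv_cache_usage = 0.0
    for family in text_string_to_metric_families(text):
        for sample in family.samples:
            if sample.name == "vllm:num_requests_waiting":
                waiting += sample.value
            elif sample.name == "vllm:gpu_cache_usage_perc":
                kv_cache_usage = max(kv_cache_usage, sample.value)
    # Hypothetical policy: scale up once requests start queueing or the
    # KV cache is nearly full.
    return waiting > 0 or kv_cache_usage > 0.9
```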
This is a non-trivial topic. Consider this comment from Rob: @@ -619,19 +608,16 @@ should judge an instance as approaching saturation: Our approach to naming metrics probably deserves to be revisited: -1. The use of colons in metric names seems contrary to ["colons are - reserved for user defined recording - rules"](https://prometheus.io/docs/concepts/data_model/#metric-names-and-labels) +1. The use of colons in metric names seems contrary to + ["colons are reserved for user defined recording rules"](https://prometheus.io/docs/concepts/data_model/#metric-names-and-labels). 2. Most of our metrics follow the convention of ending with units, but not all do. 3. Some of our metric names end with `_total`: -``` -If there is a suffix of `_total` on the metric name, it will be removed. When -exposing the time series for counter, a `_total` suffix will be added. This is -for compatibility between OpenMetrics and the Prometheus text format, as OpenMetrics -requires the `_total` suffix. -``` + If there is a suffix of `_total` on the metric name, it will be removed. When + exposing the time series for counter, a `_total` suffix will be added. This is + for compatibility between OpenMetrics and the Prometheus text format, as OpenMetrics + requires the `_total` suffix. ### Adding More Metrics @@ -642,8 +628,7 @@ There is no shortage of ideas for new metrics: - Proposals arising from specific use cases, like the Kubernetes auto-scaling topic above - Proposals that might arise out of standardisation efforts like - [OpenTelemetry Semantic Conventions for Gen - AI](https://github.com/open-telemetry/semantic-conventions/tree/main/docs/gen-ai). + [OpenTelemetry Semantic Conventions for Gen AI](https://github.com/open-telemetry/semantic-conventions/tree/main/docs/gen-ai). We should be cautious in our approach to adding new metrics. While metrics are often relatively straightforward to add: @@ -668,18 +653,14 @@ fall under the more general heading of "Observability". v0 has support for OpenTelemetry tracing: - Added by -- Configured with `--oltp-traces-endpoint` and - `--collect-detailed-traces` -- [OpenTelemetry blog - post](https://opentelemetry.io/blog/2024/llm-observability/) +- Configured with `--oltp-traces-endpoint` and `--collect-detailed-traces` +- [OpenTelemetry blog post](https://opentelemetry.io/blog/2024/llm-observability/) - [User-facing docs](../../examples/online_serving/opentelemetry.md) -- [Blog - post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f) -- [IBM product - docs](https://www.ibm.com/docs/en/instana-observability/current?topic=mgaa-monitoring-large-language-models-llms-vllm-public-preview) +- [Blog post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f) +- [IBM product docs](https://www.ibm.com/docs/en/instana-observability/current?topic=mgaa-monitoring-large-language-models-llms-vllm-public-preview) -OpenTelemetry has a [Gen AI Working -Group](https://github.com/open-telemetry/community/blob/main/projects/gen-ai.md). +OpenTelemetry has a +[Gen AI Working Group](https://github.com/open-telemetry/community/blob/main/projects/gen-ai.md). Since metrics is a big enough topic on its own, we are going to tackle the topic of tracing in v1 separately. @@ -698,7 +679,7 @@ These metrics are only enabled when OpenTelemetry tracing is enabled and if `--collect-detailed-traces=all/model/worker` is used. 
The documentation for this option states: -> collect detailed traces for the specified "modules. This involves +> collect detailed traces for the specified modules. This involves > use of possibly costly and or blocking operations and hence might > have a performance impact. From 169cb7847eec32c2549dce98eaf129b725da2270 Mon Sep 17 00:00:00 2001 From: Asher Date: Wed, 23 Jul 2025 18:54:08 +0800 Subject: [PATCH 57/63] [Model] add Hunyuan V1 Dense Model support. (#21368) Signed-off-by: Asher Zhang Signed-off-by: qizixi --- docs/models/supported_models.md | 1 + tests/models/registry.py | 2 + .../{hunyuan_v1_moe.py => hunyuan_v1.py} | 70 ++++++++++++++----- vllm/model_executor/models/registry.py | 3 +- 4 files changed, 57 insertions(+), 19 deletions(-) rename vllm/model_executor/models/{hunyuan_v1_moe.py => hunyuan_v1.py} (95%) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index bbb52f035c72..c8b6c6c86120 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -363,6 +363,7 @@ th { | `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | ✅︎ | | `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | | | `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ | +| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | | ✅︎ | | `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ | | `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index 1eb7f7b9d829..84ca0bc60003 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -199,6 +199,8 @@ def check_available_online( trust_remote_code=True), "HunYuanMoEV1ForCausalLM": _HfExamplesInfo("tencent/Hunyuan-A13B-Instruct", trust_remote_code=True), + "HunYuanDenseV1ForCausalLM":_HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124", + trust_remote_code=True), "InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b", trust_remote_code=True), "InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b", diff --git a/vllm/model_executor/models/hunyuan_v1_moe.py b/vllm/model_executor/models/hunyuan_v1.py similarity index 95% rename from vllm/model_executor/models/hunyuan_v1_moe.py rename to vllm/model_executor/models/hunyuan_v1.py index b3baec98b0fc..fbba849a76f2 100644 --- a/vllm/model_executor/models/hunyuan_v1_moe.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -61,6 +61,19 @@ make_layers) +def _is_moe(config: PretrainedConfig) -> bool: + num_experts = getattr(config, "num_experts", None) + if isinstance(num_experts, int): + return num_experts > 1 + if isinstance(num_experts, list) and num_experts: + # Ensure all elements are integers before calling max. + if all(isinstance(e, int) for e in num_experts): + return max(num_experts) > 1 + else: + return False + return False + + def _get_cla_factor(config: PretrainedConfig) -> int: if not getattr(config, "use_cla", False): return 1 @@ -140,8 +153,8 @@ def __init__( # the KV heads across multiple tensor parallel GPUs. 
assert tp_size % self.total_num_kv_heads == 0 self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - # MistralConfig has an optional head_dim introduced by Mistral-Nemo - if hasattr(config, "head_dim"): + + if hasattr(config, "head_dim") and config.head_dim: self.head_dim = config.head_dim elif hasattr(config, "attention_head_dim"): self.head_dim = config.attention_head_dim @@ -490,12 +503,23 @@ def __init__( else: raise RuntimeError(f"Unsupported attention type: {attention_type}") - self.mlp = HunYuanSparseMoeBlock( - config=config, - quant_config=quant_config, - layer_id=layer_id, - prefix=f"{prefix}.mlp", - ) + if _is_moe(config): + self.mlp = HunYuanSparseMoeBlock( + config=config, + quant_config=quant_config, + layer_id=layer_id, + prefix=f"{prefix}.mlp", + ) + else: + self.mlp = HunYuanMLP( + hidden_size=self.hidden_size, + intermediate_size=self.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + bias=getattr(config, "mlp_bias", False), + prefix=f"{prefix}.mlp", + ) + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = RMSNorm(config.hidden_size, @@ -642,15 +666,17 @@ def _split_qkv_weight(self, qkv: torch.Tensor): return torch.concat((q, k, v)) def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: - - # Params for weights, fp8 weight scales, fp8 activation scales - # (param_name, weight_name, expert_id, shard_id) - return FusedMoE.make_expert_params_mapping( - ckpt_gate_proj_name="gate_proj", - ckpt_down_proj_name="down_proj", - ckpt_up_proj_name="up_proj", - num_experts=self.config.num_experts, - ) + if _is_moe(self.config): + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + return FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.num_experts, + ) + else: + return [] def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): cla_factor = _get_cla_factor(self.config) @@ -815,7 +841,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): return loaded_params -class HunYuanMoEV1ForCausalLM(nn.Module, SupportsLoRA): +class HunYuanV1Base(nn.Module, SupportsLoRA): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -901,3 +927,11 @@ def load_weights(self, weights: Iterable[tuple[str, def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: return self.model.get_expert_mapping() + + +class HunYuanDenseV1ForCausalLM(HunYuanV1Base): + pass + + +class HunYuanMoEV1ForCausalLM(HunYuanV1Base): + pass diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 100532943c2b..fafb6a704383 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -79,7 +79,8 @@ "GraniteMoeSharedForCausalLM": ("granitemoeshared", "GraniteMoeSharedForCausalLM"), # noqa: E501 "GritLM": ("gritlm", "GritLM"), "Grok1ModelForCausalLM": ("grok1", "Grok1ForCausalLM"), - "HunYuanMoEV1ForCausalLM": ("hunyuan_v1_moe", "HunYuanMoEV1ForCausalLM"), + "HunYuanMoEV1ForCausalLM": ("hunyuan_v1", "HunYuanMoEV1ForCausalLM"), + "HunYuanDenseV1ForCausalLM": ("hunyuan_v1", "HunYuanDenseV1ForCausalLM"), "InternLMForCausalLM": ("llama", "LlamaForCausalLM"), "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"), "InternLM2VEForCausalLM": ("internlm2_ve", "InternLM2VEForCausalLM"), From 94a63583ffc8dedb5779cfeb6814f1ae61d3c3b7 Mon Sep 
17 00:00:00 2001 From: Cyrus Leung Date: Wed, 23 Jul 2025 20:53:26 +0800 Subject: [PATCH 58/63] [V1] Check all pooling tasks during profiling (#21299) Signed-off-by: DarkLight1337 Signed-off-by: qizixi --- vllm/sequence.py | 7 ++++ vllm/v1/worker/gpu_model_runner.py | 63 +++++++++++++++++++----------- 2 files changed, 47 insertions(+), 23 deletions(-) diff --git a/vllm/sequence.py b/vllm/sequence.py index 99208fbad65f..1f507add0d91 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -1173,6 +1173,10 @@ class PoolingSequenceGroupOutput( # The actual type is in SequenceGroup.pooled_data data: Any + def get_data_nbytes(self) -> int: + data: torch.Tensor = self.data + return data.nbytes + def __repr__(self) -> str: return f"PoolingSequenceGroupOutput(data={self.data}" @@ -1234,6 +1238,9 @@ class PoolerOutput( """The output from a pooling operation in the pooling model.""" outputs: list[PoolingSequenceGroupOutput] + def get_data_nbytes(self) -> int: + return sum(o.get_data_nbytes() for o in self.outputs) + def __getitem__(self, idx: int) -> PoolingSequenceGroupOutput: return self.outputs[idx] diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index aa9e49eb90dc..6c91ba2324a2 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -41,7 +41,7 @@ from vllm.multimodal.utils import group_mm_inputs_by_modality from vllm.pooling_params import PoolingParams, PoolingTask from vllm.sampling_params import SamplingType -from vllm.sequence import IntermediateTensors +from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, GiB_bytes, LazyLoader, check_use_alibi, get_dtype_size, is_pin_memory_available, round_up) @@ -1819,7 +1819,7 @@ def load_model(self, eep_scale_up: bool = False) -> None: old_global_expert_indices = None rank_mapping = None - with DeviceMemoryProfiler() as m: # noqa: SIM117 + with DeviceMemoryProfiler() as m: time_before_load = time.perf_counter() model_loader = get_model_loader(self.load_config) if not hasattr(self, "model"): @@ -2215,12 +2215,11 @@ def _dummy_sampler_run( ) return sampler_output - @torch.inference_mode() - def _dummy_pooler_run( + def _dummy_pooler_run_task( self, hidden_states: torch.Tensor, - ) -> torch.Tensor: - + task: PoolingTask, + ) -> PoolerOutput: num_tokens = hidden_states.shape[0] max_num_reqs = self.scheduler_config.max_num_seqs num_reqs = min(num_tokens, max_num_reqs) @@ -2232,37 +2231,55 @@ def _dummy_pooler_run( hidden_states_list = list( torch.split(hidden_states, num_scheduled_tokens_list)) - req_num_tokens = num_tokens // num_reqs - model = cast(VllmModelForPooling, self.model) - dummy_task = self.get_supported_pooling_tasks()[0] - dummy_pooling_params = PoolingParams(task=dummy_task) + dummy_prompt_lens = torch.tensor( + [h.shape[0] for h in hidden_states_list], + device=self.device, + ) + dummy_token_ids = torch.zeros((num_reqs, req_num_tokens), + dtype=torch.int32, + device=self.device) - to_update = model.pooler.get_pooling_updates(dummy_task) + model = cast(VllmModelForPooling, self.model) + dummy_pooling_params = PoolingParams(task=task) + to_update = model.pooler.get_pooling_updates(task) to_update.apply(dummy_pooling_params) dummy_metadata = PoolingMetadata( - prompt_lens=torch.tensor([h.shape[0] for h in hidden_states_list], - device=self.device), - prompt_token_ids=torch.zeros((num_reqs, req_num_tokens), - dtype=torch.int32, - device=self.device), - pooling_params=[dummy_pooling_params] * 
num_reqs) + prompt_lens=dummy_prompt_lens, + prompt_token_ids=dummy_token_ids, + pooling_params=[dummy_pooling_params] * num_reqs, + ) try: - pooler_output = model.pooler(hidden_states=hidden_states_list, - pooling_metadata=dummy_metadata) + return model.pooler(hidden_states=hidden_states_list, + pooling_metadata=dummy_metadata) except RuntimeError as e: if 'out of memory' in str(e): raise RuntimeError( - "CUDA out of memory occurred when warming up pooler with " - f"{num_reqs} dummy requests. Please try lowering " - "`max_num_seqs` or `gpu_memory_utilization` when " + "CUDA out of memory occurred when warming up pooler " + f"({task=}) with {num_reqs} dummy requests. Please try " + "lowering `max_num_seqs` or `gpu_memory_utilization` when " "initializing the engine.") from e else: raise e - return pooler_output + + @torch.inference_mode() + def _dummy_pooler_run( + self, + hidden_states: torch.Tensor, + ) -> PoolerOutput: + # Find the task that has the largest output for subsequent steps + output_size = dict[PoolingTask, float]() + for task in self.get_supported_pooling_tasks(): + # Run a full batch with each task to ensure none of them OOMs + output = self._dummy_pooler_run_task(hidden_states, task) + output_size[task] = output.get_data_nbytes() + del output # Allow GC + + max_task = max(output_size.items(), key=lambda x: x[1])[0] + return self._dummy_pooler_run_task(hidden_states, max_task) def profile_run(self) -> None: # Profile with multimodal encoder & encoder cache. From a1fb3aadb5bb3693752ad0d1890636dfe89476e0 Mon Sep 17 00:00:00 2001 From: Tao He Date: Wed, 23 Jul 2025 21:34:37 +0800 Subject: [PATCH 59/63] [Bugfix][Qwen][DCA] fixes bug in dual-chunk-flash-attn backend for qwen 1m models. (#21364) Signed-off-by: Tao He Signed-off-by: qizixi --- vllm/attention/backends/dual_chunk_flash_attn.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/vllm/attention/backends/dual_chunk_flash_attn.py b/vllm/attention/backends/dual_chunk_flash_attn.py index e108646e7ffb..fa6f3f1b39cc 100644 --- a/vllm/attention/backends/dual_chunk_flash_attn.py +++ b/vllm/attention/backends/dual_chunk_flash_attn.py @@ -1055,7 +1055,6 @@ def _dual_chunk_flash_attn_prefill_func( v_states_intra, softmax_scale=softmax_scale, causal=True, - block_table=block_table, stage="intra", vertical_indices=vertical_buffer, slash_indices=slash_buffer, @@ -1070,7 +1069,6 @@ def _dual_chunk_flash_attn_prefill_func( v_states_intra, softmax_scale=softmax_scale, causal=True, - block_table=block_table, stage="intra", vertical_indices=intra_vertical_indices, slash_indices=intra_slash_indices, @@ -1085,7 +1083,6 @@ def _dual_chunk_flash_attn_prefill_func( v_states_succ, softmax_scale=softmax_scale, causal=False, - block_table=block_table, stage="succ", vertical_indices=succ_vertical_buffer, slash_indices=succ_slash_buffer, @@ -1100,7 +1097,6 @@ def _dual_chunk_flash_attn_prefill_func( v_states_succ, softmax_scale=softmax_scale, causal=False, - block_table=block_table, stage="succ", vertical_indices=succ_vertical_indices, slash_indices=succ_slash_indices, @@ -1115,7 +1111,6 @@ def _dual_chunk_flash_attn_prefill_func( v_states_inter, softmax_scale=softmax_scale, causal=False, - block_table=block_table, stage="inter", vertical_indices=inter_vertical_buffer, slash_indices=inter_slash_buffer, @@ -1130,7 +1125,6 @@ def _dual_chunk_flash_attn_prefill_func( v_states_inter, softmax_scale=softmax_scale, causal=False, - block_table=block_table, stage="inter", vertical_indices=inter_vertical_indices, slash_indices=inter_slash_indices, @@ 
-1151,7 +1145,6 @@ def _do_flash_attn( value_states: torch.Tensor, softmax_scale: float, causal: bool = True, - block_table: torch.Tensor = None, max_seqlen_k: Optional[int] = None, stage: str = "intra", vertical_indices: Optional[torch.Tensor] = None, @@ -1230,7 +1223,6 @@ def _do_flash_attn( device=query_states.device), max_seqlen_k=max_seqlen_k, causal=causal, - block_table=block_table.unsqueeze(0), return_softmax_lse=True, ) softmax_lse = softmax_lse.view(q_len, q_heads, 1).transpose(0, From 09c7ebb4933a10f33a0ad52fa96f03e3313e38c7 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 23 Jul 2025 15:49:25 +0100 Subject: [PATCH 60/63] [Tests] Add tests for headless internal DP LB (#21450) Signed-off-by: Nick Hill Signed-off-by: qizixi --- .buildkite/test-pipeline.yaml | 2 + .../openai/test_multi_api_servers.py | 123 +--- tests/v1/test_internal_lb_dp.py | 639 ++++++++++++++++++ tests/v1/test_utils.py | 124 ++++ 4 files changed, 768 insertions(+), 120 deletions(-) create mode 100644 tests/v1/test_internal_lb_dp.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 00608229b95e..c7378bf8ba5e 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -165,6 +165,7 @@ steps: - tests/examples/offline_inference/data_parallel.py - tests/v1/test_async_llm_dp.py - tests/v1/test_external_lb_dp.py + - tests/v1/test_internal_lb_dp.py - tests/v1/engine/test_engine_core_client.py commands: # test with tp=2 and external_dp=2 @@ -176,6 +177,7 @@ steps: - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py diff --git a/tests/v1/entrypoints/openai/test_multi_api_servers.py b/tests/v1/entrypoints/openai/test_multi_api_servers.py index e84b5e3095d0..f7c31b0c4377 100644 --- a/tests/v1/entrypoints/openai/test_multi_api_servers.py +++ b/tests/v1/entrypoints/openai/test_multi_api_servers.py @@ -2,136 +2,19 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import os -import re import openai # use the official client for correctness check import pytest import pytest_asyncio -import requests from tests.utils import RemoteOpenAIServer +from tests.v1.test_utils import check_request_balancing MODEL_NAME = "ibm-research/PowerMoE-3b" DP_SIZE = os.getenv("DP_SIZE", "1") -def get_prometheus_metrics( - server: RemoteOpenAIServer) -> dict[str, dict[str, float]]: - """Fetch and parse Prometheus metrics from the /metrics endpoint. - - Returns: - Dict mapping metric names to their values grouped by labels. 
- For example: {"vllm:request_success": { - "engine=0": 5.0, "engine=1": 3.0} - } - """ - try: - response = requests.get(server.url_for("metrics"), timeout=10) - response.raise_for_status() - - metrics: dict[str, dict[str, float]] = {} - - # Regex patterns for Prometheus metrics - metric_with_labels = re.compile( - r'^([a-zA-Z_:][a-zA-Z0-9_:]*)\{([^}]*)\}\s+([\d\.\-\+e]+)$') - metric_simple = re.compile( - r'^([a-zA-Z_:][a-zA-Z0-9_:]*)\s+([\d\.\-\+e]+)$') - - for line in response.text.split('\n'): - line = line.strip() - # Skip comments and empty lines - if not line or line.startswith('#'): - continue - - # Try to match metric with labels first - match = metric_with_labels.match(line) - if match: - metric_name, labels_part, value_str = match.groups() - try: - value = float(value_str) - if metric_name not in metrics: - metrics[metric_name] = {} - metrics[metric_name][f'{{{labels_part}}}'] = value - except ValueError: - continue - else: - # Try simple metric without labels - match = metric_simple.match(line) - if match: - metric_name, value_str = match.groups() - try: - value = float(value_str) - if metric_name not in metrics: - metrics[metric_name] = {} - metrics[metric_name][''] = value - except ValueError: - continue - - return metrics - except Exception as e: - pytest.fail(f"Failed to fetch Prometheus metrics: {e}") - return {} - - -def get_engine_request_counts( - metrics: dict[str, dict[str, float]]) -> dict[str, float]: - """Extract request counts per engine from Prometheus metrics. - - Returns: - Dict mapping engine indices to request counts. - For example: {"0": 15.0, "1": 12.0} - """ - engine_counts = {} - - # Look for request success metrics with engine labels - success_metrics = metrics.get("vllm:request_success_total", {}) - engine_pattern = re.compile(r'engine="([^"]*)"') - - for labels, count in success_metrics.items(): - # Extract engine ID from labels using regex - match = engine_pattern.search(labels) - if match: - engine_id = match.group(1) - if engine_id not in engine_counts: - engine_counts[engine_id] = 0.0 - engine_counts[engine_id] += count - - return engine_counts - - -def check_request_balancing(server: RemoteOpenAIServer): - """Check request balancing via Prometheus metrics if DP_SIZE > 1. - - Args: - server: The RemoteOpenAIServer instance - """ - dp_size = int(DP_SIZE) - if dp_size <= 1: - return - - # Get metrics after all requests are completed - metrics = get_prometheus_metrics(server) - engine_counts = get_engine_request_counts(metrics) - - # Check that multiple engines received requests - engines_with_requests = [ - engine for engine, count in engine_counts.items() if count > 0 - ] - assert len(engines_with_requests) == dp_size, ( - f"Expected requests to be distributed across multiple engines," - f" but only engine(s) {engines_with_requests} received " - f"requests. 
Engine counts: {engine_counts}") - - # Verify that the load is reasonably balanced - # (no engine should handle all requests) - total_requests = sum(engine_counts.values()) - - for count in engine_counts.values(): - assert count > total_requests // (dp_size + 1), ( - f"requests are imbalanced: {engine_counts}") - - @pytest.fixture(scope="module") def default_server_args(): return [ @@ -217,7 +100,7 @@ async def make_request(): assert all(completion is not None for completion in results) # Check request balancing via Prometheus metrics if DP_SIZE > 1 - check_request_balancing(server) + check_request_balancing(server, int(DP_SIZE)) @pytest.mark.asyncio @@ -295,4 +178,4 @@ async def make_streaming_request(): assert all(results), "Not all streaming requests completed successfully." # Check request balancing via Prometheus metrics if DP_SIZE > 1 - check_request_balancing(server) + check_request_balancing(server, int(DP_SIZE)) diff --git a/tests/v1/test_internal_lb_dp.py b/tests/v1/test_internal_lb_dp.py new file mode 100644 index 000000000000..9aef4d5821e8 --- /dev/null +++ b/tests/v1/test_internal_lb_dp.py @@ -0,0 +1,639 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import asyncio +import os +import threading +import time + +import openai # use the official client for correctness check +import pytest +import pytest_asyncio + +from tests.utils import RemoteOpenAIServer +from tests.v1.test_utils import check_request_balancing +from vllm.platforms import Platform + +MODEL_NAME = "ibm-research/PowerMoE-3b" + +# Number of data parallel ranks for multi-node internal LB testing +DP_SIZE = int(os.getenv("DP_SIZE", "2")) +# Default tensor parallel size to use +TP_SIZE = int(os.getenv("TP_SIZE", "1")) + +# Number of nodes to simulate +NUM_NODES = 2 + + +class MultinodeInternalLBServerManager: + """Manages multi-node data parallel vLLM server instances for internal + load balancer testing using --headless mode.""" + + def __init__(self, + model_name: str, + dp_size: int, + api_server_count: int, + base_server_args: list, + dp_per_node: int = 1, + tp_size: int = TP_SIZE): + self.model_name = model_name + self.dp_size = dp_size + self.dp_per_node = dp_per_node + self.tp_size = tp_size + self.api_server_count = api_server_count + self.base_server_args = base_server_args + self.servers: list[tuple[RemoteOpenAIServer, list[str]]] = [] + self.server_threads: list[threading.Thread] = [] + + def __enter__(self) -> list[tuple[RemoteOpenAIServer, list[str]]]: + """Start all server instances for multi-node internal LB mode.""" + for rank in range(0, self.dp_size, self.dp_per_node): + # Create server args for this specific rank + server_args = self.base_server_args.copy() + + if rank == 0: + # Head node - runs API server and first DP rank + server_args.extend([ + "--data-parallel-size", + str(self.dp_size), + "--data-parallel-size-local", + str(self.dp_per_node), + "--tensor-parallel-size", + str(self.tp_size), + "--port", + "8000", # Single endpoint for all requests + "--api-server-count", + str(self.api_server_count), + "--data-parallel-address", + "127.0.0.1", + "--data-parallel-rpc-port", + "13345", + ]) + else: + # Secondary nodes - run in headless mode + server_args.extend([ + "--headless", + "--data-parallel-size", + str(self.dp_size), + "--data-parallel-size-local", + str(self.dp_per_node), + "--data-parallel-start-rank", + str(rank), + "--tensor-parallel-size", + str(self.tp_size), + "--data-parallel-address", + "127.0.0.1", + 
"--data-parallel-rpc-port", + "13345", + ]) + + # Use a thread to start each server to allow parallel initialization + def start_server(r: int, sargs: list[str]): + gpus_per_node = self.tp_size * self.dp_per_node + try: + # Start the server + server = RemoteOpenAIServer( + self.model_name, + sargs, + auto_port=False, + env_dict={ + "CUDA_VISIBLE_DEVICES": + ",".join( + str(Platform.device_id_to_physical_device_id( + i)) for i in range(r, r + gpus_per_node)) + }) + server.__enter__() + if r == 0: + print( + f"Head node (rank {r}) started successfully with " + f"{self.api_server_count} API servers") + else: + print(f"Headless node (rank {r}) started successfully") + self.servers.append((server, sargs)) + except Exception as e: + print(f"Failed to start server rank {r}: {e}") + raise + + thread = threading.Thread(target=start_server, + args=(rank, server_args)) + thread.start() + + self.server_threads.append(thread) + + # Wait for all servers to start + for thread in self.server_threads: + thread.join() + + # Give servers additional time to fully initialize and coordinate + time.sleep(3) + + if len(self.servers) != self.dp_size // self.dp_per_node: + raise Exception("Servers failed to start") + + return self.servers + + def __exit__(self, exc_type, exc_val, exc_tb): + """Stop all server instances.""" + while self.servers: + try: + self.servers.pop()[0].__exit__(exc_type, exc_val, exc_tb) + except Exception as e: + print(f"Error stopping server: {e}") + + +class APIOnlyServerManager: + """Manages API-only server (Node 0) and headless engines server (Node 1) + for testing separated API server and engine configuration.""" + + def __init__(self, + model_name: str, + dp_size: int, + api_server_count: int, + base_server_args: list, + tp_size: int = TP_SIZE): + self.model_name = model_name + self.dp_size = dp_size + self.tp_size = tp_size + self.api_server_count = api_server_count + self.base_server_args = base_server_args + self.servers: list[tuple[RemoteOpenAIServer, list[str]]] = [] + self.server_threads: list[threading.Thread] = [] + + def __enter__(self) -> list[tuple[RemoteOpenAIServer, list[str]]]: + """Start API-only server and headless engines server.""" + + # Start API-only server (Node 0) - no engines, only API server + api_server_args = self.base_server_args.copy() + api_server_args.extend([ + "--data-parallel-size", + str(self.dp_size), + "--data-parallel-size-local", + "0", # No engines on this node + "--tensor-parallel-size", + str(self.tp_size), + "--port", + "8000", + "--api-server-count", + str(self.api_server_count), + "--data-parallel-address", + "127.0.0.1", + "--data-parallel-rpc-port", + "13345", + ]) + + # Start headless engines server (Node 1) - all engines, no API server + engines_server_args = self.base_server_args.copy() + engines_server_args.extend([ + "--headless", + "--data-parallel-size", + str(self.dp_size), + "--data-parallel-size-local", + str(self.dp_size), # All engines on this node + "--tensor-parallel-size", + str(self.tp_size), + "--data-parallel-address", + "127.0.0.1", + "--data-parallel-rpc-port", + "13345", + ]) + + # Use threads to start both servers in parallel + def start_api_server(): + try: + server = RemoteOpenAIServer( + self.model_name, + api_server_args, + auto_port=False, + env_dict={}) # No GPUs needed for API-only server + server.__enter__() + print(f"API-only server started successfully with " + f"{self.api_server_count} API servers") + self.servers.append((server, api_server_args)) + except Exception as e: + print(f"Failed to start API-only 
server: {e}") + raise + + def start_engines_server(): + try: + server = RemoteOpenAIServer( + self.model_name, + engines_server_args, + auto_port=False, + env_dict={ + "CUDA_VISIBLE_DEVICES": + ",".join( + str(Platform.device_id_to_physical_device_id(i)) + for i in range(self.dp_size * self.tp_size)) + }) + server.__enter__() + print(f"Headless engines server started successfully with " + f"{self.dp_size} engines") + self.servers.append((server, engines_server_args)) + except Exception as e: + print(f"Failed to start headless engines server: {e}") + raise + + # Start API server first + api_thread = threading.Thread(target=start_api_server) + api_thread.start() + self.server_threads.append(api_thread) + + # Start engines server second + engines_thread = threading.Thread(target=start_engines_server) + engines_thread.start() + self.server_threads.append(engines_thread) + + # Wait for both servers to start + for thread in self.server_threads: + thread.join() + + # Give servers additional time to fully initialize and coordinate + time.sleep(3) + + if len(self.servers) != 2: + raise Exception("Both servers failed to start") + + return self.servers + + def __exit__(self, exc_type, exc_val, exc_tb): + """Stop both server instances.""" + while self.servers: + try: + self.servers.pop()[0].__exit__(exc_type, exc_val, exc_tb) + except Exception as e: + print(f"Error stopping server: {e}") + + +@pytest.fixture(scope="module") +def default_server_args(): + return [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "2048", + "--max-num-seqs", + "128", + "--enforce-eager", + ] + + +@pytest.fixture(scope="module", params=[1, 4]) +def servers(request, default_server_args): + api_server_count = request.param + with MultinodeInternalLBServerManager(MODEL_NAME, DP_SIZE, + api_server_count, + default_server_args, + DP_SIZE // NUM_NODES, + TP_SIZE) as server_list: + yield server_list + + +@pytest.fixture(scope="module", params=[1, 4]) +def api_only_servers(request, default_server_args): + """Fixture for API-only server + headless engines configuration.""" + api_server_count = request.param + with APIOnlyServerManager(MODEL_NAME, DP_SIZE, api_server_count, + default_server_args, TP_SIZE) as server_list: + yield server_list + + +@pytest_asyncio.fixture +async def client(servers: list[tuple[RemoteOpenAIServer, list[str]]]): + # For internal LB, we only connect to the head node (rank 0) + # which provides the single API endpoint + head_server = servers[0][0] + async with head_server.get_async_client() as client: + yield client + + +@pytest_asyncio.fixture +async def api_only_client(api_only_servers: list[tuple[RemoteOpenAIServer, + list[str]]]): + """Client fixture for API-only server configuration.""" + # Connect to the API-only server (first server in the list) + api_server = api_only_servers[0][0] + async with api_server.get_async_client() as client: + yield client + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_multinode_dp_completion(client: openai.AsyncOpenAI, + servers: list[tuple[RemoteOpenAIServer, + list[str]]], + model_name: str) -> None: + + async def make_request(): + completion = await client.completions.create( + model=model_name, + prompt="Hello, my name is", + max_tokens=10, + temperature=1.0) + + assert completion.id is not None + assert completion.choices is not None and len(completion.choices) == 1 + + choice = completion.choices[0] + # The exact number of tokens can vary 
slightly with temperature=1.0, + # so we check for a reasonable minimum length. + assert len(choice.text) >= 1 + # Finish reason might not always be 'length' if the model finishes early + # or due to other reasons, especially with high temperature. + # So, we'll accept 'length' or 'stop'. + assert choice.finish_reason in ("length", "stop") + + # Token counts can also vary, so we check they are positive. + assert completion.usage.completion_tokens > 0 + assert completion.usage.prompt_tokens > 0 + assert completion.usage.total_tokens > 0 + return completion + + # Test single request + result = await make_request() + assert result is not None + print( + "Multi-node internal LB handled single completion request successfully" + ) + + await asyncio.sleep(0.5) + + # Send multiple requests - internal LB should distribute across DP ranks + num_requests = 50 + all_tasks = [make_request() for _ in range(num_requests)] + + results = await asyncio.gather(*all_tasks) + assert len(results) == num_requests + assert all(completion is not None for completion in results) + + await asyncio.sleep(0.5) + + # Second burst of requests + all_tasks = [make_request() for _ in range(num_requests)] + + results = await asyncio.gather(*all_tasks) + assert len(results) == num_requests + assert all(completion is not None for completion in results) + + _, server_args = servers[0] + api_server_count = ( + server_args.count('--api-server-count') + and server_args[server_args.index('--api-server-count') + 1] or 1) + print(f"Successfully completed multi-node internal LB test with " + f"{len(servers)} DP ranks (API server count: {api_server_count})") + + # Check request balancing via Prometheus metrics + head_server = servers[0][0] + check_request_balancing(head_server, DP_SIZE) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_multinode_dp_completion_streaming(client: openai.AsyncOpenAI, + servers: list[ + tuple[RemoteOpenAIServer, + list[str]]], + model_name: str) -> None: + prompt = "What is an LLM?" + + async def make_streaming_request(): + # Perform a non-streaming request to get the expected full output + single_completion = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + ) + single_output = single_completion.choices[0].text + + # Perform the streaming request + stream = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True) + chunks: list[str] = [] + finish_reason_count = 0 + last_chunk = None + async for chunk in stream: + chunks.append(chunk.choices[0].text) + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + last_chunk = chunk # Keep track of the last chunk + + # finish reason should only return in the last block for OpenAI API + assert finish_reason_count == 1, ( + "Finish reason should appear exactly once.") + assert last_chunk is not None, ( + "Stream should have yielded at least one chunk.") + assert last_chunk.choices[ + 0].finish_reason == "length", "Finish reason should be 'length'." + # Check that the combined text matches the non-streamed version. + assert "".join( + chunks + ) == single_output, "Streamed output should match non-streamed output." 
+ return True # Indicate success for this request + + # Test single streaming request + result = await make_streaming_request() + assert result is not None + print( + "Multi-node internal LB handled single streaming request successfully") + + await asyncio.sleep(0.5) + + # Send multiple streaming requests - internal LB should distribute across + # DP ranks + num_requests = 50 + all_tasks = [make_streaming_request() for _ in range(num_requests)] + + results = await asyncio.gather(*all_tasks) + assert len(results) == num_requests + assert all(results), "Not all streaming requests completed successfully." + + await asyncio.sleep(0.5) + + # Second burst of streaming requests + all_tasks = [make_streaming_request() for _ in range(num_requests)] + + results = await asyncio.gather(*all_tasks) + assert len(results) == num_requests + assert all(results), "Not all streaming requests completed successfully." + + _, server_args = servers[0] + api_server_count = ( + server_args.count('--api-server-count') + and server_args[server_args.index('--api-server-count') + 1] or 1) + print(f"Successfully completed multi-node internal LB streaming test with " + f"{len(servers)} DP ranks (API server count: {api_server_count})") + + # Check request balancing via Prometheus metrics + head_server = servers[0][0] + check_request_balancing(head_server, DP_SIZE) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_api_only_multinode_dp_completion( + api_only_client: openai.AsyncOpenAI, + api_only_servers: list[tuple[RemoteOpenAIServer, + list[str]]], model_name: str) -> None: + """Test API-only server with all engines on separate headless server.""" + + async def make_request(): + completion = await api_only_client.completions.create( + model=model_name, + prompt="Hello, my name is", + max_tokens=10, + temperature=1.0) + + assert completion.id is not None + assert completion.choices is not None and len(completion.choices) == 1 + + choice = completion.choices[0] + # The exact number of tokens can vary slightly with temperature=1.0, + # so we check for a reasonable minimum length. + assert len(choice.text) >= 1 + # Finish reason might not always be 'length' if the model finishes + # early or due to other reasons, especially with high temperature. + # So, we'll accept 'length' or 'stop'. + assert choice.finish_reason in ("length", "stop") + + # Token counts can also vary, so we check they are positive. 
+ assert completion.usage.completion_tokens > 0 + assert completion.usage.prompt_tokens > 0 + assert completion.usage.total_tokens > 0 + return completion + + # Test single request + result = await make_request() + assert result is not None + print("API-only server handled single completion request successfully") + + await asyncio.sleep(0.5) + + # Send multiple requests - should be distributed across engines on + # headless server + num_requests = 50 + all_tasks = [make_request() for _ in range(num_requests)] + + results = await asyncio.gather(*all_tasks) + assert len(results) == num_requests + assert all(completion is not None for completion in results) + + await asyncio.sleep(0.5) + + # Second burst of requests + all_tasks = [make_request() for _ in range(num_requests)] + + results = await asyncio.gather(*all_tasks) + assert len(results) == num_requests + assert all(completion is not None for completion in results) + + _, api_server_args = api_only_servers[0] + api_server_count = ( + api_server_args.count('--api-server-count') + and api_server_args[api_server_args.index('--api-server-count') + 1] + or 1) + print(f"Successfully completed API-only multi-node test with {DP_SIZE} " + f"engines on headless server (API server count: {api_server_count})") + + # Check request balancing via Prometheus metrics + api_server = api_only_servers[0][0] + check_request_balancing(api_server, DP_SIZE) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_api_only_multinode_dp_completion_streaming( + api_only_client: openai.AsyncOpenAI, + api_only_servers: list[tuple[RemoteOpenAIServer, + list[str]]], model_name: str) -> None: + """Test API-only server streaming with all engines on separate + headless server.""" + prompt = "What is an LLM?" + + async def make_streaming_request(): + # Perform a non-streaming request to get the expected full output + single_completion = await api_only_client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + ) + single_output = single_completion.choices[0].text + + # Perform the streaming request + stream = await api_only_client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True) + chunks: list[str] = [] + finish_reason_count = 0 + last_chunk = None + async for chunk in stream: + chunks.append(chunk.choices[0].text) + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + last_chunk = chunk # Keep track of the last chunk + + # finish reason should only return in the last block for OpenAI API + assert finish_reason_count == 1, ( + "Finish reason should appear exactly once.") + assert last_chunk is not None, ( + "Stream should have yielded at least one chunk.") + assert last_chunk.choices[ + 0].finish_reason == "length", "Finish reason should be 'length'." + # Check that the combined text matches the non-streamed version. + assert "".join( + chunks + ) == single_output, "Streamed output should match non-streamed output." 
+ return True # Indicate success for this request + + # Test single streaming request + result = await make_streaming_request() + assert result is not None + print("API-only server handled single streaming request successfully") + + await asyncio.sleep(0.5) + + # Send multiple streaming requests - should be distributed across engines + num_requests = 50 + all_tasks = [make_streaming_request() for _ in range(num_requests)] + + results = await asyncio.gather(*all_tasks) + assert len(results) == num_requests + assert all(results), "Not all streaming requests completed successfully." + + await asyncio.sleep(0.5) + + # Second burst of streaming requests + all_tasks = [make_streaming_request() for _ in range(num_requests)] + + results = await asyncio.gather(*all_tasks) + assert len(results) == num_requests + assert all(results), "Not all streaming requests completed successfully." + + _, api_server_args = api_only_servers[0] + api_server_count = ( + api_server_args.count('--api-server-count') + and api_server_args[api_server_args.index('--api-server-count') + 1] + or 1) + print(f"Successfully completed API-only streaming test with {DP_SIZE} " + f"engines on headless server (API server count: {api_server_count})") + + # Check request balancing via Prometheus metrics + api_server = api_only_servers[0][0] + check_request_balancing(api_server, DP_SIZE) diff --git a/tests/v1/test_utils.py b/tests/v1/test_utils.py index fd0e630ce178..0b892bd9dffd 100644 --- a/tests/v1/test_utils.py +++ b/tests/v1/test_utils.py @@ -1,8 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import re + +import pytest +import requests import torch +from tests.utils import RemoteOpenAIServer from vllm.v1.worker.utils import bind_kv_cache @@ -61,3 +66,122 @@ def test_bind_kv_cache_non_attention(): assert runner_kv_caches[0] is kv_cache['model.layers.20.attn'] assert runner_kv_caches[1] is kv_cache['model.layers.28.attn'] + + +# Prometheus metrics utilities for testing + + +def get_prometheus_metrics( + server: RemoteOpenAIServer) -> dict[str, dict[str, float]]: + """Fetch and parse Prometheus metrics from the /metrics endpoint. + + Returns: + Dict mapping metric names to their values grouped by labels. 
+ For example: {"vllm:request_success": { + "engine=0": 5.0, "engine=1": 3.0} + } + """ + try: + response = requests.get(server.url_for("metrics"), timeout=10) + response.raise_for_status() + + metrics: dict[str, dict[str, float]] = {} + + # Regex patterns for Prometheus metrics + metric_with_labels = re.compile( + r'^([a-zA-Z_:][a-zA-Z0-9_:]*)\{([^}]*)\}\s+([\d\.\-\+e]+)$') + metric_simple = re.compile( + r'^([a-zA-Z_:][a-zA-Z0-9_:]*)\s+([\d\.\-\+e]+)$') + + for line in response.text.split('\n'): + line = line.strip() + # Skip comments and empty lines + if not line or line.startswith('#'): + continue + + # Try to match metric with labels first + match = metric_with_labels.match(line) + if match: + metric_name, labels_part, value_str = match.groups() + try: + value = float(value_str) + if metric_name not in metrics: + metrics[metric_name] = {} + metrics[metric_name][f'{{{labels_part}}}'] = value + except ValueError: + continue + else: + # Try simple metric without labels + match = metric_simple.match(line) + if match: + metric_name, value_str = match.groups() + try: + value = float(value_str) + if metric_name not in metrics: + metrics[metric_name] = {} + metrics[metric_name][''] = value + except ValueError: + continue + + return metrics + except Exception as e: + pytest.fail(f"Failed to fetch Prometheus metrics: {e}") + return {} + + +def get_engine_request_counts( + metrics: dict[str, dict[str, float]]) -> dict[str, float]: + """Extract request counts per engine from Prometheus metrics. + + Returns: + Dict mapping engine indices to request counts. + For example: {"0": 15.0, "1": 12.0} + """ + engine_counts = {} + + # Look for request success metrics with engine labels + success_metrics = metrics.get("vllm:request_success_total", {}) + engine_pattern = re.compile(r'engine="([^"]*)"') + + for labels, count in success_metrics.items(): + # Extract engine ID from labels using regex + match = engine_pattern.search(labels) + if match: + engine_id = match.group(1) + if engine_id not in engine_counts: + engine_counts[engine_id] = 0.0 + engine_counts[engine_id] += count + + return engine_counts + + +def check_request_balancing(server: RemoteOpenAIServer, dp_size: int): + """Check request balancing via Prometheus metrics if dp_size > 1. + + Args: + server: The RemoteOpenAIServer instance + dp_size: Number of data parallel ranks + """ + if dp_size <= 1: + return + + # Get metrics after all requests are completed + metrics = get_prometheus_metrics(server) + engine_counts = get_engine_request_counts(metrics) + + # Check that multiple engines received requests + engines_with_requests = [ + engine for engine, count in engine_counts.items() if count > 0 + ] + assert len(engines_with_requests) == dp_size, ( + f"Expected requests to be distributed across multiple engines," + f" but only engine(s) {engines_with_requests} received " + f"requests. 
Engine counts: {engine_counts}") + + # Verify that the load is reasonably balanced + # (no engine should handle all requests) + total_requests = sum(engine_counts.values()) + + for count in engine_counts.values(): + assert count > total_requests // (dp_size + 1), ( + f"requests are imbalanced: {engine_counts}") From 5d0155d4f4b6ac6226ae6314a9d3b7f4046b27b8 Mon Sep 17 00:00:00 2001 From: Christian Pinto Date: Wed, 23 Jul 2025 19:00:23 +0100 Subject: [PATCH 61/63] [Core][Model] PrithviMAE Enablement on vLLM v1 engine (#20577) Signed-off-by: Christian Pinto Signed-off-by: qizixi --- .../prithvi_geospatial_mae.py | 245 ++++-------- requirements/test.in | 1 + requirements/test.txt | 374 +++++++++++++++++- .../multimodal/pooling/test_prithvi_mae.py | 63 +++ vllm/config.py | 6 +- vllm/engine/llm_engine.py | 10 +- vllm/model_executor/models/interfaces.py | 34 ++ .../models/prithvi_geospatial_mae.py | 74 +++- vllm/model_executor/models/registry.py | 13 +- vllm/multimodal/registry.py | 2 +- vllm/v1/engine/async_llm.py | 17 +- vllm/v1/engine/llm_engine.py | 13 +- vllm/v1/engine/output_processor.py | 18 +- vllm/v1/engine/processor.py | 12 +- vllm/v1/worker/gpu_model_runner.py | 60 +++ 15 files changed, 704 insertions(+), 238 deletions(-) create mode 100644 tests/models/multimodal/pooling/test_prithvi_mae.py diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py index 6dc03e85baa9..4fdc7a3cf709 100644 --- a/examples/offline_inference/prithvi_geospatial_mae.py +++ b/examples/offline_inference/prithvi_geospatial_mae.py @@ -1,122 +1,27 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -This is a demo script showing how to use the -PrithviGeospatialMAE model with vLLM -This script is based on: https://huggingface.co/ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11/blob/main/inference.py # noqa - -Target model weights: https://huggingface.co/ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11/resolve/main/Prithvi-EO-V2-300M-TL-Sen1Floods11.pt # noqa - -The requirements for running this script are: -- Installing [terratorch, albumentations, rasterio] in your python environment -- downloading the model weights in a 'model' folder local to the script - (temporary measure until the proper config.json file is uploaded to HF) -- download an input example image (India_900498_S2Hand.tif) and place it in - the same folder with the script (or specify with the --data_file argument) - -Run the example: -python prithvi_geospatial_mae.py - -""" # noqa: E501 - import argparse import datetime import os +import re from typing import Union import albumentations import numpy as np import rasterio -import regex as re import torch from einops import rearrange from terratorch.datamodules import Sen1Floods11NonGeoDataModule from vllm import LLM +torch.set_default_dtype(torch.float16) + NO_DATA = -9999 NO_DATA_FLOAT = 0.0001 OFFSET = 0 PERCENTILE = 99 -model_config = """{ - "architectures": ["PrithviGeoSpatialMAE"], - "num_classes": 0, - "pretrained_cfg": { - "task_args": { - "task": "SemanticSegmentationTask", - "model_factory": "EncoderDecoderFactory", - "loss": "ce", - "ignore_index": -1, - "lr": 0.001, - "freeze_backbone": false, - "freeze_decoder": false, - "plot_on_val": 10, - "optimizer": "AdamW", - "scheduler": "CosineAnnealingLR" - }, - "model_args": { - "backbone_pretrained": false, - "backbone": "prithvi_eo_v2_300_tl", - "decoder": "UperNetDecoder", - "decoder_channels": 256, - 
"decoder_scale_modules": true, - "num_classes": 2, - "rescale": true, - "backbone_bands": [ - "BLUE", - "GREEN", - "RED", - "NIR_NARROW", - "SWIR_1", - "SWIR_2" - ], - "head_dropout": 0.1, - "necks": [ - { - "name": "SelectIndices", - "indices": [ - 5, - 11, - 17, - 23 - ] - }, - { - "name": "ReshapeTokensToImage" - } - ] - }, - "optimizer_params" : { - "lr": 5.0e-05, - "betas": [0.9, 0.999], - "eps": [1.0e-08], - "weight_decay": 0.05, - "amsgrad": false, - "maximize": false, - "capturable": false, - "differentiable": false - }, - "scheduler_params" : { - "T_max": 50, - "eta_min": 0, - "last_epoch": -1, - "verbose": "deprecated" - } - }, - - - "torch_dtype": "float32" -} -""" - -# Temporarily creating the "config.json" for the model. -# This is going to disappear once the correct config.json is available on HF -with open( - os.path.join(os.path.dirname(__file__), "./model/config.json"), "w" -) as config_file: - config_file.write(model_config) - datamodule_config = { "bands": ["BLUE", "GREEN", "RED", "NIR_NARROW", "SWIR_1", "SWIR_2"], "batch_size": 16, @@ -138,28 +43,24 @@ class PrithviMAE: - def __init__(self): - print("Initializing PrithviMAE model") - self.llm = LLM( - model=os.path.join(os.path.dirname(__file__), "./model"), - skip_tokenizer_init=True, - dtype="float32", + def __init__(self, model): + self.model = LLM( + model=model, skip_tokenizer_init=True, dtype="float16", enforce_eager=True ) def run(self, input_data, location_coords): - print("################ Running inference on vLLM ##############") # merge the inputs into one data structure + if input_data is not None and input_data.dtype == torch.float32: + input_data = input_data.to(torch.float16) + input_data = input_data[0] + mm_data = { - "pixel_values": torch.empty(0) if input_data is None else input_data, - "location_coords": torch.empty(0) - if location_coords is None - else location_coords, + "pixel_values": input_data, + "location_coords": location_coords, } prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data} - - outputs = self.llm.encode(prompt, use_tqdm=False) - print("################ Inference done (it took seconds) ##############") + outputs = self.model.encode(prompt, use_tqdm=False) return outputs[0].outputs.data @@ -181,11 +82,12 @@ def process_channel_group(orig_img, channels): """ Args: orig_img: torch.Tensor representing original image (reference) - with shape = (bands, H, W). + with shape = (bands, H, W). channels: list of indices representing RGB channels. Returns: - torch.Tensor with shape (num_channels, height, width) for original image + torch.Tensor with shape (num_channels, height, width) + for original image """ orig_img = orig_img[channels, ...] @@ -260,10 +162,10 @@ def load_example( Args: file_paths: list of file paths . - mean: list containing mean values for each band in the images - in *file_paths*. - std: list containing std values for each band in the images - in *file_paths*. + mean: list containing mean values for each band in the + images in *file_paths*. + std: list containing std values for each band in the + images in *file_paths*. 
Returns: np.array containing created example @@ -308,7 +210,7 @@ def load_example( print(f"Could not extract timestamp for {file} ({e})") imgs = np.stack(imgs, axis=0) # num_frames, H, W, C - imgs = np.moveaxis(imgs, -1, 0).astype("float32") + imgs = np.moveaxis(imgs, -1, 0).astype("float32") # C, num_frames, H, W imgs = np.expand_dims(imgs, axis=0) # add batch di return imgs, temporal_coords, location_coords, metas @@ -332,8 +234,10 @@ def run_model( ) # Build sliding window + batch_size = 1 - batch = torch.tensor(input_data, device="cpu") + # batch = torch.tensor(input_data, device="cpu") + batch = torch.tensor(input_data) windows = batch.unfold(3, img_size, img_size).unfold(4, img_size, img_size) h1, w1 = windows.shape[3:5] windows = rearrange( @@ -344,18 +248,16 @@ def run_model( num_batches = windows.shape[0] // batch_size if windows.shape[0] > batch_size else 1 windows = torch.tensor_split(windows, num_batches, dim=0) - device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") - if temporal_coords: - temporal_coords = torch.tensor(temporal_coords, device=device).unsqueeze(0) + temporal_coords = torch.tensor(temporal_coords).unsqueeze(0) else: temporal_coords = None if location_coords: - location_coords = torch.tensor(location_coords[0], device=device).unsqueeze(0) + location_coords = torch.tensor(location_coords[0]).unsqueeze(0) else: location_coords = None - # Run model + # Run Prithvi-EO-V2-300M-TL-Sen1Floods11 pred_imgs = [] for x in windows: # Apply standardization @@ -363,15 +265,7 @@ def run_model( x = datamodule.aug(x)["image"] with torch.no_grad(): - x = x.to(device) pred = model.run(x, location_coords=location_coords) - if lightning_model: - pred_lightning = lightning_model( - x, temporal_coords=temporal_coords, location_coords=location_coords - ) - pred_lightning = pred_lightning.output.detach().cpu() - if not torch.equal(pred, pred_lightning): - print("Inference output is not equal") y_hat = pred.argmax(dim=1) y_hat = torch.nn.functional.interpolate( @@ -403,52 +297,18 @@ def run_model( return pred_imgs -def parse_args(): - parser = argparse.ArgumentParser("MAE run inference", add_help=False) - - parser.add_argument( - "--data_file", - type=str, - default="./India_900498_S2Hand.tif", - help="Path to the file.", - ) - parser.add_argument( - "--output_dir", - type=str, - default="output", - help="Path to the directory where to save outputs.", - ) - parser.add_argument( - "--input_indices", - default=[1, 2, 3, 8, 11, 12], - type=int, - nargs="+", - help="0-based indices of the six Prithvi channels to be selected from the " - "input. By default selects [1,2,3,8,11,12] for S2L1C data.", - ) - parser.add_argument( - "--rgb_outputs", - action="store_true", - help="If present, output files will only contain RGB channels. 
" - "Otherwise, all bands will be saved.", - ) - - def main( data_file: str, + model: str, output_dir: str, rgb_outputs: bool, input_indices: list[int] = None, ): os.makedirs(output_dir, exist_ok=True) - # Load model --------------------------------------------------------------- - - model_obj = PrithviMAE() + model_obj = PrithviMAE(model=model) datamodule = generate_datamodule() - img_size = 256 # Size of Sen1Floods11 - - # Loading data ------------------------------------------------------------- + img_size = 512 # Size of Sen1Floods11 input_data, temporal_coords, location_coords, meta_data = load_example( file_paths=[data_file], @@ -460,8 +320,6 @@ def main( if input_data.mean() > 1: input_data = input_data / 10000 # Convert to range 0-1 - # Running model ------------------------------------------------------------ - channels = [ datamodule_config["bands"].index(b) for b in ["RED", "GREEN", "BLUE"] ] # BGR -> RGB @@ -469,7 +327,6 @@ def main( pred = run_model( input_data, temporal_coords, location_coords, model_obj, datamodule, img_size ) - # Save pred meta_data.update(count=1, dtype="uint8", compress="lzw", nodata=0) pred_file = os.path.join( @@ -487,6 +344,7 @@ def main( orig_img=torch.Tensor(input_data[0, :, 0, ...]), channels=channels, ) + rgb_orig = rgb_orig.to(torch.float32) pred[pred == 0.0] = np.nan img_pred = rgb_orig * 0.7 + pred * 0.3 @@ -503,9 +361,10 @@ def main( # Save image rgb if rgb_outputs: + name_suffix = os.path.splitext(os.path.basename(data_file))[0] rgb_file = os.path.join( output_dir, - f"original_rgb_{os.path.splitext(os.path.basename(data_file))[0]}.tiff", + f"original_rgb_{name_suffix}.tiff", ) save_geotiff( image=_convert_np_uint8(rgb_orig), @@ -515,6 +374,42 @@ def main( if __name__ == "__main__": - args = parse_args() + parser = argparse.ArgumentParser("MAE run inference", add_help=False) + + parser.add_argument( + "--data_file", + type=str, + default="./India_900498_S2Hand.tif", + help="Path to the file.", + ) + parser.add_argument( + "--model", + type=str, + default="christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM", + help="Path to a checkpoint file to load from.", + ) + parser.add_argument( + "--output_dir", + type=str, + default="output", + help="Path to the directory where to save outputs.", + ) + parser.add_argument( + "--input_indices", + default=[1, 2, 3, 8, 11, 12], + type=int, + nargs="+", + help=""" + 0-based indices of the six Prithvi channels to be selected from the input. + By default selects [1,2,3,8,11,12] for S2L1C data. + """, + ) + parser.add_argument( + "--rgb_outputs", + action="store_true", + help="If present, output files will only contain RGB channels. 
" + "Otherwise, all bands will be saved.", + ) + args = parser.parse_args() main(**vars(args)) diff --git a/requirements/test.in b/requirements/test.in index c6c68891d6a6..9f66e2d6919a 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -54,3 +54,4 @@ runai-model-streamer==0.11.0 runai-model-streamer-s3==0.11.0 fastsafetensors>=0.1.10 pydantic>=2.10 # 2.9 leads to error on python 3.10 +terratorch==1.1rc2 # required for PrithviMAE test \ No newline at end of file diff --git a/requirements/test.txt b/requirements/test.txt index aadbab03f6fc..a2b230102d4e 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -6,6 +6,10 @@ accelerate==1.0.1 # via # lm-eval # peft +aenum==3.1.16 + # via lightly +affine==2.4.0 + # via rasterio aiohappyeyeballs==2.4.3 # via aiohttp aiohttp==3.10.11 @@ -21,8 +25,18 @@ aiosignal==1.3.1 # via # aiohttp # ray +albucore==0.0.16 + # via terratorch +albumentations==1.4.6 + # via terratorch +alembic==1.16.4 + # via mlflow annotated-types==0.7.0 # via pydantic +antlr4-python3-runtime==4.9.3 + # via + # hydra-core + # omegaconf anyio==4.6.2.post1 # via # httpx @@ -34,10 +48,12 @@ arrow==1.3.0 attrs==24.2.0 # via # aiohttp + # fiona # hypothesis # jsonlines # jsonschema # pytest-subtests + # rasterio # referencing audioread==3.0.1 # via librosa @@ -46,9 +62,13 @@ backoff==2.2.1 # -r requirements/test.in # schemathesis bitsandbytes==0.46.1 - # via -r requirements/test.in + # via + # -r requirements/test.in + # lightning black==24.10.0 # via datamodel-code-generator +blinker==1.9.0 + # via flask blobfile==3.0.0 # via -r requirements/test.in bm25s==0.2.13 @@ -64,11 +84,18 @@ bounded-pool-executor==0.0.3 buildkite-test-collector==0.1.9 # via -r requirements/test.in cachetools==5.5.2 - # via google-auth + # via + # google-auth + # mlflow-skinny certifi==2024.8.30 # via + # fiona # httpcore # httpx + # lightly + # pyogrio + # pyproj + # rasterio # requests cffi==1.17.1 # via soundfile @@ -79,11 +106,28 @@ charset-normalizer==3.4.0 click==8.1.7 # via # black + # click-plugins + # cligj + # fiona + # flask # jiwer + # mlflow-skinny # nltk + # rasterio # ray # schemathesis # typer + # uvicorn +click-plugins==1.1.1.2 + # via + # fiona + # rasterio +cligj==0.7.2 + # via + # fiona + # rasterio +cloudpickle==3.1.1 + # via mlflow-skinny colorama==0.4.6 # via # sacrebleu @@ -99,6 +143,8 @@ cupy-cuda12x==13.3.0 # via ray cycler==0.12.1 # via matplotlib +databricks-sdk==0.59.0 + # via mlflow-skinny datamodel-code-generator==0.26.3 # via -r requirements/test.in dataproperty==1.0.1 @@ -122,13 +168,21 @@ distlib==0.3.9 # via virtualenv dnspython==2.7.0 # via email-validator +docker==7.1.0 + # via mlflow docopt==0.6.2 # via num2words -einops==0.8.0 +docstring-parser==0.17.0 + # via jsonargparse +efficientnet-pytorch==0.7.1 + # via segmentation-models-pytorch +einops==0.8.1 # via # -r requirements/test.in # encodec # mamba-ssm + # terratorch + # torchgeo # vector-quantize-pytorch # vocos einx==0.3.0 @@ -141,6 +195,8 @@ eval-type-backport==0.2.2 # via mteb evaluate==0.4.3 # via lm-eval +fastapi==0.116.1 + # via mlflow-skinny fastparquet==2024.11.0 # via genai-perf fastrlock==0.8.2 @@ -156,6 +212,10 @@ filelock==3.16.1 # torch # transformers # virtualenv +fiona==1.10.1 + # via torchgeo +flask==3.1.1 + # via mlflow fonttools==4.54.1 # via matplotlib fqdn==1.5.1 @@ -173,6 +233,8 @@ fsspec==2024.9.0 # evaluate # fastparquet # huggingface-hub + # lightning + # pytorch-lightning # torch ftfy==6.3.1 # via open-clip-torch @@ -180,18 +242,41 @@ genai-perf==0.0.8 # via -r 
requirements/test.in genson==1.3.0 # via datamodel-code-generator +geopandas==1.0.1 + # via terratorch +gitdb==4.0.12 + # via gitpython +gitpython==3.1.44 + # via mlflow-skinny google-api-core==2.24.2 # via opencensus google-auth==2.40.2 - # via google-api-core + # via + # databricks-sdk + # google-api-core googleapis-common-protos==1.70.0 # via google-api-core +graphene==3.4.3 + # via mlflow graphql-core==3.2.6 - # via hypothesis-graphql + # via + # graphene + # graphql-relay + # hypothesis-graphql +graphql-relay==3.2.0 + # via graphene +greenlet==3.2.3 + # via sqlalchemy grpcio==1.71.0 # via ray +gunicorn==23.0.0 + # via mlflow h11==0.14.0 - # via httpcore + # via + # httpcore + # uvicorn +h5py==3.13.0 + # via terratorch harfile==0.3.0 # via schemathesis hf-xet==1.1.3 @@ -204,7 +289,7 @@ httpx==0.27.2 # via # -r requirements/test.in # schemathesis -huggingface-hub==0.33.0 +huggingface-hub==0.33.1 # via # -r requirements/test.in # accelerate @@ -212,13 +297,19 @@ huggingface-hub==0.33.0 # evaluate # open-clip-torch # peft + # segmentation-models-pytorch # sentence-transformers + # terratorch # timm # tokenizers # transformers # vocos humanize==4.11.0 # via runai-model-streamer +hydra-core==1.3.2 + # via + # lightly + # lightning hypothesis==6.131.0 # via # hypothesis-graphql @@ -236,6 +327,14 @@ idna==3.10 # jsonschema # requests # yarl +imageio==2.37.0 + # via scikit-image +importlib-metadata==8.7.0 + # via + # mlflow-skinny + # opentelemetry-api +importlib-resources==6.5.2 + # via typeshed-client inflect==5.6.2 # via datamodel-code-generator iniconfig==2.0.0 @@ -244,9 +343,13 @@ isoduration==20.11.0 # via jsonschema isort==5.13.2 # via datamodel-code-generator +itsdangerous==2.2.0 + # via flask jinja2==3.1.6 # via # datamodel-code-generator + # flask + # mlflow # torch jiwer==3.0.5 # via -r requirements/test.in @@ -259,6 +362,10 @@ joblib==1.4.2 # librosa # nltk # scikit-learn +jsonargparse==4.35.0 + # via + # lightning + # terratorch jsonlines==4.0.0 # via lm-eval jsonpointer==3.0.0 @@ -277,12 +384,33 @@ kaleido==0.2.1 # via genai-perf kiwisolver==1.4.7 # via matplotlib +kornia==0.8.1 + # via torchgeo +kornia-rs==0.1.9 + # via kornia lazy-loader==0.4 - # via librosa + # via + # librosa + # scikit-image libnacl==2.1.0 # via tensorizer librosa==0.10.2.post1 # via -r requirements/test.in +lightly==1.5.20 + # via + # terratorch + # torchgeo +lightly-utils==0.0.2 + # via lightly +lightning==2.5.1.post0 + # via + # terratorch + # torchgeo +lightning-utilities==0.14.3 + # via + # lightning + # pytorch-lightning + # torchmetrics llvmlite==0.44.0 # via numba lm-eval==0.4.8 @@ -291,16 +419,27 @@ lxml==5.3.0 # via # blobfile # sacrebleu +mako==1.3.10 + # via alembic mamba-ssm==2.2.4 # via -r requirements/test.in +markdown==3.8.2 + # via mlflow markdown-it-py==3.0.0 # via rich markupsafe==3.0.1 # via + # flask # jinja2 + # mako # werkzeug matplotlib==3.9.2 - # via -r requirements/test.in + # via + # -r requirements/test.in + # lightning + # mlflow + # pycocotools + # torchgeo mbstrdecoder==1.1.3 # via # dataproperty @@ -310,6 +449,10 @@ mdurl==0.1.2 # via markdown-it-py mistral-common==1.8.0 # via -r requirements/test.in +mlflow==2.22.0 + # via terratorch +mlflow-skinny==2.22.0 + # via mlflow more-itertools==10.5.0 # via lm-eval mpmath==1.3.0 @@ -328,10 +471,14 @@ multiprocess==0.70.16 # via # datasets # evaluate +munch==4.0.0 + # via pretrainedmodels mypy-extensions==1.0.0 # via black networkx==3.2.1 - # via torch + # via + # scikit-image + # torch ninja==1.11.1.3 # via mamba-ssm nltk==3.9.1 @@ 
-348,6 +495,8 @@ numpy==1.26.4 # via # -r requirements/test.in # accelerate + # albucore + # albumentations # bitsandbytes # bm25s # contourpy @@ -358,9 +507,15 @@ numpy==1.26.4 # evaluate # fastparquet # genai-perf + # geopandas + # h5py + # imageio # librosa + # lightly + # lightly-utils # matplotlib # mistral-common + # mlflow # mteb # numba # numexpr @@ -368,18 +523,30 @@ numpy==1.26.4 # pandas # patsy # peft + # pycocotools + # pyogrio + # rasterio + # rioxarray # rouge-score # runai-model-streamer # sacrebleu + # scikit-image # scikit-learn # scipy + # segmentation-models-pytorch + # shapely # soxr # statsmodels + # tensorboardx # tensorizer + # tifffile + # torchgeo + # torchmetrics # torchvision # transformers # tritonclient # vocos + # xarray nvidia-cublas-cu12==12.8.3.14 # via # nvidia-cudnn-cu12 @@ -417,6 +584,10 @@ nvidia-nvjitlink-cu12==12.8.61 # torch nvidia-nvtx-cu12==12.8.55 # via torch +omegaconf==2.3.0 + # via + # hydra-core + # lightning open-clip-torch==2.32.0 # via -r requirements/test.in opencensus==0.11.4 @@ -426,7 +597,18 @@ opencensus-context==0.1.3 opencv-python-headless==4.11.0.86 # via # -r requirements/test.in + # albucore + # albumentations # mistral-common +opentelemetry-api==1.35.0 + # via + # mlflow-skinny + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-sdk==1.35.0 + # via mlflow-skinny +opentelemetry-semantic-conventions==0.56b0 + # via opentelemetry-sdk packaging==24.2 # via # accelerate @@ -435,26 +617,44 @@ packaging==24.2 # datasets # evaluate # fastparquet + # geopandas + # gunicorn # huggingface-hub + # hydra-core + # kornia # lazy-loader + # lightning + # lightning-utilities # mamba-ssm # matplotlib + # mlflow-skinny # peft # plotly # pooch + # pyogrio # pytest # pytest-rerunfailures + # pytorch-lightning # ray + # rioxarray + # scikit-image # statsmodels + # tensorboardx + # torchmetrics # transformers # typepy + # xarray pandas==2.2.3 # via # datasets # evaluate # fastparquet # genai-perf + # geopandas + # mlflow # statsmodels + # torchgeo + # xarray pathspec==0.12.1 # via black pathvalidate==3.2.1 @@ -468,9 +668,14 @@ peft==0.13.2 pillow==10.4.0 # via # genai-perf + # imageio + # lightly-utils # matplotlib # mistral-common + # scikit-image + # segmentation-models-pytorch # sentence-transformers + # torchgeo # torchvision platformdirs==4.3.6 # via @@ -489,6 +694,8 @@ portalocker==2.10.1 # via sacrebleu pqdm==0.2.0 # via -r requirements/test.in +pretrainedmodels==0.7.4 + # via segmentation-models-pytorch prometheus-client==0.22.0 # via ray propcache==0.2.0 @@ -499,8 +706,10 @@ protobuf==5.28.3 # via # google-api-core # googleapis-common-protos + # mlflow-skinny # proto-plus # ray + # tensorboardx # tensorizer psutil==6.1.0 # via @@ -515,6 +724,7 @@ pyarrow==18.0.0 # via # datasets # genai-perf + # mlflow pyasn1==0.6.1 # via # pyasn1-modules @@ -523,6 +733,8 @@ pyasn1-modules==0.4.2 # via google-auth pybind11==2.13.6 # via lm-eval +pycocotools==2.0.8 + # via terratorch pycountry==24.6.1 # via pydantic-extra-types pycparser==2.22 @@ -532,8 +744,12 @@ pycryptodomex==3.22.0 pydantic==2.11.5 # via # -r requirements/test.in + # albumentations # datamodel-code-generator + # fastapi + # lightly # mistral-common + # mlflow-skinny # mteb # pydantic-extra-types # ray @@ -543,15 +759,24 @@ pydantic-extra-types==2.10.5 # via mistral-common pygments==2.18.0 # via rich +pyogrio==0.11.0 + # via geopandas pyparsing==3.2.0 - # via matplotlib + # via + # matplotlib + # rasterio +pyproj==3.7.1 + # via + # geopandas + # rioxarray + # 
torchgeo pyrate-limiter==3.7.0 # via schemathesis pystemmer==3.0.0 # via mteb pytablewriter==1.2.0 # via lm-eval -pytest==8.3.3 +pytest==8.3.5 # via # -r requirements/test.in # buildkite-test-collector @@ -564,6 +789,7 @@ pytest==8.3.3 # pytest-subtests # pytest-timeout # schemathesis + # terratorch pytest-asyncio==0.24.0 # via -r requirements/test.in pytest-forked==1.6.0 @@ -578,15 +804,23 @@ pytest-subtests==0.14.1 # via schemathesis pytest-timeout==2.3.1 # via -r requirements/test.in +python-box==7.3.2 + # via terratorch python-dateutil==2.9.0.post0 # via # arrow # botocore + # graphene + # lightly # matplotlib # pandas # typepy python-rapidjson==1.20 # via tritonclient +pytorch-lightning==2.5.2 + # via + # lightly + # lightning pytrec-eval-terrier==0.5.7 # via mteb pytz==2024.2 @@ -596,11 +830,17 @@ pytz==2024.2 pyyaml==6.0.2 # via # accelerate + # albumentations # datamodel-code-generator # datasets # genai-perf # huggingface-hub + # jsonargparse + # lightning + # mlflow-skinny + # omegaconf # peft + # pytorch-lightning # ray # responses # schemathesis @@ -609,6 +849,11 @@ pyyaml==6.0.2 # vocos rapidfuzz==3.12.1 # via jiwer +rasterio==1.4.3 + # via + # rioxarray + # terratorch + # torchgeo ray==2.43.0 # via -r requirements/test.in redis==5.2.0 @@ -627,12 +872,16 @@ regex==2024.9.11 requests==2.32.3 # via # buildkite-test-collector + # databricks-sdk # datasets + # docker # evaluate # google-api-core # huggingface-hub + # lightly # lm-eval # mistral-common + # mlflow-skinny # mteb # pooch # ray @@ -650,8 +899,11 @@ rfc3987==1.3.8 rich==13.9.4 # via # genai-perf + # lightning # mteb # typer +rioxarray==0.19.0 + # via terratorch rouge-score==0.1.2 # via lm-eval rpds-py==0.20.1 @@ -660,6 +912,8 @@ rpds-py==0.20.1 # referencing rsa==4.9.1 # via google-auth +rtree==1.4.0 + # via torchgeo runai-model-streamer==0.11.0 # via -r requirements/test.in runai-model-streamer-s3==0.11.0 @@ -677,21 +931,32 @@ safetensors==0.4.5 # transformers schemathesis==3.39.15 # via -r requirements/test.in +scikit-image==0.25.2 + # via albumentations scikit-learn==1.5.2 # via + # albumentations # librosa # lm-eval + # mlflow # mteb # sentence-transformers scipy==1.13.1 # via + # albumentations # bm25s # librosa + # mlflow # mteb + # scikit-image # scikit-learn # sentence-transformers # statsmodels # vocos +segmentation-models-pytorch==0.4.0 + # via + # terratorch + # torchgeo sentence-transformers==3.2.1 # via # -r requirements/test.in @@ -700,21 +965,30 @@ sentencepiece==0.2.0 # via mistral-common setuptools==77.0.3 # via + # lightning-utilities # mamba-ssm # pytablewriter # torch # triton +shapely==2.1.1 + # via + # geopandas + # torchgeo shellingham==1.5.4 # via typer six==1.16.0 # via # junit-xml + # lightly # opencensus # python-dateutil # rfc3339-validator # rouge-score + # segmentation-models-pytorch smart-open==7.1.0 # via ray +smmap==5.0.2 + # via gitdb sniffio==1.3.1 # via # anyio @@ -727,10 +1001,17 @@ soundfile==0.12.1 # librosa soxr==0.5.0.post1 # via librosa +sqlalchemy==2.0.41 + # via + # alembic + # mlflow sqlitedict==2.1.0 # via lm-eval +sqlparse==0.5.3 + # via mlflow-skinny starlette==0.46.2 # via + # fastapi # schemathesis # starlette-testclient starlette-testclient==0.4.1 @@ -751,18 +1032,29 @@ tenacity==9.0.0 # via # lm-eval # plotly +tensorboardx==2.6.4 + # via lightning tensorizer==2.10.1 # via -r requirements/test.in +terratorch==1.1rc2 + # via -r requirements/test.in threadpoolctl==3.5.0 # via scikit-learn +tifffile==2025.3.30 + # via + # scikit-image + # terratorch tiktoken==0.7.0 # via # 
lm-eval # mistral-common -timm==1.0.11 +timm==1.0.15 # via # -r requirements/test.in # open-clip-torch + # segmentation-models-pytorch + # terratorch + # torchgeo tokenizers==0.21.1 # via # -r requirements/test.in @@ -776,18 +1068,28 @@ torch==2.7.1+cu128 # -r requirements/test.in # accelerate # bitsandbytes + # efficientnet-pytorch # encodec # fastsafetensors + # kornia + # lightly + # lightning # lm-eval # mamba-ssm # mteb # open-clip-torch # peft + # pretrainedmodels + # pytorch-lightning # runai-model-streamer + # segmentation-models-pytorch # sentence-transformers # tensorizer + # terratorch # timm # torchaudio + # torchgeo + # torchmetrics # torchvision # vector-quantize-pytorch # vocos @@ -796,22 +1098,40 @@ torchaudio==2.7.1+cu128 # -r requirements/test.in # encodec # vocos +torchgeo==0.7.0 + # via terratorch +torchmetrics==1.7.4 + # via + # lightning + # pytorch-lightning + # terratorch + # torchgeo torchvision==0.22.1+cu128 # via # -r requirements/test.in + # lightly # open-clip-torch + # pretrainedmodels + # segmentation-models-pytorch + # terratorch # timm + # torchgeo tqdm==4.66.6 # via # datasets # evaluate # huggingface-hub + # lightly + # lightning # lm-eval # mteb # nltk # open-clip-torch # peft # pqdm + # pretrainedmodels + # pytorch-lightning + # segmentation-models-pytorch # sentence-transformers # tqdm-multiprocess # transformers @@ -843,18 +1163,34 @@ typer==0.15.2 # via fastsafetensors types-python-dateutil==2.9.0.20241206 # via arrow +typeshed-client==2.8.2 + # via jsonargparse typing-extensions==4.12.2 # via + # albumentations + # alembic + # fastapi + # graphene # huggingface-hub # librosa + # lightning + # lightning-utilities # mistral-common + # mlflow-skinny # mteb + # opentelemetry-api + # opentelemetry-sdk + # opentelemetry-semantic-conventions # pqdm # pydantic # pydantic-core # pydantic-extra-types + # pytorch-lightning + # sqlalchemy # torch + # torchgeo # typer + # typeshed-client # typing-inspection typing-inspection==0.4.1 # via pydantic @@ -866,9 +1202,13 @@ urllib3==2.2.3 # via # blobfile # botocore + # docker + # lightly # requests # responses # tritonclient +uvicorn==0.35.0 + # via mlflow-skinny vector-quantize-pytorch==1.21.2 # via -r requirements/test.in virtualenv==20.31.2 @@ -880,11 +1220,15 @@ wcwidth==0.2.13 webcolors==24.11.1 # via jsonschema werkzeug==3.1.3 - # via schemathesis + # via + # flask + # schemathesis word2number==1.1 # via lm-eval wrapt==1.17.2 # via smart-open +xarray==2025.7.1 + # via rioxarray xxhash==3.5.0 # via # datasets @@ -893,5 +1237,7 @@ yarl==1.17.1 # via # aiohttp # schemathesis +zipp==3.23.0 + # via importlib-metadata zstandard==0.23.0 # via lm-eval diff --git a/tests/models/multimodal/pooling/test_prithvi_mae.py b/tests/models/multimodal/pooling/test_prithvi_mae.py new file mode 100644 index 000000000000..f08d83c08212 --- /dev/null +++ b/tests/models/multimodal/pooling/test_prithvi_mae.py @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +import torch + +from vllm.utils import set_default_torch_num_threads + +from ....conftest import VllmRunner + + +def generate_test_mm_data(): + mm_data = { + "pixel_values": torch.full((6, 512, 512), 1.0, dtype=torch.float16), + "location_coords": torch.full((1, 2), 1.0, dtype=torch.float16), + } + return mm_data + + +def _run_test( + vllm_runner: type[VllmRunner], + model: str, +) -> None: + + prompt = [ + { + # This model deals with no text input + "prompt_token_ids": [1], + 
"multi_modal_data": generate_test_mm_data(), + } for _ in range(10) + ] + + with ( + set_default_torch_num_threads(1), + vllm_runner( + model, + task="embed", + dtype=torch.float16, + enforce_eager=True, + skip_tokenizer_init=True, + # Limit the maximum number of sequences to avoid the + # test going OOM during the warmup run + max_num_seqs=32, + ) as vllm_model, + ): + vllm_model.encode(prompt) + + +MODELS = ["christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"] + + +@pytest.mark.core_model +@pytest.mark.parametrize("model", MODELS) +def test_models_image( + hf_runner, + vllm_runner, + image_assets, + model: str, +) -> None: + _run_test( + vllm_runner, + model, + ) diff --git a/vllm/config.py b/vllm/config.py index 223c1968c275..764472c47ef6 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -651,6 +651,8 @@ def __post_init__(self) -> None: self.original_max_model_len = self.max_model_len self.max_model_len = self.get_and_verify_max_len(self.max_model_len) self.multimodal_config = self._init_multimodal_config() + self.model_supports_multimodal_raw_input = ( + self.registry.supports_multimodal_raw_input(self.architectures)) if not self.skip_tokenizer_init: self._verify_tokenizer_mode() @@ -1243,10 +1245,10 @@ def get_sliding_window(self) -> Optional[Union[int, list[Optional[int]]]]: return self.get_hf_config_sliding_window() def get_vocab_size(self) -> int: - return self.hf_text_config.vocab_size + return getattr(self.hf_text_config, "vocab_size", 0) def get_hidden_size(self) -> int: - return self.hf_text_config.hidden_size + return getattr(self.hf_text_config, "hidden_size", 0) @property def is_deepseek_mla(self) -> bool: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index e2f8de1990b5..3081995e693f 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -238,14 +238,14 @@ def __init__( self.log_stats = log_stats self.use_cached_outputs = use_cached_outputs - if not self.model_config.skip_tokenizer_init: - self.tokenizer = self._init_tokenizer() - self.detokenizer = Detokenizer(self.tokenizer) - tokenizer_group = self.get_tokenizer_group() - else: + if self.model_config.skip_tokenizer_init: self.tokenizer = None self.detokenizer = None tokenizer_group = None + else: + self.tokenizer = self._init_tokenizer() + self.detokenizer = Detokenizer(self.tokenizer) + tokenizer_group = self.get_tokenizer_group() # Ensure that the function doesn't contain a reference to self, # to avoid engine GC issues diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 8f6a7db7aa8d..957b57276b4c 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -136,6 +136,40 @@ def supports_multimodal( return getattr(model, "supports_multimodal", False) +@runtime_checkable +class SupportsMultiModalWithRawInput(SupportsMultiModal, Protocol): + """The interface required for all multi-modal models.""" + + supports_multimodal_raw_input: ClassVar[Literal[True]] = True + """ + A flag that indicates this model supports multi-modal inputs and processes + them in their raw form and not embeddings. + + Note: + There is no need to redefine this flag if this class is in the + MRO of your model class. + """ + + +@overload +def supports_multimodal_raw_input( + model: object) -> TypeIs[SupportsMultiModalWithRawInput]: + ... + + +@overload +def supports_multimodal_raw_input( + model: type[object]) -> TypeIs[type[SupportsMultiModalWithRawInput]]: + ... 
+ + +def supports_multimodal_raw_input( + model: Union[type[object], object] +) -> Union[TypeIs[type[SupportsMultiModalWithRawInput]], + TypeIs[SupportsMultiModalWithRawInput]]: + return getattr(model, "supports_multimodal_raw_input", False) + + @runtime_checkable class SupportsScoreTemplate(Protocol): """The interface required for all models that support score template.""" diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py index d51fcec07fd6..0f00fd47fe4f 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py @@ -16,6 +16,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only IBM/NASA Prithvi Geospatial model.""" + from collections.abc import Iterable, Mapping, Sequence from typing import Optional, Union @@ -27,13 +28,14 @@ from vllm.model_executor.layers.pooler import (AllPool, PoolerHead, PoolerIdentity, SimplePooler) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.interfaces import (IsAttentionFree, - SupportsMultiModal, - SupportsV0Only) +from vllm.model_executor.models.interfaces import ( + IsAttentionFree, MultiModalEmbeddings, SupportsMultiModalWithRawInput) from vllm.model_executor.models.utils import AutoWeightsLoader from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputs, MultiModalKwargs) + MultiModalFieldElem, MultiModalInputs, + MultiModalKwargs, MultiModalKwargsItem, + MultiModalSharedField, PlaceholderRange) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptUpdate) @@ -62,8 +64,9 @@ def get_dummy_mm_data( # The size of pixel_values might change in the cases where we resize # the input but never exceeds the dimensions below. return { - "pixel_values": torch.full((1, 6, 512, 512), 1.0), - "location_coords": torch.full((1, 2), 1.0), + "pixel_values": torch.full((6, 512, 512), 1.0, + dtype=torch.float16), + "location_coords": torch.full((1, 2), 1.0, dtype=torch.float16), } @@ -75,8 +78,10 @@ def _get_mm_fields_config( hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - location_coords=MultiModalFieldConfig.batched("image"), + pixel_values=MultiModalFieldConfig.shared(batch_size=1, + modality="image"), + location_coords=MultiModalFieldConfig.shared(batch_size=1, + modality="image"), ) def _get_prompt_updates( @@ -99,23 +104,48 @@ def apply( for k, v in mm_data.items(): mm_kwargs[k] = v + mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]} + + # This model receives in input a multi-dimensional tensor representing + # a single image patch and therefore it is not to be split + # into multiple elements, but rather to be considered a single one. + # Hence, the decision of using a MultiModalSharedField. + # The expected shape is (num_channels, width, height). + + # This model however allows the user to also submit multiple image + # patches as a batch, adding a further dimension to the above shape. + # At this stage we only support submitting one patch per request and + # batching is achieved via vLLM batching. + # TODO (christian-pinto): enable support for multi patch requests + # in tandem with vLLM batching. 
+ multimodal_kwargs_items = [ + MultiModalKwargsItem.from_elems([ + MultiModalFieldElem( + modality="image", + key=key, + data=data, + field=MultiModalSharedField(1), + ) for key, data in mm_kwargs.items() + ]) + ] return MultiModalInputs( type="multimodal", prompt=prompt, prompt_token_ids=[1], - mm_kwargs=MultiModalKwargs(mm_kwargs), + mm_kwargs=MultiModalKwargs.from_items(multimodal_kwargs_items), mm_hashes=None, - mm_placeholders={}, + mm_placeholders=mm_placeholders, ) @MULTIMODAL_REGISTRY.register_processor( PrithviGeoSpatialMAEMultiModalProcessor, info=PrithviGeoSpatialMAEProcessingInfo, - dummy_inputs=PrithviGeoSpatialMAEInputBuilder) -class PrithviGeoSpatialMAE(nn.Module, IsAttentionFree, SupportsMultiModal, - SupportsV0Only): + dummy_inputs=PrithviGeoSpatialMAEInputBuilder, +) +class PrithviGeoSpatialMAE(nn.Module, IsAttentionFree, + SupportsMultiModalWithRawInput): """Prithvi Masked Autoencoder""" is_pooling_model = True @@ -128,10 +158,10 @@ def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: raise ValueError("Only image modality is supported") def _instantiate_model(self, config: dict) -> Optional[nn.Module]: - # We might be able/need to support different tasks with this same model if config["task_args"]["task"] == "SemanticSegmentationTask": from terratorch.cli_tools import SemanticSegmentationTask + task = SemanticSegmentationTask( config["model_args"], config["task_args"]["model_factory"], @@ -144,7 +174,8 @@ def _instantiate_model(self, config: dict) -> Optional[nn.Module]: scheduler_hparams=config["scheduler_params"], plot_on_val=config["task_args"]["plot_on_val"], freeze_decoder=config["task_args"]["freeze_decoder"], - freeze_backbone=config["task_args"]["freeze_backbone"]) + freeze_backbone=config["task_args"]["freeze_backbone"], + ) return task.model else: @@ -168,12 +199,10 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = ""): def _parse_and_validate_multimodal_data( self, **kwargs) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - pixel_values = kwargs.pop("pixel_values", None) if not isinstance(pixel_values, torch.Tensor): raise ValueError(f"Incorrect type of pixel_values. " f"Got type: {type(pixel_values)}") - pixel_values = torch.unbind(pixel_values, dim=0)[0] location_coords = kwargs.pop("location_coords", None) if not isinstance(location_coords, torch.Tensor): @@ -185,6 +214,17 @@ def _parse_and_validate_multimodal_data( return pixel_values, location_coords + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + # We do not really use any input tokens and therefore no embeddings + # to be calculated. However, due to the mandatory token ids in + # the input prompt we pass one token and the size of the dummy + # embedding tensors must reflect that. 
+ return torch.empty((input_ids.shape[0], 0)) + def forward( self, input_ids: Optional[torch.Tensor], diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index fafb6a704383..2aaac7798fc0 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -22,8 +22,8 @@ from .interfaces import (has_inner_state, has_noops, is_attention_free, is_hybrid, supports_cross_encoding, - supports_multimodal, supports_pp, - supports_transcription, supports_v0_only) + supports_multimodal, supports_multimodal_raw_input, + supports_pp, supports_transcription, supports_v0_only) from .interfaces_base import is_text_generation_model logger = init_logger(__name__) @@ -287,6 +287,7 @@ class _ModelInfo: is_pooling_model: bool supports_cross_encoding: bool supports_multimodal: bool + supports_multimodal_raw_input: bool supports_pp: bool has_inner_state: bool is_attention_free: bool @@ -304,6 +305,7 @@ def from_model_cls(model: type[nn.Module]) -> "_ModelInfo": is_pooling_model=True, # Can convert any model into a pooling model supports_cross_encoding=supports_cross_encoding(model), supports_multimodal=supports_multimodal(model), + supports_multimodal_raw_input=supports_multimodal_raw_input(model), supports_pp=supports_pp(model), has_inner_state=has_inner_state(model), is_attention_free=is_attention_free(model), @@ -573,6 +575,13 @@ def is_multimodal_model( model_cls, _ = self.inspect_model_cls(architectures) return model_cls.supports_multimodal + def supports_multimodal_raw_input( + self, + architectures: Union[str, list[str]], + ) -> bool: + model_cls, _ = self.inspect_model_cls(architectures) + return model_cls.supports_multimodal_raw_input + def is_pp_supported_model( self, architectures: Union[str, list[str]], diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 27aaa661c35c..c44fcacd246c 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -266,7 +266,7 @@ def create_processor( if not model_config.is_multimodal_model: raise ValueError(f"{model_config.model} is not a multimodal model") - if tokenizer is None: + if tokenizer is None and not model_config.skip_tokenizer_init: tokenizer = cached_tokenizer_from_config(model_config) if disable_cache is None: mm_config = model_config.get_multimodal_config() diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 79b5d5ae4a23..95a474228d4f 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -94,11 +94,14 @@ def __init__( self.log_requests = log_requests self.log_stats = log_stats - # Tokenizer (+ ensure liveness if running in another process). - self.tokenizer = init_tokenizer_from_configs( - model_config=vllm_config.model_config, - scheduler_config=vllm_config.scheduler_config, - lora_config=vllm_config.lora_config) + if self.model_config.skip_tokenizer_init: + self.tokenizer = None + else: + # Tokenizer (+ ensure liveness if running in another process). + self.tokenizer = init_tokenizer_from_configs( + model_config=vllm_config.model_config, + scheduler_config=vllm_config.scheduler_config, + lora_config=vllm_config.lora_config) # Processor (converts Inputs --> EngineCoreRequests). 
self.processor = Processor( @@ -525,6 +528,10 @@ async def get_tokenizer( self, lora_request: Optional[LoRARequest] = None, ) -> AnyTokenizer: + if self.tokenizer is None: + raise ValueError("Unable to get tokenizer because " + "skip_tokenizer_init is True") + return self.tokenizer.get_lora_tokenizer(lora_request) async def is_tracing_enabled(self) -> bool: diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index a2328c37ba0c..29aca1ad698e 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -82,11 +82,14 @@ def __init__( self.dp_group = None self.should_execute_dummy_batch = False - # Tokenizer (+ ensure liveness if running in another process). - self.tokenizer = init_tokenizer_from_configs( - model_config=vllm_config.model_config, - scheduler_config=vllm_config.scheduler_config, - lora_config=vllm_config.lora_config) + if self.model_config.skip_tokenizer_init: + self.tokenizer = None + else: + # Tokenizer (+ ensure liveness if running in another process). + self.tokenizer = init_tokenizer_from_configs( + model_config=vllm_config.model_config, + scheduler_config=vllm_config.scheduler_config, + lora_config=vllm_config.lora_config) # Processor (convert Inputs --> EngineCoreRequests) self.processor = Processor(vllm_config=vllm_config, diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 2bcd61d1f0aa..3be6c4821214 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -327,14 +327,16 @@ def add_request( if request_id in self.request_states: raise ValueError(f"Request id {request_id} already running.") - req_state = RequestState.from_new_request( - tokenizer=self.tokenizer.get_lora_tokenizer(request.lora_request), - request=request, - prompt=prompt, - parent_req=parent_req, - request_index=request_index, - queue=queue, - log_stats=self.log_stats) + tokenizer = None if not self.tokenizer else \ + self.tokenizer.get_lora_tokenizer(request.lora_request) + + req_state = RequestState.from_new_request(tokenizer=tokenizer, + request=request, + prompt=prompt, + parent_req=parent_req, + request_index=request_index, + queue=queue, + log_stats=self.log_stats) self.request_states[request_id] = req_state self.lora_states.add_request(req_state) if parent_req: diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 7af4ed54a220..725152f978d6 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -380,7 +380,6 @@ def _validate_model_input( prompt_type: Literal["encoder", "decoder"], ): model_config = self.model_config - tokenizer = self.tokenizer.get_lora_tokenizer(lora_request) prompt_ids = prompt_inputs["prompt_token_ids"] if not prompt_ids: @@ -389,9 +388,14 @@ def _validate_model_input( else: raise ValueError(f"The {prompt_type} prompt cannot be empty") - max_input_id = max(prompt_ids, default=0) - if max_input_id > tokenizer.max_token_id: - raise ValueError(f"Token id {max_input_id} is out of vocabulary") + if self.model_config.skip_tokenizer_init: + tokenizer = None + else: + tokenizer = self.tokenizer.get_lora_tokenizer(lora_request) + max_input_id = max(prompt_ids, default=0) + if max_input_id > tokenizer.max_token_id: + raise ValueError( + f"Token id {max_input_id} is out of vocabulary") max_prompt_len = self.model_config.max_model_len if len(prompt_ids) > max_prompt_len: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6c91ba2324a2..247c26e76178 100644 --- a/vllm/v1/worker/gpu_model_runner.py 
+++ b/vllm/v1/worker/gpu_model_runner.py @@ -126,6 +126,8 @@ def __init__( self.is_multimodal_model = model_config.is_multimodal_model self.is_pooling_model = model_config.pooler_config is not None + self.model_supports_multimodal_raw_input = ( + model_config.model_supports_multimodal_raw_input) self.max_model_len = model_config.max_model_len self.max_num_tokens = scheduler_config.max_num_batched_tokens self.max_num_reqs = scheduler_config.max_num_seqs @@ -328,6 +330,14 @@ def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None: Args: scheduler_output: The scheduler output. """ + # Attention free models have zero kv_cache_goups, however models + # like Mamba are also attention free but use the kv_cache for + # keeping its internal state. This is why we check the number + # of kv_cache groups instead of solely checking + # for self.model_config.is_attention_free. + if len(self.kv_cache_config.kv_cache_groups) == 0: + return + self.attn_metadata_builders[0].reorder_batch(self.input_batch, scheduler_output) @@ -565,6 +575,38 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Refresh batch metadata with any pending updates. self.input_batch.refresh_metadata() + def _init_model_kwargs_for_multimodal_model( + self, + scheduler_output: Optional["SchedulerOutput"] = None, + num_reqs: int = -1, + ) -> dict[str, Any]: + + model_kwargs: dict[str, Any] = {} + if self.model_supports_multimodal_raw_input: + # This model requires the raw multimodal data in input. + if scheduler_output: + multi_modal_kwargs_list = [] + for req in scheduler_output.scheduled_new_reqs: + req_mm_inputs = req.mm_inputs + if not isinstance(req_mm_inputs, list): + req_mm_inputs = list(req_mm_inputs) + multi_modal_kwargs_list.extend(req_mm_inputs) + multi_modal_kwargs = MultiModalKwargs.batch( + multi_modal_kwargs_list) + else: + # The only case where SchedulerOutput is None is for + # a dummy run let's get some dummy data. + dummy_data = [ + self.mm_registry.get_decoder_dummy_data( + model_config=self.model_config, + seq_len=1).multi_modal_data for i in range(num_reqs) + ] + multi_modal_kwargs = MultiModalKwargs.batch(dummy_data) + + model_kwargs.update(multi_modal_kwargs) + + return model_kwargs + def _get_cumsum_and_arange( self, num_tokens: np.ndarray, @@ -1359,10 +1401,14 @@ def execute_model( # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. input_ids = self.input_ids[:num_scheduled_tokens] + + model_kwargs = self._init_model_kwargs_for_multimodal_model( + scheduler_output=scheduler_output) inputs_embeds = self.model.get_input_embeddings( input_ids=input_ids, multimodal_embeddings=mm_embeds or None, ) + # TODO(woosuk): Avoid the copy. Optimize. self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds) inputs_embeds = self.inputs_embeds[:num_input_tokens] @@ -1374,6 +1420,7 @@ def execute_model( # then the embedding layer is not included in the CUDA graph. 
input_ids = self.input_ids[:num_input_tokens] inputs_embeds = None + model_kwargs = {} if self.uses_mrope: positions = self.mrope_positions[:, :num_input_tokens] else: @@ -1406,6 +1453,10 @@ def execute_model( positions=positions, intermediate_tensors=intermediate_tensors, inputs_embeds=inputs_embeds, + **MultiModalKwargs.as_kwargs( + model_kwargs, + device=self.device, + ), ) self.maybe_wait_for_kv_save() @@ -2084,11 +2135,15 @@ def _dummy_run( num_scheduled_tokens): model = self.model if self.is_multimodal_model: + model_kwargs = self._init_model_kwargs_for_multimodal_model( + num_reqs=num_reqs) input_ids = None inputs_embeds = self.inputs_embeds[:num_tokens] else: input_ids = self.input_ids[:num_tokens] inputs_embeds = None + model_kwargs = {} + if self.uses_mrope: positions = self.mrope_positions[:, :num_tokens] else: @@ -2117,7 +2172,12 @@ def _dummy_run( positions=positions, intermediate_tensors=intermediate_tensors, inputs_embeds=inputs_embeds, + **MultiModalKwargs.as_kwargs( + model_kwargs, + device=self.device, + ), ) + if self.use_aux_hidden_state_outputs: hidden_states, _ = outputs else: From 49b48be727f3ff051a1cde28083f37fc04bee8aa Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Wed, 23 Jul 2025 11:00:47 -0700 Subject: [PATCH 62/63] Add test case for compiling multiple graphs (#21044) Signed-off-by: Yong Hoon Shin Signed-off-by: qizixi --- .../compile/piecewise/test_multiple_graphs.py | 350 ++++++++++++++++++ vllm/compilation/compiler_interface.py | 6 + vllm/compilation/decorators.py | 35 +- 3 files changed, 390 insertions(+), 1 deletion(-) create mode 100644 tests/compile/piecewise/test_multiple_graphs.py diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/piecewise/test_multiple_graphs.py new file mode 100644 index 000000000000..e460d7095178 --- /dev/null +++ b/tests/compile/piecewise/test_multiple_graphs.py @@ -0,0 +1,350 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Test (piecewise) compilation with a simple model where multiple submodules +are compiled and graph captured separately. 
+""" +import torch +from torch import nn +from torch.library import Library + +from vllm.compilation.backends import set_model_tag +from vllm.compilation.counter import compilation_counter +from vllm.compilation.decorators import (ignore_torch_compile, + support_torch_compile) +from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, + set_current_vllm_config) +from vllm.envs import VLLM_USE_V1 +from vllm.forward_context import set_forward_context +from vllm.utils import direct_register_custom_op + +# create a library to hold the custom op +silly_lib = Library("silly", "FRAGMENT") # noqa + +BATCH_SIZE = 32 +MLP_SIZE = 128 +HIDDEN_SIZE = 1024 +RANDOM_SEED = 0 + + +def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: + out.copy_(q) + out += k + out += v + + +def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: + return + + +direct_register_custom_op( + op_name="attention", + op_func=silly_attention, + mutates_args=["out"], + fake_impl=silly_attention_fake, + target_lib=silly_lib, +) + + +@support_torch_compile +class ParentModel(nn.Module): + + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = '', + **kwargs) -> None: + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x + + +class Attention(nn.Module): + + def __init__(self, mlp_size: int, hidden_size: int) -> None: + super().__init__() + self.pre_attn = nn.Linear(mlp_size, hidden_size, bias=False) + self.post_attn = nn.Linear(hidden_size, mlp_size, bias=False) + self.rms_norm_weight = nn.Parameter(torch.ones(hidden_size)) + + # Initialize to same weights for testing + nn.init.xavier_normal_( + self.pre_attn.weight.data, + generator=torch.Generator().manual_seed(RANDOM_SEED), + gain=0.001) + nn.init.xavier_normal_( + self.post_attn.weight.data, + generator=torch.Generator().manual_seed(RANDOM_SEED), + gain=0.001) + + def rms_norm_ref(self, x: torch.Tensor) -> torch.Tensor: + x_f32 = x.float() + return (x_f32 * torch.rsqrt( + torch.mean(x_f32.square(), dim=-1, keepdim=True) + 1e-6) * + self.rms_norm_weight).to(x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.pre_attn(x) + x = self.rms_norm_ref(x) + attn_output = torch.empty_like(x) + torch.ops.silly.attention(x, x, x, attn_output) + x = attn_output + x = self.rms_norm_ref(x) + x = self.post_attn(x) + return x + + +@support_torch_compile +class CompiledAttention(nn.Module): + + def __init__(self, + *, + mlp_size: int, + hidden_size: int, + vllm_config: VllmConfig, + prefix: str = '', + **kwargs) -> None: + super().__init__() + self.attn = Attention(mlp_size, hidden_size) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.attn(x) + + +@support_torch_compile +class CompiledAttentionTwo(CompiledAttention): + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.attn(x) + x + + +@ignore_torch_compile +class SimpleModelWithTwoGraphs(ParentModel): + + def __init__(self, + *, + mlp_size: int, + hidden_size: int, + vllm_config: VllmConfig, + prefix: str = '', + **kwargs) -> None: + super().__init__(vllm_config=vllm_config, prefix=prefix) + # Test will fail without set_model_tag here with error: + # "ValueError: too many values to unpack (expected 3)" + # This is because CompiledAttention and CompiledAttentionTwo + # have different implmentations but the same torch.compile + # cache dir will be used as default prefix is 'model_tag' + with set_model_tag("attn_one"): + 
self.attn_one = CompiledAttention( + mlp_size=mlp_size, + hidden_size=hidden_size, + vllm_config=vllm_config, + prefix=f"{prefix}.attn_one", + ) + with set_model_tag("attn_two"): + self.attn_two = CompiledAttentionTwo( + mlp_size=mlp_size, + hidden_size=hidden_size, + vllm_config=vllm_config, + prefix=f"{prefix}.attn_two", + ) + + self.hidden_states = torch.zeros((BATCH_SIZE, MLP_SIZE)).cuda() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + bsz = x.shape[0] + # CUDAGraph expects same tensor addresses for each run + self.hidden_states[:bsz].copy_(x) + x = self.attn_one(self.hidden_states[:bsz]) + self.hidden_states[:bsz].copy_(x) + x = self.attn_two(self.hidden_states[:bsz]) + return x + + +def test_ignore_torch_compile_decorator(): + assert VLLM_USE_V1 + + # piecewise + vllm_config = VllmConfig(compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_cudagraph=True, + splitting_ops=["silly.attention"], + cudagraph_capture_sizes=[1, 2], + )) + + @support_torch_compile + class A(nn.Module): + + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = '', + **kwargs) -> None: + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + x + attn_output = torch.empty_like(x) + torch.ops.silly.attention(x, x, x, attn_output) + x = attn_output + x = x * 3 + return x + + @ignore_torch_compile + class B(A): + ... + + @support_torch_compile + class C(B): + ... + + with set_current_vllm_config(vllm_config): + mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda() + + # A has support_torch_compile + with compilation_counter.expect( + num_graphs_seen=1, + num_piecewise_graphs_seen=3, + num_piecewise_capturable_graphs_seen=2, + num_backend_compilations=2, + num_cudagraph_captured=4, + # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ), set_forward_context({}, vllm_config=vllm_config): + # first run is for compile + mod_A(torch.randn(BATCH_SIZE, MLP_SIZE).cuda()) + # run cudagraph captured sizes + mod_A(torch.randn(2, MLP_SIZE).cuda()) + mod_A(torch.randn(1, MLP_SIZE).cuda()) + + with set_current_vllm_config(vllm_config): + mod_B = B(vllm_config=vllm_config, prefix='').eval().cuda() + + # B's ignore_torch_compile should override A's support_torch_compile + with compilation_counter.expect( + num_graphs_seen=0, + num_piecewise_graphs_seen=0, + num_piecewise_capturable_graphs_seen=0, + num_backend_compilations=0, + num_cudagraph_captured=0, + ), set_forward_context({}, vllm_config=vllm_config): + mod_B(torch.randn(BATCH_SIZE, MLP_SIZE).cuda()) + mod_B(torch.randn(2, MLP_SIZE).cuda()) + mod_B(torch.randn(1, MLP_SIZE).cuda()) + + with set_current_vllm_config(vllm_config): + mod_C = C(vllm_config=vllm_config, prefix='').eval().cuda() + + # C's support_torch_compile should override B's ignore_torch_compile + with compilation_counter.expect( + num_graphs_seen=1, + num_piecewise_graphs_seen=3, + num_piecewise_capturable_graphs_seen=2, + num_backend_compilations=2, + num_cudagraph_captured=4, + # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ), set_forward_context({}, vllm_config=vllm_config): + mod_C(torch.randn(BATCH_SIZE, MLP_SIZE).cuda()) + mod_C(torch.randn(2, MLP_SIZE).cuda()) + mod_C(torch.randn(1, MLP_SIZE).cuda()) + + +@torch.inference_mode +def run_model(vllm_config, model: nn.Module, inputs: torch.Tensor): + with set_forward_context({}, vllm_config=vllm_config): + # First run is for compile + model(inputs) + + # Run CUDAGraph captured sizes + model(inputs[:2]) + model(inputs[:1]) + + output = 
model(inputs[:2]) + + output = output.cpu() + return output.cpu() + + +def test_multi_graph_piecewise_compile_outputs_equal(): + outputs = [] + + # piecewise compile + vllm_config = VllmConfig(compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_cudagraph=True, + splitting_ops=["silly.attention"], + cudagraph_capture_sizes=[1, 2], + )) + + with set_current_vllm_config(vllm_config): + model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE, + hidden_size=HIDDEN_SIZE, + vllm_config=vllm_config, + prefix='').eval().cuda() + + # Pre-allocate memory for CUDAGraph which expects + # static tensor addresses + inputs = torch.randn(BATCH_SIZE, MLP_SIZE).cuda() + + with compilation_counter.expect( + num_graphs_seen=2, # two graphs for the model + num_piecewise_graphs_seen=6, + # attn_one, attn_two each has 3 piecewise graphs + # (pre attn, post attn, silly_attention) each + num_piecewise_capturable_graphs_seen=4, + # attn_one, attn_two has pre attn and post attn each, total=4 + num_backend_compilations=4, # num_piecewise_capturable_graphs_seen + num_cudagraph_captured=8, + # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + outputs.append(run_model(vllm_config, model, inputs)) + + # no compile or cudagraph + vllm_config = VllmConfig(compilation_config=CompilationConfig( + level=CompilationLevel.NO_COMPILATION, )) + + with set_current_vllm_config(vllm_config): + model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE, + hidden_size=HIDDEN_SIZE, + vllm_config=vllm_config, + prefix='').eval().cuda() + + with compilation_counter.expect( + num_graphs_seen=0, + num_piecewise_graphs_seen=0, + num_piecewise_capturable_graphs_seen=0, + num_backend_compilations=0, + num_cudagraph_captured=0, + ): + outputs.append(run_model(vllm_config, model, inputs)) + + # piecewise compile without CUDA graph + vllm_config = VllmConfig(compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_cudagraph=False, + splitting_ops=["silly.attention"], + )) + + with set_current_vllm_config(vllm_config): + model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE, + hidden_size=HIDDEN_SIZE, + vllm_config=vllm_config, + prefix='').eval().cuda() + + with compilation_counter.expect( + num_graphs_seen=2, + num_piecewise_graphs_seen=6, + num_piecewise_capturable_graphs_seen=4, + num_backend_compilations=4, + num_cudagraph_captured=0, # no cudagraph captured + ): + outputs.append(run_model(vllm_config, model, inputs)) + + # Generally don't expect outputs with and without inductor + # to be bitwise equivalent + assert torch.allclose(outputs[0], outputs[1]) + + # Expect bitwise equivalence using inductor w/ and w/o cudagraph + assert torch.equal(outputs[0], outputs[2]) diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index b529f84b7987..7158fd685964 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -423,6 +423,12 @@ def _get_shape_env() -> AlwaysHitShapeEnv: if is_torch_equal_or_newer("2.6"): stack.enter_context( torch._inductor.config.patch(fx_graph_remote_cache=False)) + # InductorAdaptor (unfortunately) requires AOTAutogradCache + # to be turned off to run. It will fail to acquire the hash_str + # and error if not. + # StandaloneInductorAdaptor (PyTorch 2.8+) fixes this problem. 
+ stack.enter_context( + torch._functorch.config.patch(enable_autograd_cache=False)) stack.enter_context( torch._functorch.config.patch( enable_remote_autograd_cache=False)) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 05e4ca9f08b3..f3592324d8cf 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -20,9 +20,38 @@ logger = init_logger(__name__) +IGNORE_COMPILE_KEY = "_ignore_compile_vllm" + _T = TypeVar("_T", bound=type[nn.Module]) +def ignore_torch_compile(cls: _T) -> _T: + """ + A decorator to ignore support_torch_compile decorator + on the class. This is useful when a parent class has + a support_torch_compile decorator, but we don't want to + compile the class `cls` that inherits the parent class. + This only ignores compiling the forward of the class the + decorator is applied to. + + If the parent has ignore_torch_compile but the child has + support_torch_compile, the child will still be compiled. + + If the class has one or more submodules + that have support_torch_compile decorator applied, compile will + not be ignored for those submodules. + """ + setattr(cls, IGNORE_COMPILE_KEY, True) + return cls + + +def _should_ignore_torch_compile(cls) -> bool: + """ + Check if the class should be ignored for torch.compile. + """ + return getattr(cls, IGNORE_COMPILE_KEY, False) + + @overload def support_torch_compile( *, @@ -148,6 +177,8 @@ def _support_torch_compile( old_init = cls.__init__ + setattr(cls, IGNORE_COMPILE_KEY, False) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs): old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs) self.vllm_config = vllm_config @@ -156,9 +187,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs): self.do_not_compile = \ vllm_config.compilation_config.level in [ CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS - ] or not supports_dynamo() + ] or not supports_dynamo() or _should_ignore_torch_compile( + self.__class__) if self.do_not_compile: return + compilation_counter.num_models_seen += 1 TorchCompileWrapperWithCustomDispatcher.__init__( self, compilation_level=vllm_config.compilation_config.level) From f08230a9ad7942d75a194ebe8f47af56c090c29a Mon Sep 17 00:00:00 2001 From: QiliangCui Date: Wed, 23 Jul 2025 11:29:36 -0700 Subject: [PATCH 63/63] [TPU][TEST] Fix the downloading issue in TPU v1 test 11. 
(#21418) Signed-off-by: Qiliang Cui Signed-off-by: qizixi --- .buildkite/scripts/hardware_ci/run-tpu-v1-test.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index 60f0d174bd6c..d39acae0b043 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -62,7 +62,8 @@ echo "Results will be stored in: $RESULTS_DIR" echo "--- Installing Python dependencies ---" python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ - && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 + && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \ + && python3 -m pip install --progress-bar off hf-transfer echo "--- Python dependencies installed ---" export VLLM_USE_V1=1 export VLLM_XLA_CHECK_RECOMPILATION=1 @@ -150,7 +151,7 @@ run_and_track_test 9 "test_multimodal.py" \ run_and_track_test 10 "test_pallas.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" run_and_track_test 11 "test_struct_output_generate.py" \ - "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" + "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" run_and_track_test 12 "test_moe_pallas.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" run_and_track_test 13 "test_lora.py" \
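
For reference, below is a minimal sketch of how a caller would exercise the skip_tokenizer_init path introduced in the engine, processor, and output-processor hunks above, using the offline LLM API. This is not part of the patch series: the model name and token IDs are placeholders, and disabling detokenization is assumed to be required because no tokenizer object is constructed on the engine side.

    # Sketch only: exercises the skip_tokenizer_init path changed above.
    # Model name and token IDs are placeholders, not taken from these patches.
    from vllm import LLM, SamplingParams
    from vllm.inputs import TokensPrompt

    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", skip_tokenizer_init=True)

    # With no tokenizer initialized, prompts must already be token IDs and
    # detokenization must stay off; there is nothing to encode or decode text with.
    prompt = TokensPrompt(prompt_token_ids=[128000, 9906, 1917])
    params = SamplingParams(max_tokens=16, detokenize=False)

    for output in llm.generate(prompt, params):
        # Only token IDs are available in the outputs on this path.
        print(output.outputs[0].token_ids)

On this configuration, the async get_tokenizer() patched above is expected to raise the new "Unable to get tokenizer because skip_tokenizer_init is True" ValueError rather than return a tokenizer, and the processor skips the out-of-vocabulary token check since no max_token_id is available.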