
Commit 0b360f5

Merge remote-tracking branch 'upstream/main' into dcp-dev

Signed-off-by: QiuChunshuo <[email protected]>
2 parents: c172a1a + 4ab34f6

File tree: 401 files changed (+17407 lines added, -7219 lines removed)

.buildkite/release-pipeline.yaml

Lines changed: 1 addition & 1 deletion
@@ -132,7 +132,7 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
     env:

.buildkite/scripts/hardware_ci/run-amd-test.sh

Lines changed: 2 additions & 6 deletions
@@ -78,17 +78,13 @@ HF_MOUNT="/root/.cache/huggingface"
 commands=$@
 echo "Commands:$commands"
 
-if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
-  commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
-fi
+commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
 
 if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
   commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 fi
 
-if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
-  commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
-fi
+commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
 
 if [[ $commands == *"pytest -v -s lora"* ]]; then
   commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 3 additions & 2 deletions
@@ -49,6 +49,7 @@ function cpu_tests() {
   # Run kernel tests
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
+    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
     pytest -x -v -s tests/kernels/test_onednn.py"
 
   # Run basic model test

@@ -76,7 +77,7 @@ function cpu_tests() {
   # Run AWQ test
   # docker exec cpu-test-"$NUMA_NODE" bash -c "
   #   set -e
-  #   VLLM_USE_V1=0 pytest -x -s -v \
+  #   pytest -x -s -v \
   #   tests/quantization/test_ipex_quant.py"
 
   # Run multi-lora tests

@@ -116,4 +117,4 @@ function cpu_tests() {
 
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"

.buildkite/test-amd.yaml

Lines changed: 1 addition & 0 deletions
@@ -348,6 +348,7 @@ steps:
   - pytest -v -s -m 'not cpu_test' v1/metrics
   - pytest -v -s v1/test_oracle.py
   - pytest -v -s v1/test_request.py
+  - pytest -v -s v1/test_outputs.py
   # Integration test for streaming correctness (requires special branch).
   - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

.buildkite/test-pipeline.yaml

Lines changed: 13 additions & 6 deletions
@@ -25,6 +25,7 @@
 # and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
 # working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
 # source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
+# autorun_on_main (bool): default to false, if true, the test will run automatically when commit is pushed to main branch.
 
 # When adding a test
 # - If the test belongs to an existing group, add it there

@@ -56,7 +57,7 @@ steps:
   - pytest -v -s -m 'not cpu_test' multimodal
   - pytest -v -s utils_
 
-- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins
   timeout_in_minutes: 10
   source_file_dependencies:
   - vllm/

@@ -65,13 +66,15 @@ steps:
   - tests/multimodal
   - tests/standalone_tests/lazy_imports.py
   - tests/transformers_utils
+  - tests/config
   no_gpu: true
   commands:
   - python3 standalone_tests/lazy_imports.py
   - pytest -v -s test_inputs.py
   - pytest -v -s test_outputs.py
   - pytest -v -s -m 'cpu_test' multimodal
   - pytest -v -s transformers_utils
+  - pytest -v -s config
 
 - label: Python-only Installation Test # 10min
   timeout_in_minutes: 20

@@ -329,6 +332,7 @@ steps:
   - pytest -v -s -m 'not cpu_test' v1/metrics
   - pytest -v -s v1/test_oracle.py
   - pytest -v -s v1/test_request.py
+  - pytest -v -s v1/test_outputs.py
   # Integration test for streaming correctness (requires special branch).
   - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

@@ -441,6 +445,7 @@ steps:
   - vllm/
   - tests/compile
   commands:
+  - pytest -v -s compile/test_config.py
   - pytest -v -s compile/test_pass_manager.py
   - pytest -v -s compile/test_fusion.py
   - pytest -v -s compile/test_fusion_attn.py

@@ -450,6 +455,7 @@ steps:
   - pytest -v -s compile/test_decorator.py
   - pytest -v -s compile/test_noop_elimination.py
   - pytest -v -s compile/test_aot_compile.py
+  - pytest -v -s compile/test_qk_norm_rope_fusion.py
 
 - label: PyTorch Fullgraph Smoke Test # 15min
   timeout_in_minutes: 30

@@ -604,6 +610,7 @@ steps:
   source_file_dependencies:
   - csrc/
   - vllm/model_executor/layers/quantization
+  autorun_on_main: true
   commands:
   - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
 

@@ -867,12 +874,12 @@ steps:
   optional: true
   commands:
   - pip install --upgrade git+https://github.com/huggingface/transformers
-  - pytest -v -s tests/models/test_initialization.py
+  - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
   - pytest -v -s tests/models/test_transformers.py
-  - pytest -v -s tests/models/multimodal/processing/
-  - pytest -v -s tests/models/multimodal/test_mapping.py
+  # - pytest -v -s tests/models/multimodal/processing/
+  - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
   - python3 examples/offline_inference/basic/chat.py
-  - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+  # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
   # Whisper needs spawn method to avoid deadlock
   - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
 

@@ -937,7 +944,7 @@ steps:
   # this runner has 2 GPUs available even though num_gpus=2 is not set
   - pytest -v -s tests/compile/test_fusion_all_reduce.py
   # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
-  # Wrap with quotes to escape yaml
+  # Wrap with quotes to escape yaml
   - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
 
 - label: Blackwell Fusion E2E Tests # 30 min

.github/CODEOWNERS

Lines changed: 10 additions & 0 deletions
@@ -61,6 +61,16 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/model_executor/models/transformers @hmellor
 /tests/models/test_transformers.py @hmellor
 
+# Observability
+/vllm/config/observability.py @markmc
+/vllm/v1/metrics @markmc
+/tests/v1/metrics @markmc
+/vllm/tracing.py @markmc
+/tests/v1/tracing/test_tracing.py @markmc
+/vllm/config/kv_events.py @markmc
+/vllm/distributed/kv_events.py @markmc
+/tests/distributed/test_events.py @markmc
+
 # Docs
 /docs/mkdocs @hmellor
 /docs/**/*.yml @hmellor

CMakeLists.txt

Lines changed: 30 additions & 5 deletions
@@ -39,6 +39,13 @@ set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
 # Supported AMD GPU architectures.
 set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
 
+# ROCm installation prefix. Default to /opt/rocm but allow override via
+# -DROCM_PATH=/your/rocm/path when invoking cmake.
+if(NOT DEFINED ROCM_PATH)
+  set(ROCM_PATH "/opt/rocm" CACHE PATH "ROCm installation prefix")
+else()
+  set(ROCM_PATH ${ROCM_PATH} CACHE PATH "ROCm installation prefix" FORCE)
+endif()
 #
 # Supported/expected torch versions for CUDA/ROCm.
 #

@@ -237,10 +244,27 @@ set_gencode_flags_for_srcs(
   SRCS "${VLLM_CUMEM_EXT_SRC}"
   CUDA_ARCHS "${CUDA_ARCHS}")
 
-if(VLLM_GPU_LANG STREQUAL "CUDA")
+if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
   message(STATUS "Enabling cumem allocator extension.")
-  # link against cuda driver library
-  list(APPEND CUMEM_LIBS CUDA::cuda_driver)
+  if(VLLM_GPU_LANG STREQUAL "CUDA")
+    # link against cuda driver library
+    list(APPEND CUMEM_LIBS CUDA::cuda_driver)
+  else()
+    # link against rocm driver library. Prefer an absolute path to
+    # libamdhip64.so inside ${ROCM_PATH}/lib if available, otherwise fall
+    # back to linking by name "amdhip64".
+    find_library(AMDHIP64_LIB
+      NAMES amdhip64 libamdhip64.so
+      PATHS ${ROCM_PATH}/lib
+      NO_DEFAULT_PATH)
+    if(AMDHIP64_LIB)
+      message(STATUS "Found libamdhip64 at ${AMDHIP64_LIB}")
+      list(APPEND CUMEM_LIBS ${AMDHIP64_LIB})
+    else()
+      message(WARNING "libamdhip64 not found in ${ROCM_PATH}/lib; falling back to linking 'amdhip64' by name")
+      list(APPEND CUMEM_LIBS amdhip64)
+    endif()
+  endif()
   define_extension_target(
     cumem_allocator
     DESTINATION vllm

@@ -265,6 +289,7 @@ set(VLLM_EXT_SRC
   "csrc/pos_encoding_kernels.cu"
   "csrc/activation_kernels.cu"
   "csrc/layernorm_kernels.cu"
+  "csrc/fused_qknorm_rope_kernel.cu"
   "csrc/layernorm_quant_kernels.cu"
   "csrc/sampler.cu"
   "csrc/cuda_view.cu"

@@ -330,7 +355,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # Keep building Marlin for 9.0 as there are some group sizes and shapes that
   # are not supported by Machete yet.
   # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
   if (MARLIN_ARCHS)
 
   #

@@ -914,7 +939,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
   # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
   if (MARLIN_MOE_ARCHS)
 
   #

README.md

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
 
 *Latest News* 🔥
 
+- [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
 - [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
 - [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
 - [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).

benchmarks/kernels/bench_block_fp8_gemm.py

Lines changed: 29 additions & 14 deletions
@@ -1,10 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import os
+
+# Disable DeepGEMM for this benchmark to use CUTLASS
+os.environ["VLLM_USE_DEEP_GEMM"] = "0"
+
 import torch
 
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    apply_w8a8_block_fp8_linear,
+    W8A8BlockFp8LinearOp,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    GroupShape,
 )
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     CUTLASS_BLOCK_FP8_SUPPORTED,

@@ -39,13 +47,14 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
     fp8_info = torch.finfo(torch.float8_e4m3fn)
     fp8_max, fp8_min = fp8_info.max, fp8_info.min
 
-    # Create random FP8 tensors
+    # Create random input tensor (bfloat16, will be quantized by W8A8BlockFp8LinearOp)
     A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
 
+    # Create quantized weight tensor
     B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
     B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
 
-    # Create scales
+    # Create weight scales
     block_n, block_k = block_size[0], block_size[1]
     n_tiles = (N + block_n - 1) // block_n
     k_tiles = (K + block_k - 1) // block_k

@@ -55,19 +64,25 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
         * factor_for_scale
     )
 
-    # SM90 CUTLASS requires row-major format for scales
-    if use_cutlass and current_platform.is_device_capability(90):
-        Bs = Bs.T.contiguous()
+    # Create W8A8BlockFp8LinearOp instance
+    weight_group_shape = GroupShape(block_n, block_k)
+    act_quant_group_shape = GroupShape(1, block_k)  # Per-token, per-group quantization
+
+    linear_op = W8A8BlockFp8LinearOp(
+        weight_group_shape=weight_group_shape,
+        act_quant_group_shape=act_quant_group_shape,
+        cutlass_block_fp8_supported=use_cutlass,
+        use_aiter_and_is_supported=False,
+    )
 
     def run():
-        if use_cutlass:
-            return apply_w8a8_block_fp8_linear(
-                A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True
-            )
-        else:
-            return apply_w8a8_block_fp8_linear(
-                A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False
-            )
+        return linear_op.apply(
+            input=A_ref,
+            weight=B,
+            weight_scale=Bs,
+            input_scale=None,
+            bias=None,
+        )
 
     return run
 
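Usage note (not part of the commit): the benchmark above now builds a W8A8BlockFp8LinearOp and calls its apply() method instead of invoking apply_w8a8_block_fp8_linear directly. The sketch below strings that call path together outside the benchmark harness; the constructor and apply() keyword arguments are copied from the diff, while the problem size, device choice, and reuse of CUTLASS_BLOCK_FP8_SUPPORTED are illustrative assumptions.

# Standalone sketch of the W8A8BlockFp8LinearOp path used by the updated benchmark.
import os

os.environ["VLLM_USE_DEEP_GEMM"] = "0"  # as in the benchmark, keep DeepGEMM out of the way

import torch

from vllm.model_executor.layers.quantization.utils.fp8_utils import W8A8BlockFp8LinearOp
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    CUTLASS_BLOCK_FP8_SUPPORTED,
)

M, N, K = 32, 4096, 4096     # assumed problem size
block_n, block_k = 128, 128  # assumed block-FP8 tile size
fp8 = torch.finfo(torch.float8_e4m3fn)

# bfloat16 activations; the op quantizes them per token / per group internally.
A = torch.randn(M, K, dtype=torch.bfloat16, device="cuda")
# Pre-quantized FP8 weight plus one float32 scale per (block_n, block_k) tile.
B = torch.randn(N, K, dtype=torch.bfloat16, device="cuda").clamp(fp8.min, fp8.max)
B = B.to(torch.float8_e4m3fn)
Bs = torch.rand(
    (N + block_n - 1) // block_n,
    (K + block_k - 1) // block_k,
    dtype=torch.float32,
    device="cuda",
)

op = W8A8BlockFp8LinearOp(
    weight_group_shape=GroupShape(block_n, block_k),
    act_quant_group_shape=GroupShape(1, block_k),  # per-token, per-group activation quant
    cutlass_block_fp8_supported=CUTLASS_BLOCK_FP8_SUPPORTED,
    use_aiter_and_is_supported=False,
)
out = op.apply(input=A, weight=B, weight_scale=Bs, input_scale=None, bias=None)
print(out.shape)  # expected (M, N)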
