diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml
index a60a4194e9b3..21ffa1b9b8d7 100644
--- a/.buildkite/ci_config.yaml
+++ b/.buildkite/ci_config.yaml
@@ -8,6 +8,7 @@ run_all_patterns:
- "CMakeLists.txt"
- "requirements/common.txt"
- "requirements/cuda.txt"
+ - "requirements/kv_connectors.txt"
- "requirements/build/cuda.txt"
- "requirements/test/cuda.txt"
- "setup.py"
diff --git a/.buildkite/hardware_tests/cpu.yaml b/.buildkite/hardware_tests/cpu.yaml
index 9b1044443780..6189d5d61e6a 100644
--- a/.buildkite/hardware_tests/cpu.yaml
+++ b/.buildkite/hardware_tests/cpu.yaml
@@ -12,15 +12,19 @@ steps:
- vllm/_custom_ops.py
- tests/kernels/attention/test_cpu_attn.py
- tests/kernels/moe/test_cpu_fused_moe.py
+ - tests/kernels/moe/test_cpu_quant_fused_moe.py
- tests/kernels/test_onednn.py
- tests/kernels/test_awq_int4_to_int8.py
+ - tests/kernels/quantization/test_cpu_fp8_scaled_mm.py
commands:
- |
- bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
+ bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
+ pytest -x -v -s tests/kernels/moe/test_cpu_quant_fused_moe.py
pytest -x -v -s tests/kernels/test_onednn.py
- pytest -x -v -s tests/kernels/test_awq_int4_to_int8.py"
+ pytest -x -v -s tests/kernels/test_awq_int4_to_int8.py
+ pytest -x -v -s tests/kernels/quantization/test_cpu_fp8_scaled_mm.py"
- label: CPU-Compatibility Tests
depends_on: []
@@ -57,23 +61,24 @@ steps:
source_file_dependencies:
- csrc/cpu/
- vllm/model_executor/layers/quantization/cpu_wna16.py
- - vllm/model_executor/layers/quantization/gptq_marlin.py
+ - vllm/model_executor/layers/quantization/auto_gptq.py
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
- vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
- vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
+ - vllm/model_executor/layers/fused_moe/experts/cpu_moe.py
- tests/quantization/test_compressed_tensors.py
- tests/quantization/test_cpu_wna16.py
commands:
- |
- bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
+ bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
pytest -x -v -s tests/quantization/test_cpu_wna16.py"
-- label: CPU-Distributed Tests
+- label: CPU-Distributed Tests (PP+TP)
depends_on: []
device: intel_cpu
no_plugin: true
- source_file_dependencies:
+ source_file_dependencies: &cpu_distributed_deps
- csrc/cpu/shm.cpp
- vllm/v1/worker/cpu_worker.py
- vllm/v1/worker/gpu_worker.py
@@ -82,10 +87,21 @@ steps:
- vllm/platforms/cpu.py
- vllm/distributed/parallel_state.py
- vllm/distributed/device_communicators/cpu_communicator.py
+ - .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
+ commands:
+ - |
+ bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
+ bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh tp_pp"
+
+- label: CPU-Distributed Tests (DP+TP)
+ depends_on: []
+ device: intel_cpu
+ no_plugin: true
+ source_file_dependencies: *cpu_distributed_deps
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
- bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"
+ bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh dp_tp"
- label: CPU-Multi-Modal Model Tests %N
depends_on: []
diff --git a/.buildkite/hardware_tests/intel.yaml b/.buildkite/hardware_tests/intel.yaml
index ba0088b3af62..d70ce28428d4 100644
--- a/.buildkite/hardware_tests/intel.yaml
+++ b/.buildkite/hardware_tests/intel.yaml
@@ -8,10 +8,3 @@ steps:
commands:
- bash .buildkite/scripts/hardware_ci/run-hpu-test.sh
- - label: "Intel GPU Test"
- depends_on: []
- soft_fail: true
- device: intel_gpu
- no_plugin: true
- commands:
- - bash .buildkite/scripts/hardware_ci/run-xpu-test.sh
diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh
index 00ae34bba6d7..10c03c3e1773 100755
--- a/.buildkite/image_build/image_build.sh
+++ b/.buildkite/image_build/image_build.sh
@@ -192,6 +192,7 @@ export BUILDKITE_COMMIT
export PARENT_COMMIT
export IMAGE_TAG
export IMAGE_TAG_LATEST
+export COMMIT="${COMMIT:-${BUILDKITE_COMMIT}}"
export CACHE_FROM
export CACHE_FROM_BASE_BRANCH
export CACHE_FROM_MAIN
diff --git a/.buildkite/image_build/image_build_torch_nightly.sh b/.buildkite/image_build/image_build_torch_nightly.sh
index a23c658d46b9..cbd08aa7bd0b 100755
--- a/.buildkite/image_build/image_build_torch_nightly.sh
+++ b/.buildkite/image_build/image_build_torch_nightly.sh
@@ -46,7 +46,7 @@ echo "Image not found, proceeding with build..."
# --- CUDA 13.0 for nightly builds ---
# Nightly CI uses CUDA 13.0 while regular CI stays on CUDA 12.9
-NIGHTLY_CUDA_VERSION="13.0.0"
+NIGHTLY_CUDA_VERSION="13.0.2"
NIGHTLY_BUILD_BASE_IMAGE="nvidia/cuda:${NIGHTLY_CUDA_VERSION}-devel-ubuntu22.04"
NIGHTLY_FINAL_BASE_IMAGE="nvidia/cuda:${NIGHTLY_CUDA_VERSION}-base-ubuntu22.04"
diff --git a/.buildkite/intel_jobs/engine_intel.yaml b/.buildkite/intel_jobs/engine_intel.yaml
new file mode 100644
index 000000000000..c66576d40991
--- /dev/null
+++ b/.buildkite/intel_jobs/engine_intel.yaml
@@ -0,0 +1,21 @@
+group: Engine Intel
+depends_on:
+ - image-build-xpu
+steps:
+- label: Engine (1 GPU)
+ timeout_in_minutes: 30
+ device: intel_gpu
+ no_plugin: true
+ working_dir: "."
+ env:
+ REGISTRY: "public.ecr.aws/q9t5s3a7"
+ REPO: "vllm-ci-test-repo"
+ VLLM_TEST_DEVICE: "xpu"
+ source_file_dependencies:
+ - vllm/v1/engine/
+ - tests/v1/engine/
+ commands:
+ - >-
+ bash .buildkite/scripts/hardware_ci/run-intel-test.sh
+ 'cd tests &&
+ pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py'
diff --git a/.buildkite/intel_jobs/kernels_intel.yaml b/.buildkite/intel_jobs/kernels_intel.yaml
new file mode 100644
index 000000000000..66a8db25f02e
--- /dev/null
+++ b/.buildkite/intel_jobs/kernels_intel.yaml
@@ -0,0 +1,21 @@
+group: Kernels Intel
+depends_on:
+ - image-build-xpu
+steps:
+- label: vLLM IR Tests
+ timeout_in_minutes: 30
+ device: intel_gpu
+ no_plugin: true
+ working_dir: "."
+ env:
+ REGISTRY: "public.ecr.aws/q9t5s3a7"
+ REPO: "vllm-ci-test-repo"
+ VLLM_TEST_DEVICE: "xpu"
+ source_file_dependencies:
+ - vllm/ir
+ - vllm/kernels
+ commands:
+ - >-
+ bash .buildkite/scripts/hardware_ci/run-intel-test.sh
+ 'cd tests &&
+ pytest -v -s kernels/ir'
diff --git a/.buildkite/intel_jobs/lora_intel.yaml b/.buildkite/intel_jobs/lora_intel.yaml
new file mode 100644
index 000000000000..32a56ef59b3f
--- /dev/null
+++ b/.buildkite/intel_jobs/lora_intel.yaml
@@ -0,0 +1,135 @@
+group: LoRA Intel
+depends_on:
+ - image-build-xpu
+steps:
+- label: LoRA Runtime + Utils
+ timeout_in_minutes: 45
+ device: intel_gpu
+ no_plugin: true
+ working_dir: "."
+ env:
+ REGISTRY: "public.ecr.aws/q9t5s3a7"
+ REPO: "vllm-ci-test-repo"
+ VLLM_TEST_DEVICE: "xpu"
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+ - >-
+ bash .buildkite/scripts/hardware_ci/run-intel-test.sh
+ 'cd tests &&
+ export VLLM_WORKER_MULTIPROC_METHOD=spawn &&
+ pytest -v -s lora/test_layers.py &&
+ pytest -v -s lora/test_lora_checkpoints.py &&
+ pytest -v -s lora/test_lora_functions.py &&
+ pytest -v -s lora/test_lora_huggingface.py &&
+ pytest -v -s lora/test_lora_manager.py &&
+ pytest -v -s lora/test_lora_utils.py &&
+ pytest -v -s lora/test_peft_helper.py &&
+ pytest -v -s lora/test_resolver.py &&
+ pytest -v -s lora/test_utils.py &&
+ pytest -v -s lora/test_add_lora.py &&
+ pytest -v -s lora/test_worker.py'
+
+- label: LoRA Fused/MoE Kernels
+ timeout_in_minutes: 45
+ device: intel_gpu
+ no_plugin: true
+ working_dir: "."
+ env:
+ REGISTRY: "public.ecr.aws/q9t5s3a7"
+ REPO: "vllm-ci-test-repo"
+ VLLM_TEST_DEVICE: "xpu"
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+ - >-
+ bash .buildkite/scripts/hardware_ci/run-intel-test.sh
+ 'cd tests &&
+ export VLLM_WORKER_MULTIPROC_METHOD=spawn &&
+ pytest -v -s lora/test_fused_moe_lora_kernel.py &&
+ pytest -v -s lora/test_moe_lora_align_sum.py --deselect="tests/lora/test_moe_lora_align_sum.py::test_moe_lora_align_block_size_mixed_base_and_lora[1]"'
+
+- label: LoRA Punica Kernels
+ timeout_in_minutes: 45
+ device: intel_gpu
+ no_plugin: true
+ working_dir: "."
+ env:
+ REGISTRY: "public.ecr.aws/q9t5s3a7"
+ REPO: "vllm-ci-test-repo"
+ VLLM_TEST_DEVICE: "xpu"
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+ - >-
+ bash .buildkite/scripts/hardware_ci/run-intel-test.sh
+ 'cd tests &&
+ export VLLM_WORKER_MULTIPROC_METHOD=spawn &&
+ set -o pipefail &&
+ pytest -v -s lora/test_punica_ops.py --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[expand-0-xpu:0-dtype0-3-43264-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype1-1-2049-64-128-16]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-128-1-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-256-1-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-256-8-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[expand-0-xpu:0-dtype0-3-2049-128-8-16]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-128-8-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels[expand-0-xpu:0-dtype1-1-2049-256-128-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype0-3-64256-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype1-2-29696-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype1-3-49408-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype0-2-16384-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[expand-0-xpu:0-dtype0-2-51328-32-4-4]"'
+
+- label: LoRA Punica FP8/XPU Ops
+ timeout_in_minutes: 45
+ device: intel_gpu
+ no_plugin: true
+ working_dir: "."
+ env:
+ REGISTRY: "public.ecr.aws/q9t5s3a7"
+ REPO: "vllm-ci-test-repo"
+ VLLM_TEST_DEVICE: "xpu"
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+ - >-
+ bash .buildkite/scripts/hardware_ci/run-intel-test.sh
+ 'cd tests &&
+ export VLLM_WORKER_MULTIPROC_METHOD=spawn &&
+ pytest -v -s lora/test_punica_ops_fp8.py &&
+ pytest -v -s lora/test_punica_xpu_ops.py'
+
+- label: LoRA Models
+ timeout_in_minutes: 45
+ device: intel_gpu
+ no_plugin: true
+ working_dir: "."
+ env:
+ REGISTRY: "public.ecr.aws/q9t5s3a7"
+ REPO: "vllm-ci-test-repo"
+ VLLM_TEST_DEVICE: "xpu"
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+ - >-
+ bash .buildkite/scripts/hardware_ci/run-intel-test.sh
+ 'cd tests &&
+ export VLLM_WORKER_MULTIPROC_METHOD=spawn &&
+ (pytest -v -s lora/test_mixtral.py --deselect="tests/lora/test_mixtral.py::test_mixtral_lora[4]" || true) &&
+ pytest -v -s lora/test_quant_model.py --deselect="tests/lora/test_quant_model.py::test_quant_model_lora[model0]" --deselect="tests/lora/test_quant_model.py::test_quant_model_lora[model1]" --deselect="tests/lora/test_quant_model.py::test_quant_model_tp_equality[model0]" &&
+ pytest -v -s lora/test_transformers_model.py &&
+ pytest -v -s lora/test_chatglm3_tp.py &&
+ pytest -s -v lora/test_minicpmv_tp.py'
+
+- label: LoRA Multimodal
+ timeout_in_minutes: 45
+ device: intel_gpu
+ no_plugin: true
+ working_dir: "."
+ env:
+ REGISTRY: "public.ecr.aws/q9t5s3a7"
+ REPO: "vllm-ci-test-repo"
+ VLLM_TEST_DEVICE: "xpu"
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+ - >-
+ bash .buildkite/scripts/hardware_ci/run-intel-test.sh
+ 'cd tests &&
+ export VLLM_WORKER_MULTIPROC_METHOD=spawn &&
+ pytest -v -s lora/test_default_mm_loras.py &&
+ pytest -v -s lora/test_whisper.py'
diff --git a/.buildkite/intel_jobs/misc_intel.yaml b/.buildkite/intel_jobs/misc_intel.yaml
new file mode 100644
index 000000000000..864128bb5338
--- /dev/null
+++ b/.buildkite/intel_jobs/misc_intel.yaml
@@ -0,0 +1,55 @@
+group: Miscellaneous Intel
+depends_on:
+ - image-build-xpu
+steps:
+- label: V1 Core + KV + Metrics
+ timeout_in_minutes: 30
+ device: intel_gpu
+ no_plugin: true
+ working_dir: "."
+ env:
+ REGISTRY: "public.ecr.aws/q9t5s3a7"
+ REPO: "vllm-ci-test-repo"
+ VLLM_TEST_DEVICE: "xpu"
+ source_file_dependencies:
+ - vllm/
+ - tests/v1/core
+ - tests/v1/executor
+ - tests/v1/kv_offload
+ - tests/v1/worker
+ - tests/v1/kv_connector/unit
+ - tests/v1/metrics
+ - tests/entrypoints/openai/correctness/test_lmeval.py
+ commands:
+ - >-
+ bash .buildkite/scripts/hardware_ci/run-intel-test.sh
+ 'pip install -r requirements/kv_connectors.txt &&
+ export VLLM_WORKER_MULTIPROC_METHOD=spawn &&
+ cd tests &&
+ pytest -v -s v1/executor'
+
+- label: V1 Sample + Logits
+ timeout_in_minutes: 30
+ device: intel_gpu
+ no_plugin: true
+ working_dir: "."
+ env:
+ REGISTRY: "public.ecr.aws/q9t5s3a7"
+ REPO: "vllm-ci-test-repo"
+ VLLM_TEST_DEVICE: "xpu"
+ source_file_dependencies:
+ - vllm/
+ - tests/v1/sample
+ - tests/v1/logits_processors
+ - tests/v1/test_oracle.py
+ - tests/v1/test_request.py
+ - tests/v1/test_outputs.py
+ commands:
+ - >-
+ bash .buildkite/scripts/hardware_ci/run-intel-test.sh
+ 'export VLLM_WORKER_MULTIPROC_METHOD=spawn &&
+ cd tests &&
+ pytest -v -s v1/logits_processors --ignore=v1/logits_processors/test_custom_online.py --ignore=v1/logits_processors/test_custom_offline.py &&
+ pytest -v -s v1/test_oracle.py &&
+ pytest -v -s v1/test_request.py &&
+ pytest -v -s v1/test_outputs.py'
diff --git a/.buildkite/intel_jobs/test-intel.yaml b/.buildkite/intel_jobs/test-intel.yaml
index c59be699502f..c14a6f0f4f81 100644
--- a/.buildkite/intel_jobs/test-intel.yaml
+++ b/.buildkite/intel_jobs/test-intel.yaml
@@ -36,9 +36,12 @@ steps:
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN &&
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8 &&
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --kv-cache-dtype fp8 &&
+ python3 examples/basic/offline_inference/generate.py --model nvidia/Llama-3.1-8B-Instruct-FP8 --block-size 64 --enforce-eager --quantization modelopt --kv-cache-dtype fp8 --attention-backend TRITON_ATTN --max-model-len 4096 &&
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager --max-model-len 8192 &&
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 &&
- python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel'
+ python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel &&
+ python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --max-model-len 8192
+ '
- label: "XPU V1 test"
depends_on:
- image-build-xpu
@@ -61,5 +64,5 @@ steps:
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py --ignore=v1/worker/test_worker_memory_snapshot.py &&
pytest -v -s v1/structured_output &&
pytest -v -s v1/test_serial_utils.py &&
- pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py &&
- pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py --ignore=v1/kv_connector/unit/test_hf3fs_client.py --ignore=v1/kv_connector/unit/test_hf3fs_connector.py --ignore=v1/kv_connector/unit/test_hf3fs_metadata_server.py'
+ pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py &&
+ pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py --ignore=v1/kv_connector/unit/test_hf3fs_client.py --ignore=v1/kv_connector/unit/test_hf3fs_connector.py --ignore=v1/kv_connector/unit/test_hf3fs_metadata_server.py --ignore=v1/kv_connector/unit/test_offloading_connector.py'
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
index 518af9a66018..b495c0d123a6 100755
--- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
-# pip install "lm-eval[api]>=0.4.11"
+# pip install "lm-eval[api]>=0.4.12"
usage() {
echo``
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
index f010ffe6752d..e430e6183b2d 100755
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
-# pip install "lm-eval[api]>=0.4.11"
+# pip install "lm-eval[api]>=0.4.12"
usage() {
echo``
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
index fec4a94e63e4..f1a541ddbefc 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
-# pip install "lm-eval[api]>=0.4.11"
+# pip install "lm-eval[api]>=0.4.12"
usage() {
echo``
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
index e3c6e16bd6b3..ba8da9fc3f55 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
-# pip install "lm-eval[api]>=0.4.11"
+# pip install "lm-eval[api]>=0.4.12"
usage() {
echo``
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
index 6c4591f05b3b..34c2cc82d395 100644
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
@@ -31,30 +31,9 @@
}
},
"tests": [
- {
- "test_name": "serving_llama8B_tp1_sharegpt",
- "server_parameters": {
- "tensor_parallel_size": 1
- },
- "client_parameters": {
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
- }
- },
- {
- "test_name": "serving_llama8B_tp2_sharegpt",
- "server_parameters": {
- "tensor_parallel_size": 2
- },
- "client_parameters": {
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
- }
- },
{
"test_name": "serving_llama8B_tp1_random_128_128",
"server_parameters": {
- "tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
@@ -63,290 +42,244 @@
}
},
{
- "test_name": "serving_llama8B_tp2_random_128_128",
+ "test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": {
- "tensor_parallel_size": 2
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
},
"client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_llama8B_tp4_random_128_128",
+ "test_name": "serving_llama8B_int8_tp1_random_128_128",
"server_parameters": {
- "tensor_parallel_size": 4
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
},
"client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_llama8B_tp1_random_128_2048",
- "server_parameters": {
- "tensor_parallel_size": 1
- },
- "client_parameters": {
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 2048
- }
- },
- {
- "test_name": "serving_llama8B_tp2_random_128_2048",
+ "test_name": "serving_llama1B_tp1_random_128_128",
"server_parameters": {
- "tensor_parallel_size": 2
+ "model": "meta-llama/Llama-3.2-1B"
},
"client_parameters": {
+ "model": "meta-llama/Llama-3.2-1B",
"dataset_name": "random",
"random-input-len": 128,
- "random-output-len": 2048
+ "random-output-len": 128
}
},
{
- "test_name": "serving_llama8B_tp4_random_128_2048",
+ "test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
- "tensor_parallel_size": 4
+ "model": "meta-llama/Llama-3.2-3B-Instruct"
},
"client_parameters": {
+ "model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
- "random-output-len": 2048
- }
- },
- {
- "test_name": "serving_llama8B_tp1_random_2048_128",
- "server_parameters": {
- "tensor_parallel_size": 1
- },
- "client_parameters": {
- "dataset_name": "random",
- "random-input-len": 2048,
"random-output-len": 128
}
},
{
- "test_name": "serving_llama8B_tp2_random_2048_128",
+ "test_name": "serving_llama70B_tp1_random_128_128",
"server_parameters": {
- "tensor_parallel_size": 2
+ "model": "meta-llama/Llama-3.3-70B-Instruct"
},
"client_parameters": {
+ "model": "meta-llama/Llama-3.3-70B-Instruct",
"dataset_name": "random",
- "random-input-len": 2048,
+ "random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_llama8B_tp4_random_2048_128",
+ "test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
- "tensor_parallel_size": 4
+ "model": "ibm-granite/granite-3.2-2b-instruct"
},
"client_parameters": {
+ "model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
- "random-input-len": 2048,
+ "random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_llama8B_tp1_random_2048_2048",
- "server_parameters": {
- "tensor_parallel_size": 1
- },
- "client_parameters": {
- "dataset_name": "random",
- "random-input-len": 2048,
- "random-output-len": 2048
- }
- },
- {
- "test_name": "serving_llama8B_tp2_random_2048_2048",
- "server_parameters": {
- "tensor_parallel_size": 2
- },
- "client_parameters": {
- "dataset_name": "random",
- "random-input-len": 2048,
- "random-output-len": 2048
- }
- },
- {
- "test_name": "serving_llama8B_tp4_random_2048_2048",
- "server_parameters": {
- "tensor_parallel_size": 4
- },
- "client_parameters": {
- "dataset_name": "random",
- "random-input-len": 2048,
- "random-output-len": 2048
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp1_random_128_128",
+ "test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "tensor_parallel_size": 1
+ "model": "Qwen/Qwen3-1.7B"
},
"client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_llama8B_int4_tp2_random_128_128",
+ "test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "tensor_parallel_size": 2
+ "model": "Qwen/Qwen3-4B"
},
"client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_llama8B_int4_tp4_random_128_128",
+ "test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "tensor_parallel_size": 4
+ "model": "Qwen/Qwen3-8B"
},
"client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_llama8B_int8_tp1_random_128_128",
+ "test_name": "serving_qwen14B_tp1_random_128_128",
"server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 1
+ "model": "Qwen/Qwen3-14B"
},
"client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "model": "Qwen/Qwen3-14B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_llama8B_int8_tp2_random_128_128",
+ "test_name": "serving_qwen30B_tp1_random_128_128",
"server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2
+ "model": "Qwen/Qwen3-30B-A3B"
},
"client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "model": "Qwen/Qwen3-30B-A3B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_llama8B_int8_tp4_random_128_128",
+ "test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 4
+ "model": "zai-org/glm-4-9b-hf"
},
"client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_llama3B_tp1_random_128_128",
+ "test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
- "model": "meta-llama/Llama-3.2-3B-Instruct",
- "tensor_parallel_size": 1
+ "model": "google/gemma-7b"
},
"client_parameters": {
- "model": "meta-llama/Llama-3.2-3B-Instruct",
+ "model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_granite2B_tp1_random_128_128",
+ "test_name": "serving_gemma3-4b_tp1_random_128_128",
+ "server_environment_variables": {
+ "VLLM_CPU_SGL_KERNEL": 0
+ },
"server_parameters": {
- "model": "ibm-granite/granite-3.2-2b-instruct",
- "tensor_parallel_size": 1
+ "model": "google/gemma-3-4b-it"
},
"client_parameters": {
- "model": "ibm-granite/granite-3.2-2b-instruct",
+ "model": "google/gemma-3-4b-it",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_qwen1.7B_tp1_random_128_128",
+ "test_name": "serving_gemma3-12b_tp1_random_128_128",
+ "server_environment_variables": {
+ "VLLM_CPU_SGL_KERNEL": 0
+ },
"server_parameters": {
- "model": "Qwen/Qwen3-1.7B",
- "tensor_parallel_size": 1
+ "model": "google/gemma-3-12b-it"
},
"client_parameters": {
- "model": "Qwen/Qwen3-1.7B",
+ "model": "google/gemma-3-12b-it",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_qwen4B_tp1_random_128_128",
+ "test_name": "serving_gemma4-4b_tp1_random_128_128",
+ "server_environment_variables": {
+ "VLLM_CPU_SGL_KERNEL": 0
+ },
"server_parameters": {
- "model": "Qwen/Qwen3-4B",
- "tensor_parallel_size": 1
+ "model": "google/gemma-4-E4B-it"
},
"client_parameters": {
- "model": "Qwen/Qwen3-4B",
+ "model": "google/gemma-4-E4B-it",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_qwen8B_tp1_random_128_128",
+ "test_name": "serving_gemma4-2b_tp1_random_128_128",
+ "server_environment_variables": {
+ "VLLM_CPU_SGL_KERNEL": 0
+ },
"server_parameters": {
- "model": "Qwen/Qwen3-8B",
- "tensor_parallel_size": 1
+ "model": "google/gemma-4-E2B-it"
},
"client_parameters": {
- "model": "Qwen/Qwen3-8B",
+ "model": "google/gemma-4-E2B-it",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_glm9B_tp1_random_128_128",
+ "test_name": "serving_gemma4-26b_tp1_random_128_128",
+ "server_environment_variables": {
+ "VLLM_CPU_SGL_KERNEL": 0,
+ "VLLM_CPU_ATTN_SPLIT_KV": 0
+ },
"server_parameters": {
- "model": "zai-org/glm-4-9b-hf",
- "tensor_parallel_size": 1
+ "model": "google/gemma-4-26B-A4B-it"
},
"client_parameters": {
- "model": "zai-org/glm-4-9b-hf",
+ "model": "google/gemma-4-26B-A4B-it",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_gemma7B_tp1_random_128_128",
+ "test_name": "serving_phi4_tp1_random_128_128",
"server_parameters": {
- "model": "google/gemma-7b",
- "tensor_parallel_size": 1
+ "model": "microsoft/Phi-4-reasoning"
},
"client_parameters": {
- "model": "google/gemma-7b",
+ "model": "microsoft/Phi-4-reasoning",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index b3a6bb8ed4cf..df9b80f7f9a8 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,3 +1,16 @@
+# CUDA architecture lists — following PyTorch RELEASE.md
+# (https://github.com/pytorch/pytorch/blob/main/RELEASE.md)
+# SM86 included for broader Ampere coverage; SM89 for marlin fp8 support
+env:
+ CUDA_ARCH_X86: "7.5 8.0 8.6 8.9 9.0 10.0 12.0+PTX"
+ # aarch64 only architectures: 8.7 for Orin, 11.0 for Thor (since CUDA 13)
+ CUDA_ARCH_AARCH64: "8.0 8.7 8.9 9.0 10.0 11.0 12.0+PTX"
+ CUDA_ARCH_X86_CU129: "7.5 8.0 8.6 8.9 9.0 10.0 12.0"
+ CUDA_ARCH_AARCH64_CU129: "8.0 8.7 8.9 9.0 10.0 12.0"
+ MOONCAKE_WHEEL_AARCH64_2_35: "https://vllm-wheels.s3.amazonaws.com/mooncake/mooncake_transfer_engine-0.3.10.post2-0da9dfea3-cp312-cp312-manylinux_2_35_aarch64.whl"
+ MOONCAKE_WHEEL_AARCH64_2_39: "https://vllm-wheels.s3.amazonaws.com/mooncake/mooncake_transfer_engine-0.3.10.post2-0da9dfea3-cp312-cp312-manylinux_2_39_aarch64.whl"
+ MOONCAKE_WHEEL_X86_64: "https://vllm-wheels.s3.amazonaws.com/mooncake/mooncake_transfer_engine-0.3.10.post2-0da9dfea3-cp312-cp312-manylinux_2_35_x86_64.whl"
+
steps:
- input: "Provide Release version here"
id: input-release-version
@@ -14,12 +27,11 @@ steps:
agents:
queue: arm64_cpu_queue_release
commands:
- # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
- # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64_CU129}\" --build-arg BUILD_OS=manylinux --build-arg BUILD_BASE_IMAGE=pytorch/manylinuxaarch64-builder:cuda12.9 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "s3://vllm-wheels/$$BUILDKITE_COMMIT/$(cd artifacts/dist && echo *.whl)"'
env:
DOCKER_BUILDKIT: "1"
@@ -29,12 +41,11 @@ steps:
agents:
queue: arm64_cpu_queue_release
commands:
- # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
- # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64}\" --build-arg BUILD_OS=manylinux --build-arg BUILD_BASE_IMAGE=pytorch/manylinuxaarch64-builder:cuda13.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
+ - "bash .buildkite/scripts/upload-nightly-wheels.sh"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "s3://vllm-wheels/$$BUILDKITE_COMMIT/$(cd artifacts/dist && echo *.whl)"'
env:
DOCKER_BUILDKIT: "1"
@@ -47,7 +58,8 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
+ - "bash .buildkite/scripts/upload-nightly-wheels.sh"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "s3://vllm-wheels/$$BUILDKITE_COMMIT/$(cd artifacts/dist && echo *.whl)"'
env:
DOCKER_BUILDKIT: "1"
@@ -57,10 +69,11 @@ steps:
agents:
queue: cpu_queue_release
commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86_CU129}\" --build-arg BUILD_OS=manylinux --build-arg BUILD_BASE_IMAGE=pytorch/manylinux2_28-builder:cuda12.9 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
+ - "bash .buildkite/scripts/upload-nightly-wheels.sh"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "s3://vllm-wheels/$$BUILDKITE_COMMIT/$(cd artifacts/dist && echo *.whl)"'
env:
DOCKER_BUILDKIT: "1"
@@ -70,10 +83,11 @@ steps:
agents:
queue: cpu_queue_release
commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86}\" --build-arg BUILD_OS=manylinux --build-arg BUILD_BASE_IMAGE=pytorch/manylinux2_28-builder:cuda13.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
+ - "bash .buildkite/scripts/upload-nightly-wheels.sh"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "s3://vllm-wheels/$$BUILDKITE_COMMIT/$(cd artifacts/dist && echo *.whl)"'
env:
DOCKER_BUILDKIT: "1"
@@ -86,7 +100,8 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
+ - "bash .buildkite/scripts/upload-nightly-wheels.sh"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "s3://vllm-wheels/$$BUILDKITE_COMMIT/$(cd artifacts/dist && echo *.whl)"'
env:
DOCKER_BUILDKIT: "1"
@@ -108,102 +123,226 @@ steps:
depends_on: block-build-release-images
allow_dependency_failure: true
steps:
- - label: "Build release image - x86_64 - CUDA 12.9"
+ - label: "Build release image - x86_64 - CUDA 13.0"
depends_on: ~
id: build-release-image-x86
agents:
queue: cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - |
+ DOCKER_BUILDKIT=1 docker build \
+ $(bash .buildkite/scripts/docker-build-metadata-args.sh) \
+ --build-arg max_jobs=16 \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg GIT_REPO_CHECK=1 \
+ --build-arg CUDA_VERSION=13.0.2 \
+ --build-arg torch_cuda_arch_list="${CUDA_ARCH_X86}" \
+ --build-arg INSTALL_KV_CONNECTORS=true \
+ --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_35}" \
+ --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \
+ --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 \
+ --target vllm-openai \
+ --progress plain \
+ -f docker/Dockerfile .
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# re-tag to default image tag and push, just in case arm64 build fails
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"'
- - label: "Build release image - aarch64 - CUDA 12.9"
+ - label: "Build release image - aarch64 - CUDA 13.0"
depends_on: ~
id: build-release-image-arm64
agents:
queue: arm64_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - |
+ DOCKER_BUILDKIT=1 docker build \
+ $(bash .buildkite/scripts/docker-build-metadata-args.sh) \
+ --build-arg max_jobs=16 \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg GIT_REPO_CHECK=1 \
+ --build-arg CUDA_VERSION=13.0.2 \
+ --build-arg torch_cuda_arch_list="${CUDA_ARCH_AARCH64}" \
+ --build-arg INSTALL_KV_CONNECTORS=true \
+ --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_35}" \
+ --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \
+ --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 \
+ --target vllm-openai \
+ --progress plain \
+ -f docker/Dockerfile .
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"'
- - label: "Build release image - x86_64 - CUDA 13.0"
+ - label: "Build release image - x86_64 - CUDA 12.9"
depends_on: ~
- id: build-release-image-x86-cuda-13-0
+ id: build-release-image-x86-cuda-12-9
agents:
queue: cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
- - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
+ - |
+ DOCKER_BUILDKIT=1 docker build \
+ $(bash .buildkite/scripts/docker-build-metadata-args.sh cu129) \
+ --build-arg max_jobs=16 \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg GIT_REPO_CHECK=1 \
+ --build-arg CUDA_VERSION=12.9.1 \
+ --build-arg torch_cuda_arch_list="${CUDA_ARCH_X86_CU129}" \
+ --build-arg INSTALL_KV_CONNECTORS=true \
+ --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_35}" \
+ --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \
+ --target vllm-openai \
+ --progress plain \
+ -f docker/Dockerfile .
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129"
# re-tag to default image tag and push, just in case arm64 build fails
- - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
+ - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129"
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129"'
- - label: "Build release image - aarch64 - CUDA 13.0"
+ - label: "Build release image - aarch64 - CUDA 12.9"
depends_on: ~
- id: build-release-image-arm64-cuda-13-0
+ id: build-release-image-arm64-cuda-12-9
agents:
queue: arm64_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- # compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
- - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
+ - |
+ DOCKER_BUILDKIT=1 docker build \
+ $(bash .buildkite/scripts/docker-build-metadata-args.sh cu129) \
+ --build-arg max_jobs=16 \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg GIT_REPO_CHECK=1 \
+ --build-arg CUDA_VERSION=12.9.1 \
+ --build-arg torch_cuda_arch_list="${CUDA_ARCH_AARCH64_CU129}" \
+ --build-arg INSTALL_KV_CONNECTORS=true \
+ --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_35}" \
+ --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \
+ --target vllm-openai \
+ --progress plain \
+ -f docker/Dockerfile .
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129"'
- - label: "Build release image - x86_64 - CUDA 12.9 - Ubuntu 24.04"
+ - label: "Build release image - x86_64 - CUDA 13.0 - Ubuntu 24.04"
depends_on: ~
id: build-release-image-x86-ubuntu2404
agents:
queue: cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - |
+ DOCKER_BUILDKIT=1 docker build \
+ $(bash .buildkite/scripts/docker-build-metadata-args.sh ubuntu2404) \
+ --build-arg max_jobs=16 \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg GIT_REPO_CHECK=1 \
+ --build-arg CUDA_VERSION=13.0.2 \
+ --build-arg UBUNTU_VERSION=24.04 \
+ --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \
+ --build-arg torch_cuda_arch_list="${CUDA_ARCH_X86}" \
+ --build-arg INSTALL_KV_CONNECTORS=true \
+ --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_39}" \
+ --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \
+ --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu24.04 \
+ --target vllm-openai \
+ --progress plain \
+ -f docker/Dockerfile .
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404"'
- - label: "Build release image - aarch64 - CUDA 12.9 - Ubuntu 24.04"
+ - label: "Build release image - aarch64 - CUDA 13.0 - Ubuntu 24.04"
depends_on: ~
id: build-release-image-arm64-ubuntu2404
agents:
queue: arm64_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - |
+ DOCKER_BUILDKIT=1 docker build \
+ $(bash .buildkite/scripts/docker-build-metadata-args.sh ubuntu2404) \
+ --build-arg max_jobs=16 \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg GIT_REPO_CHECK=1 \
+ --build-arg CUDA_VERSION=13.0.2 \
+ --build-arg UBUNTU_VERSION=24.04 \
+ --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \
+ --build-arg torch_cuda_arch_list="${CUDA_ARCH_AARCH64}" \
+ --build-arg INSTALL_KV_CONNECTORS=true \
+ --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_39}" \
+ --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \
+ --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu24.04 \
+ --target vllm-openai \
+ --progress plain \
+ -f docker/Dockerfile .
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404"'
- - label: "Build release image - x86_64 - CUDA 13.0 - Ubuntu 24.04"
+ - label: "Build release image - x86_64 - CUDA 12.9 - Ubuntu 24.04"
depends_on: ~
- id: build-release-image-x86-cuda-13-0-ubuntu2404
+ id: build-release-image-x86-cuda-12-9-ubuntu2404
agents:
queue: cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu24.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
- - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404"
- - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404"
- - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404"
+ - |
+ DOCKER_BUILDKIT=1 docker build \
+ $(bash .buildkite/scripts/docker-build-metadata-args.sh cu129-ubuntu2404) \
+ --build-arg max_jobs=16 \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg GIT_REPO_CHECK=1 \
+ --build-arg CUDA_VERSION=12.9.1 \
+ --build-arg UBUNTU_VERSION=24.04 \
+ --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \
+ --build-arg torch_cuda_arch_list="${CUDA_ARCH_X86_CU129}" \
+ --build-arg INSTALL_KV_CONNECTORS=true \
+ --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_39}" \
+ --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \
+ --target vllm-openai \
+ --progress plain \
+ -f docker/Dockerfile .
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404"
+ - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129-ubuntu2404"
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129-ubuntu2404"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404"'
- - label: "Build release image - aarch64 - CUDA 13.0 - Ubuntu 24.04"
+ - label: "Build release image - aarch64 - CUDA 12.9 - Ubuntu 24.04"
depends_on: ~
- id: build-release-image-arm64-cuda-13-0-ubuntu2404
+ id: build-release-image-arm64-cuda-12-9-ubuntu2404
agents:
queue: arm64_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu24.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
- - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404"
+ - |
+ DOCKER_BUILDKIT=1 docker build \
+ $(bash .buildkite/scripts/docker-build-metadata-args.sh cu129-ubuntu2404) \
+ --build-arg max_jobs=16 \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg GIT_REPO_CHECK=1 \
+ --build-arg CUDA_VERSION=12.9.1 \
+ --build-arg UBUNTU_VERSION=24.04 \
+ --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \
+ --build-arg torch_cuda_arch_list="${CUDA_ARCH_AARCH64_CU129}" \
+ --build-arg INSTALL_KV_CONNECTORS=true \
+ --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_39}" \
+ --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \
+ --target vllm-openai \
+ --progress plain \
+ -f docker/Dockerfile .
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404"'
- block: "Build release image for x86_64 CPU"
key: block-cpu-release-image-build
depends_on: ~
- label: "Build release image - x86_64 - CPU"
+ key: build-cpu-release-image-x86
depends_on:
- block-cpu-release-image-build
- input-release-version
@@ -214,6 +353,7 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"'
env:
DOCKER_BUILDKIT: "1"
@@ -222,7 +362,8 @@ steps:
depends_on: ~
- label: "Build release image - arm64 - CPU"
- depends_on:
+ key: build-cpu-release-image-arm64
+ depends_on:
- block-arm64-cpu-release-image-build
- input-release-version
agents:
@@ -232,13 +373,14 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"'
env:
DOCKER_BUILDKIT: "1"
- group: "Publish release images"
key: "publish-release-images"
steps:
- - label: "Create multi-arch manifest - CUDA 12.9"
+ - label: "Create multi-arch manifest - CUDA 13.0"
depends_on:
- build-release-image-x86
- build-release-image-arm64
@@ -249,29 +391,22 @@ steps:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "Manifest: CUDA 13.0" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"'
- - label: "Annotate release workflow - CUDA 12.9"
- depends_on:
- - create-multi-arch-manifest
- id: annotate-release-workflow
- agents:
- queue: small_cpu_queue_release
- commands:
- - "bash .buildkite/scripts/annotate-release.sh"
-
- - label: "Create multi-arch manifest - CUDA 13.0"
+ - label: "Create multi-arch manifest - CUDA 12.9"
depends_on:
- - build-release-image-x86-cuda-13-0
- - build-release-image-arm64-cuda-13-0
- id: create-multi-arch-manifest-cuda-13-0
+ - build-release-image-x86-cuda-12-9
+ - build-release-image-arm64-cuda-12-9
+ id: create-multi-arch-manifest-cuda-12-9
agents:
queue: small_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
- - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
+ - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu129 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu129 --amend"
+ - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "Manifest: CUDA 12.9" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129"'
- - label: "Create multi-arch manifest - CUDA 12.9 - Ubuntu 24.04"
+ - label: "Create multi-arch manifest - CUDA 13.0 - Ubuntu 24.04"
depends_on:
- build-release-image-x86-ubuntu2404
- build-release-image-arm64-ubuntu2404
@@ -282,18 +417,20 @@ steps:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-ubuntu2404 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "Manifest: CUDA 13.0 Ubuntu 24.04" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"'
- - label: "Create multi-arch manifest - CUDA 13.0 - Ubuntu 24.04"
+ - label: "Create multi-arch manifest - CUDA 12.9 - Ubuntu 24.04"
depends_on:
- - build-release-image-x86-cuda-13-0-ubuntu2404
- - build-release-image-arm64-cuda-13-0-ubuntu2404
- id: create-multi-arch-manifest-cuda-13-0-ubuntu2404
+ - build-release-image-x86-cuda-12-9-ubuntu2404
+ - build-release-image-arm64-cuda-12-9-ubuntu2404
+ id: create-multi-arch-manifest-cuda-12-9-ubuntu2404
agents:
queue: small_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130-ubuntu2404 --amend"
- - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404"
+ - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu129-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu129-ubuntu2404 --amend"
+ - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129-ubuntu2404"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "Manifest: CUDA 12.9 Ubuntu 24.04" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129-ubuntu2404"'
- label: "Publish nightly multi-arch image to DockerHub"
depends_on:
@@ -313,16 +450,16 @@ steps:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
- - label: "Publish nightly multi-arch image to DockerHub - CUDA 13.0"
+ - label: "Publish nightly multi-arch image to DockerHub - CUDA 12.9"
depends_on:
- - create-multi-arch-manifest-cuda-13-0
+ - create-multi-arch-manifest-cuda-12-9
if: build.env("NIGHTLY") == "1"
agents:
queue: small_cpu_queue_release
commands:
- - "bash .buildkite/scripts/push-nightly-builds.sh cu130"
+ - "bash .buildkite/scripts/push-nightly-builds.sh cu129"
# Clean up old nightly builds (keep only last 14)
- - "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-"
+ - "bash .buildkite/scripts/cleanup-nightly-builds.sh cu129-nightly-"
plugins:
- docker-login#v3.0.0:
username: vllmbot
@@ -331,24 +468,6 @@ steps:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
- - group: "Publish wheels"
- key: "publish-wheels"
- steps:
- - block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
- key: block-upload-release-wheels
- depends_on:
- - input-release-version
- - build-wheels
-
- - label: "Upload release wheels to PyPI"
- depends_on:
- - block-upload-release-wheels
- id: upload-release-wheels
- agents:
- queue: small_cpu_queue_release
- commands:
- - "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
-
# =============================================================================
# ROCm Release Pipeline (x86_64 only)
# =============================================================================
@@ -462,7 +581,7 @@ steps:
echo ""
echo " Build complete - Image and wheels cached"
fi
-
+
artifact_paths:
- "artifacts/rocm-base-wheels/*.whl"
env:
@@ -618,7 +737,7 @@ steps:
- "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
env:
S3_BUCKET: "vllm-wheels"
- VARIANT: "rocm721"
+ VARIANT: "rocm722"
# ROCm Job 6: Build ROCm Release Docker Image
- label: ":docker: Build release image - x86_64 - ROCm"
@@ -678,7 +797,7 @@ steps:
# Push to ECR
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm
-
+
echo ""
echo " Successfully built and pushed ROCm release image"
echo " Image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm"
@@ -705,3 +824,60 @@ steps:
env:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
+
+ # =============================================================================
+ # Publish to DockerHub and PyPI (at the end so all builds complete first)
+ # =============================================================================
+
+ - block: "Publish release images to DockerHub"
+ key: block-publish-release-images
+ depends_on:
+ - create-multi-arch-manifest
+ - create-multi-arch-manifest-cuda-12-9
+ - create-multi-arch-manifest-ubuntu2404
+ - create-multi-arch-manifest-cuda-12-9-ubuntu2404
+ - build-rocm-release-image
+ - input-release-version
+ # Wait for CPU builds if their block steps were unblocked, so publish
+ # doesn't race the in-progress CPU build. allow_failure lets publish
+ # proceed when the operator legitimately leaves the CPU block steps
+ # unblocked or the CPU build fails.
+ - step: build-cpu-release-image-x86
+ allow_failure: true
+ - step: build-cpu-release-image-arm64
+ allow_failure: true
+ if: build.env("NIGHTLY") != "1"
+
+ - label: "Publish release images to DockerHub"
+ depends_on:
+ - block-publish-release-images
+ key: publish-release-images-dockerhub
+ agents:
+ queue: small_cpu_queue_release
+ commands:
+ - "bash .buildkite/scripts/publish-release-images.sh"
+ plugins:
+ - docker-login#v3.0.0:
+ username: vllmbot
+ password-env: DOCKERHUB_TOKEN
+ env:
+ DOCKER_BUILDKIT: "1"
+ DOCKERHUB_USERNAME: "vllmbot"
+
+ - group: "Publish wheels"
+ key: "publish-wheels"
+ steps:
+ - block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
+ key: block-upload-release-wheels
+ depends_on:
+ - input-release-version
+ - build-wheels
+
+ - label: "Upload release wheels to PyPI"
+ depends_on:
+ - block-upload-release-wheels
+ id: upload-release-wheels
+ agents:
+ queue: small_cpu_queue_release
+ commands:
+ - "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
diff --git a/.buildkite/scripts/annotate-build-artifact.sh b/.buildkite/scripts/annotate-build-artifact.sh
new file mode 100755
index 000000000000..67cdf7923658
--- /dev/null
+++ b/.buildkite/scripts/annotate-build-artifact.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Append a build artifact line to the Buildkite annotation.
+# Usage: annotate-build-artifact.sh
-🔥 We have built a vllm website to help you get started with vllm. Please visit [vllm.ai](https://vllm.ai) to learn more.
+🔥 We have built a vLLM website to help you get started with vLLM. Please visit [vllm.ai](https://vllm.ai) to learn more.
For events, please visit [vllm.ai/events](https://vllm.ai/events) to join us.
---
@@ -50,7 +50,7 @@ vLLM is flexible and easy to use with:
- Efficient multi-LoRA support for dense and MoE layers
- Support for NVIDIA GPUs, AMD GPUs, and x86/ARM/PowerPC CPUs. Additionally, diverse hardware plugins such as Google TPUs, Intel Gaudi, IBM Spyre, Huawei Ascend, Rebellions NPU, Apple Silicon, MetaX GPU, and more.
-vLLM seamlessly supports 200+ model architectures on HuggingFace, including:
+vLLM seamlessly supports 200+ model architectures on Hugging Face, including:
- Decoder-only LLMs (e.g., Llama, Qwen, Gemma)
- Mixture-of-Expert LLMs (e.g., Mixtral, DeepSeek-V3, Qwen-MoE, GPT-OSS)
diff --git a/tests/entrypoints/openai/realtime/__init__.py b/benchmarks/__init__.py
similarity index 100%
rename from tests/entrypoints/openai/realtime/__init__.py
rename to benchmarks/__init__.py
diff --git a/benchmarks/attention_benchmarks/configs/mla_decode.yaml b/benchmarks/attention_benchmarks/configs/mla_decode.yaml
index d758654dbe80..8f12ac723064 100644
--- a/benchmarks/attention_benchmarks/configs/mla_decode.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_decode.yaml
@@ -53,6 +53,7 @@ backends:
- FLASHINFER_MLA
- FLASH_ATTN_MLA # Hopper only
- FLASHMLA # Hopper only
+ - TOKENSPEED_MLA # Blackwell + R1 dims + FP8 KV (use --kv-cache-dtype fp8)
device: "cuda:0"
repeats: 100
diff --git a/benchmarks/attention_benchmarks/configs/mla_prefill.yaml b/benchmarks/attention_benchmarks/configs/mla_prefill.yaml
index 122dbd783c5b..1e1ab264bace 100644
--- a/benchmarks/attention_benchmarks/configs/mla_prefill.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_prefill.yaml
@@ -3,6 +3,7 @@
# Compares all available MLA prefill backends:
# FA backends: fa2, fa3, fa4 (FlashAttention versions)
# Non-FA: flashinfer, cudnn, trtllm (Blackwell-only, require flashinfer)
+# CuTe DSL: tokenspeed (Blackwell + R1 dims, requires tokenspeed_mla)
#
# Uses cutlass_mla as the decode backend for impl construction
# (only the prefill path is exercised).
@@ -120,6 +121,7 @@ prefill_backends:
- flashinfer
- cudnn
- trtllm
+ - tokenspeed
device: "cuda:0"
repeats: 20
diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py
index f8bc7b4a10ed..abab1e2edbac 100644
--- a/benchmarks/attention_benchmarks/mla_runner.py
+++ b/benchmarks/attention_benchmarks/mla_runner.py
@@ -29,6 +29,7 @@
VllmConfig,
set_current_vllm_config,
)
+from vllm.v1.attention.backends.mla.prefill.registry import MLAPrefillBackendEnum
# ============================================================================
# VllmConfig Creation
@@ -79,8 +80,8 @@ def create_minimal_vllm_config(
index_topk: Optional topk value for sparse MLA backends. If provided,
the config will include index_topk for sparse attention.
prefill_backend: Prefill backend name (e.g., "fa3", "fa4", "flashinfer",
- "cudnn", "trtllm"). Configures the attention config to
- force the specified prefill backend.
+ "trtllm"). Configures the attention config to force
+ the specified prefill backend.
Returns:
VllmConfig for benchmarking
@@ -179,19 +180,13 @@ def create_minimal_vllm_config(
if prefill_backend is not None:
prefill_cfg = get_prefill_backend_config(prefill_backend)
+ vllm_config.attention_config.mla_prefill_backend = prefill_cfg[
+ "mla_prefill_backend"
+ ]
if prefill_cfg["flash_attn_version"] is not None:
vllm_config.attention_config.flash_attn_version = prefill_cfg[
"flash_attn_version"
]
- vllm_config.attention_config.disable_flashinfer_prefill = prefill_cfg[
- "disable_flashinfer_prefill"
- ]
- vllm_config.attention_config.use_cudnn_prefill = prefill_cfg[
- "use_cudnn_prefill"
- ]
- vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill = prefill_cfg[
- "use_trtllm_ragged_deepseek_prefill"
- ]
return vllm_config
@@ -206,39 +201,27 @@ def create_minimal_vllm_config(
_PREFILL_BACKEND_CONFIG: dict[str, dict] = {
"fa2": {
"flash_attn_version": 2,
- "disable_flashinfer_prefill": True,
- "use_cudnn_prefill": False,
- "use_trtllm_ragged_deepseek_prefill": False,
+ "mla_prefill_backend": MLAPrefillBackendEnum.FLASH_ATTN,
},
"fa3": {
"flash_attn_version": 3,
- "disable_flashinfer_prefill": True,
- "use_cudnn_prefill": False,
- "use_trtllm_ragged_deepseek_prefill": False,
+ "mla_prefill_backend": MLAPrefillBackendEnum.FLASH_ATTN,
},
"fa4": {
"flash_attn_version": 4,
- "disable_flashinfer_prefill": True,
- "use_cudnn_prefill": False,
- "use_trtllm_ragged_deepseek_prefill": False,
+ "mla_prefill_backend": MLAPrefillBackendEnum.FLASH_ATTN,
},
"flashinfer": {
"flash_attn_version": None,
- "disable_flashinfer_prefill": False,
- "use_cudnn_prefill": False,
- "use_trtllm_ragged_deepseek_prefill": False,
+ "mla_prefill_backend": MLAPrefillBackendEnum.FLASHINFER,
},
- "cudnn": {
+ "trtllm": {
"flash_attn_version": None,
- "disable_flashinfer_prefill": True,
- "use_cudnn_prefill": True,
- "use_trtllm_ragged_deepseek_prefill": False,
+ "mla_prefill_backend": MLAPrefillBackendEnum.TRTLLM_RAGGED,
},
- "trtllm": {
+ "tokenspeed": {
"flash_attn_version": None,
- "disable_flashinfer_prefill": True,
- "use_cudnn_prefill": False,
- "use_trtllm_ragged_deepseek_prefill": True,
+ "mla_prefill_backend": MLAPrefillBackendEnum.TOKENSPEED_MLA,
},
}
@@ -404,6 +387,7 @@ def _build_attention_metadata(
query_start_loc=q_start_gpu,
query_start_loc_cpu=q_start_cpu,
seq_lens=seq_lens_gpu,
+ seq_lens_cpu_upper_bound=seq_lens_cpu,
_seq_lens_cpu=seq_lens_cpu,
_num_computed_tokens_cpu=num_computed_tokens_cpu,
slot_mapping=slot_mapping,
@@ -624,6 +608,21 @@ def _create_backend_impl(
# Create mock layer
layer = MockLayer(device, impl=impl, kv_cache_spec=kv_cache_spec)
+ # Attach a prefill backend (MLAAttention does this in __init__; the metadata
+ # builder reads layer.prefill_backend from static_forward_context).
+ from vllm.v1.attention.backends.mla.prefill import get_mla_prefill_backend
+
+ prefill_backend_cls = get_mla_prefill_backend(vllm_config)
+ layer.prefill_backend = prefill_backend_cls(
+ num_heads=mla_dims["num_q_heads"],
+ scale=(mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]) ** -0.5,
+ kv_lora_rank=mla_dims["kv_lora_rank"],
+ qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
+ qk_rope_head_dim=mla_dims["qk_rope_head_dim"],
+ v_head_dim=mla_dims["v_head_dim"],
+ vllm_config=vllm_config,
+ )
+
# Create builder instance if needed
builder_instance = None
if builder_class:
@@ -960,19 +959,6 @@ def _run_mla_benchmark_batched(
results = []
with set_current_vllm_config(vllm_config):
- # Clear cached prefill backend detection functions so they re-evaluate
- # with the current VllmConfig. These are @functools.cache decorated and
- # would otherwise return stale results from a previous backend's config.
- from vllm.model_executor.layers.attention.mla_attention import (
- use_cudnn_prefill,
- use_flashinfer_prefill,
- use_trtllm_ragged_deepseek_prefill,
- )
-
- use_flashinfer_prefill.cache_clear()
- use_cudnn_prefill.cache_clear()
- use_trtllm_ragged_deepseek_prefill.cache_clear()
-
# Create backend impl, layer, builder, and indexer (reused across benchmarks)
impl, layer, builder_instance, indexer = _create_backend_impl(
backend_cfg,
@@ -984,38 +970,36 @@ def _run_mla_benchmark_batched(
kv_cache_dtype=kv_cache_dtype,
)
- # Verify the actual prefill backend matches what was requested
+ # Verify the actual prefill backend matches what was requested. The
+ # selector + impl construction already raise on misuse; here we just
+ # check the resolved class against the requested name as a sanity guard.
if prefill_backend is not None:
- prefill_cfg = get_prefill_backend_config(prefill_backend)
- fa_version = prefill_cfg["flash_attn_version"]
-
- if fa_version is not None:
- # FA backend: verify the impl's FA version
- actual_fa_version = getattr(impl, "vllm_flash_attn_version", None)
+ expected_class = {
+ "fa2": "FlashAttnPrefillBackend",
+ "fa3": "FlashAttnPrefillBackend",
+ "fa4": "FlashAttnPrefillBackend",
+ "flashinfer": "FlashInferPrefillBackend",
+ "trtllm": "TrtllmRaggedPrefillBackend",
+ "tokenspeed": "TokenspeedMLAPrefillBackend",
+ }.get(prefill_backend)
+ actual_class = type(getattr(layer, "prefill_backend", None)).__name__
+ if expected_class and actual_class != expected_class:
+ raise RuntimeError(
+ f"Prefill backend '{prefill_backend}' requested "
+ f"{expected_class}, got {actual_class}. Check "
+ f"attention_config plumbing or installed deps."
+ )
+ if prefill_backend in {"fa2", "fa3", "fa4"}:
+ fa_version = int(prefill_backend[2:])
+ actual_fa_version = getattr(
+ layer.prefill_backend, "vllm_flash_attn_version", None
+ )
if actual_fa_version != fa_version:
raise RuntimeError(
f"Prefill backend '{prefill_backend}' requested FA "
- f"version {fa_version}, but the impl is using FA "
- f"version {actual_fa_version}. Check "
- f"vllm/v1/attention/backends/fa_utils.py."
+ f"version {fa_version}, got "
+ f"{actual_fa_version} on {actual_class}."
)
- else:
- # Non-FA backend: verify the builder picked the right path
- expected_flags = {
- "flashinfer": "_use_fi_prefill",
- "cudnn": "_use_cudnn_prefill",
- "trtllm": "_use_trtllm_ragged_prefill",
- }
- flag_name = expected_flags.get(prefill_backend)
- if flag_name and not getattr(builder_instance, flag_name, False):
- raise RuntimeError(
- f"Prefill backend '{prefill_backend}' was requested "
- f"but the metadata builder did not enable it. This "
- f"usually means a dependency is missing (e.g., "
- f"flashinfer not installed) or the platform doesn't "
- f"support it."
- )
-
# Run each benchmark with the shared impl
for config, threshold, num_splits in configs_with_params:
# Set threshold for this benchmark (FlashAttn/FlashMLA only)
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index 33aca831883a..664fa58dd49f 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -115,6 +115,39 @@ class SampleRequest:
def sample_requests(
tokenizer: PreTrainedTokenizerBase, args: argparse.Namespace
) -> list[SampleRequest]:
+ def _apply_random_prefix(
+ tokenizer: PreTrainedTokenizerBase,
+ requests: list[SampleRequest],
+ prefix_len: int,
+ seed: int,
+ ) -> list[SampleRequest]:
+ if prefix_len <= 0:
+ return requests
+ rng = np.random.default_rng(seed)
+ vocab_size = tokenizer.vocab_size
+ prohibited = getattr(tokenizer, "all_special_ids", None) or []
+ allowed = np.array([i for i in range(vocab_size) if i not in prohibited])
+ if len(allowed) == 0:
+ return requests
+ prefix_ids = rng.integers(0, len(allowed), size=prefix_len)
+ prefix_token_ids = allowed[prefix_ids].tolist()
+ out = []
+ for req in requests:
+ prompt_ids = tokenizer(req.prompt, add_special_tokens=False).input_ids
+ full_ids = prefix_token_ids + prompt_ids
+ full_prompt = tokenizer.decode(full_ids, skip_special_tokens=False)
+ out.append(
+ SampleRequest(
+ prompt=full_prompt,
+ prompt_len=len(tokenizer(full_prompt).input_ids),
+ expected_output_len=req.expected_output_len,
+ schema=req.schema,
+ structure_type=req.structure_type,
+ completion=req.completion,
+ )
+ )
+ return out
+
if args.dataset == "json" or args.dataset == "json-unique":
if args.json_schema_path is None:
dir_path = os.path.dirname(os.path.realpath(__file__))
@@ -261,6 +294,9 @@ def _filter_func(item):
)
)
+ requests = _apply_random_prefix(
+ tokenizer, requests, args.random_prefix_len, args.seed
+ )
return requests
@@ -945,6 +981,15 @@ def create_argument_parser():
"results in a more uniform arrival of requests.",
)
parser.add_argument("--seed", type=int, default=0)
+ parser.add_argument(
+ "--random-prefix-len",
+ type=int,
+ default=0,
+ help=(
+ "Number of prefix tokens to prepend to every prompt. "
+ "The same prefix is used for all prompts to enable prefix caching."
+ ),
+ )
parser.add_argument(
"--trust-remote-code",
action="store_true",
diff --git a/tests/entrypoints/openai/speech_to_text/__init__.py b/benchmarks/kernels/__init__.py
similarity index 100%
rename from tests/entrypoints/openai/speech_to_text/__init__.py
rename to benchmarks/kernels/__init__.py
diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
index 3f80b024e108..03d7fb386f74 100644
--- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
@@ -16,7 +16,7 @@
maybe_make_prepare_finalize,
)
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
-from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
+from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import CutlassExpertsFp8
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
diff --git a/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
index 2d4afd38c097..7379bf858889 100644
--- a/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
@@ -22,7 +22,7 @@
fp8_w8a8_moe_quant_config,
nvfp4_moe_quant_config,
)
-from vllm.model_executor.layers.fused_moe.cutlass_moe import (
+from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import (
CutlassExpertsFp4,
)
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
index dd4060bbdb94..04fc2960d1e4 100644
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -13,7 +13,7 @@
maybe_make_prepare_finalize,
)
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
-from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
+from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import CutlassExpertsFp8
from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_experts,
fused_topk,
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 65bc38c6c755..4463a23772ee 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -27,10 +27,10 @@
RoutingMethodType,
_get_config_dtype_str,
)
-from vllm.model_executor.layers.fused_moe.fused_moe import *
-from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
+from vllm.model_executor.layers.fused_moe.experts.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts,
)
+from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.transformers_utils.config import get_config
from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser
diff --git a/benchmarks/kernels/benchmark_norm_router_gemm.py b/benchmarks/kernels/benchmark_norm_router_gemm.py
new file mode 100644
index 000000000000..cd50e9159961
--- /dev/null
+++ b/benchmarks/kernels/benchmark_norm_router_gemm.py
@@ -0,0 +1,183 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Benchmark and correctness check for ``ops.dsv4_norm_router_gemm``.
+
+Two implementations are compared:
+
+ 1. ``unfused`` — ``vllm_ops.rms_norm`` then ``ops.dsv3_router_gemm``,
+ i.e. the current vLLM hot path (two kernel launches).
+ 2. ``fused`` — ``ops.dsv4_norm_router_gemm``, the new single-kernel
+ fused path.
+
+Both produce ``(normed_x: bf16, router_logits: fp32)``. The correctness
+check verifies that ``fused`` and ``unfused`` agree to within ~1 bf16
+ULP — that is the precision floor for this op.
+"""
+
+import argparse
+
+import torch
+
+from vllm import _custom_ops as vllm_ops
+from vllm.triton_utils import triton
+
+# The fused dsv4_norm_router_gemm kernel is templated only for DSV4-Pro
+# (hidden_size=7168, num_experts=384). Other shapes fall back to the
+# unfused path on the Python side (NormGatedLinear), so benchmark only
+# the configuration that the fused kernel actually targets.
+HIDDEN_SIZE = 7168
+NUM_EXPERTS_CHOICES = (384,)
+RMS_EPS = 1e-6
+
+
+def unfused_norm_router_gemm(
+ x: torch.Tensor,
+ norm_weight: torch.Tensor,
+ gate_weight: torch.Tensor,
+ eps: float,
+) -> tuple[torch.Tensor, torch.Tensor]:
+ # Call ``_C::rms_norm`` directly (mirroring ``_dsv4_pro_norm_gate``'s
+ # fallback path) so the benchmarked baseline doesn't inherit any
+ # Python wrapper overhead or risk falling through to the native
+ # eager-primitive ``RMSNorm.forward_native`` path.
+ normed = torch.empty_like(x)
+ torch.ops._C.rms_norm(normed, x, norm_weight, eps)
+ logits = vllm_ops.dsv3_router_gemm(normed, gate_weight, torch.float32)
+ return normed, logits
+
+
+def fused_norm_router_gemm(
+ x: torch.Tensor,
+ norm_weight: torch.Tensor,
+ gate_weight: torch.Tensor,
+ eps: float,
+) -> tuple[torch.Tensor, torch.Tensor]:
+ return vllm_ops.dsv4_norm_router_gemm(x, norm_weight, gate_weight, eps)
+
+
+def _make_inputs(num_tokens: int, num_experts: int, hidden_size: int, seed: int = 0):
+ torch.manual_seed(seed)
+ device = "cuda"
+ x = torch.randn(num_tokens, hidden_size, dtype=torch.bfloat16, device=device)
+ norm_w = torch.randn(hidden_size, dtype=torch.bfloat16, device=device)
+ gate_w = torch.randn(num_experts, hidden_size, dtype=torch.bfloat16, device=device)
+ # Down-scale gate_w so the GEMV output stays in a representable range.
+ gate_w = gate_w / float(hidden_size) ** 0.5
+ norm_w = (norm_w * 0.1) + 1.0
+ return x, norm_w, gate_w
+
+
+def calculate_diff(
+ num_tokens: int,
+ num_experts: int,
+ hidden_size: int = HIDDEN_SIZE,
+ normed_atol: float = 2e-3,
+ logits_atol: float = 1e-2,
+ rtol: float = 1e-2,
+) -> None:
+ x, norm_w, gate_w = _make_inputs(num_tokens, num_experts, hidden_size)
+
+ normed_unfused, logits_unfused = unfused_norm_router_gemm(
+ x.clone(), norm_w, gate_w, RMS_EPS
+ )
+ normed_fused, logits_fused = fused_norm_router_gemm(
+ x.clone(), norm_w, gate_w, RMS_EPS
+ )
+
+ def _max_abs(a, b):
+ return (a.float() - b.float()).abs().max().item()
+
+ print(f"\n=== M={num_tokens} E={num_experts} H={hidden_size} ===")
+ print(f"normed_x |fused - unfused| = {_max_abs(normed_fused, normed_unfused):.3e}")
+ print(f"logits |fused - unfused| = {_max_abs(logits_fused, logits_unfused):.3e}")
+
+ ok_normed = torch.allclose(
+ normed_fused.float(),
+ normed_unfused.float(),
+ atol=normed_atol,
+ rtol=rtol,
+ )
+ ok_logits = torch.allclose(
+ logits_fused.float(),
+ logits_unfused.float(),
+ atol=logits_atol,
+ rtol=rtol,
+ )
+ if ok_normed and ok_logits:
+ print(
+ f"OK fused vs unfused within "
+ f"normed_atol={normed_atol:.0e} logits_atol={logits_atol:.0e} "
+ f"rtol={rtol:.0e}"
+ )
+ else:
+ print(
+ f"FAIL normed_ok={ok_normed} logits_ok={ok_logits}; "
+ f"see max-abs values above"
+ )
+
+
+def get_benchmark():
+ # Only num_tokens varies (DSV4-Pro hard-codes E=384); single-axis
+ # sweep yields a clean line plot with M on the x-axis.
+ num_experts = NUM_EXPERTS_CHOICES[0]
+
+ @triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["num_tokens"],
+ x_vals=list(range(1, 17)),
+ line_arg="provider",
+ line_vals=["unfused", "fused"],
+ line_names=["unfused (rms+dsv3)", "fused (dsv4)"],
+ styles=[("green", "-"), ("red", "-")],
+ ylabel="us",
+ plot_name=f"norm-router-gemm-E{num_experts}-H{HIDDEN_SIZE}",
+ args={},
+ )
+ )
+ def benchmark(num_tokens, provider):
+ x, norm_w, gate_w = _make_inputs(num_tokens, num_experts, HIDDEN_SIZE)
+
+ quantiles = [0.5, 0.2, 0.8]
+ if provider == "unfused":
+ fn = lambda: unfused_norm_router_gemm( # noqa: E731
+ x, norm_w, gate_w, RMS_EPS
+ )
+ else:
+ fn = lambda: fused_norm_router_gemm( # noqa: E731
+ x, norm_w, gate_w, RMS_EPS
+ )
+
+ ms, min_ms, max_ms = triton.testing.do_bench(fn, quantiles=quantiles)
+ return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+ return benchmark
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--save-path",
+ type=str,
+ default="./configs/norm_router_gemm/",
+ )
+ parser.add_argument(
+ "--skip-bench",
+ action="store_true",
+ help="Run only the correctness check, not the perf sweep.",
+ )
+ args = parser.parse_args()
+
+ # Correctness sweep over the full fast-path range M=1..16.
+ for m in range(1, 17):
+ for e in NUM_EXPERTS_CHOICES:
+ calculate_diff(num_tokens=m, num_experts=e, hidden_size=HIDDEN_SIZE)
+
+ if args.skip_bench:
+ return
+
+ benchmark = get_benchmark()
+ benchmark.run(print_data=True, save_path=args.save_path)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/kernels/benchmark_vit_fp8_attn.py b/benchmarks/kernels/benchmark_vit_fp8_attn.py
new file mode 100644
index 000000000000..7d7a067dde9d
--- /dev/null
+++ b/benchmarks/kernels/benchmark_vit_fp8_attn.py
@@ -0,0 +1,324 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Benchmarks FP8 vs BF16 ViT attention via FlashInfer cuDNN backend.
+#
+# == Usage Examples ==
+#
+# Benchmark mode (default, FlashInfer CUDAGraph Bench)
+# python3 benchmark_vit_fp8_attn.py
+#
+# Profile mode (PyTorch profiler, saves TensorBoard traces):
+# python3 benchmark_vit_fp8_attn.py --profile
+# python3 benchmark_vit_fp8_attn.py --profile --profile-output-dir ./profile_traces
+#
+# Custom seq_lens:
+# python3 benchmark_vit_fp8_attn.py --seq-lens 4096 8192 16384
+
+from functools import partial
+
+import numpy as np
+import torch
+from torch.profiler import ProfilerActivity, profile, record_function
+
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+
+# Qwen3-VL defaults
+NUM_HEADS = 16
+HEAD_DIM = 72
+DEFAULT_SEQ_LENS = [2304, 4096, 8192, 16384]
+
+
+def _setup_fp8_attention(num_heads: int, head_dim: int) -> tuple:
+ """Create FP8 and BF16 attention modules + workspace."""
+ from types import SimpleNamespace
+ from unittest.mock import patch
+
+ from vllm.config import VllmConfig, set_current_vllm_config
+ from vllm.config.multimodal import MultiModalConfig
+ from vllm.model_executor.layers.attention.mm_encoder_attention import (
+ MMEncoderAttention,
+ _get_flashinfer_workspace_buffer,
+ )
+ from vllm.v1.attention.backends.registry import AttentionBackendEnum
+
+ old_dtype = torch.get_default_dtype()
+ torch.set_default_dtype(torch.bfloat16)
+
+ backend_patch = patch(
+ "vllm.model_executor.layers.attention.mm_encoder_attention"
+ ".get_vit_attn_backend",
+ return_value=AttentionBackendEnum.FLASHINFER,
+ )
+
+ # FP8 attention
+ mm_config_fp8 = MultiModalConfig(mm_encoder_attn_dtype="fp8")
+ vllm_config_fp8 = VllmConfig()
+ vllm_config_fp8.model_config = SimpleNamespace(multimodal_config=mm_config_fp8)
+ with set_current_vllm_config(vllm_config_fp8), backend_patch:
+ attn_fp8 = MMEncoderAttention(
+ num_heads=num_heads,
+ head_size=head_dim,
+ prefix="visual.blocks.0.attn",
+ ).to("cuda")
+
+ # BF16 attention (no FP8)
+ with set_current_vllm_config(VllmConfig()), backend_patch:
+ attn_bf16 = MMEncoderAttention(
+ num_heads=num_heads,
+ head_size=head_dim,
+ prefix="visual.blocks.0.attn",
+ ).to("cuda")
+
+ torch.set_default_dtype(old_dtype)
+
+ workspace = _get_flashinfer_workspace_buffer()
+ return attn_fp8, attn_bf16, workspace
+
+
+def _build_meta(
+ seq_len: int,
+ num_heads: int,
+ head_dim: int,
+ fp8: bool,
+):
+ """Build cu_seqlens, max_seqlen, sequence_lengths."""
+ from vllm.model_executor.layers.attention.mm_encoder_attention import (
+ MMEncoderAttention,
+ )
+ from vllm.utils.math_utils import round_up
+ from vllm.v1.attention.backends.registry import AttentionBackendEnum
+
+ cu_np = np.array([0, seq_len], dtype=np.int32)
+ fp8_padded = num_heads * round_up(head_dim, 16) if fp8 else None
+
+ seq_lengths = MMEncoderAttention.maybe_compute_seq_lens(
+ AttentionBackendEnum.FLASHINFER, cu_np, torch.device("cuda")
+ )
+ max_seqlen = torch.tensor(
+ MMEncoderAttention.compute_max_seqlen(AttentionBackendEnum.FLASHINFER, cu_np),
+ dtype=torch.int32,
+ )
+ cu_seqlens = MMEncoderAttention.maybe_recompute_cu_seqlens(
+ AttentionBackendEnum.FLASHINFER,
+ cu_np,
+ num_heads * head_dim,
+ 1,
+ torch.device("cuda"),
+ fp8_padded_hidden_size=fp8_padded,
+ )
+ return cu_seqlens, max_seqlen, seq_lengths
+
+
+def run_benchmark(
+ seq_lens: list[int],
+ num_heads: int,
+ head_dim: int,
+ method: str,
+):
+ """Benchmark FP8 vs BF16 attention across seq_lens.
+
+ Uses FlashInfer GPU-level timing to measure pure kernel time,
+ excluding CPU launch overhead.
+ """
+ if method == "cupti":
+ from flashinfer.testing import bench_gpu_time_with_cupti as bench_fn
+
+ bench_fn = partial(bench_fn, use_cuda_graph=True, cold_l2_cache=False)
+ elif method == "cudagraph":
+ from flashinfer.testing import (
+ bench_gpu_time_with_cudagraph as bench_fn,
+ )
+
+ bench_fn = partial(bench_fn, cold_l2_cache=False)
+ else:
+ raise ValueError(f"Invalid method: {method}")
+
+ attn_fp8, attn_bf16, workspace = _setup_fp8_attention(num_heads, head_dim)
+
+ print(f"Timing method: {method}")
+ print(f"{'seq_len':>8} {'BF16 (us)':>12} {'FP8 (us)':>12} {'Speedup':>10}")
+ print("-" * 46)
+
+ for seq_len in seq_lens:
+ torch.manual_seed(42)
+
+ q = torch.randn(
+ seq_len,
+ num_heads,
+ head_dim,
+ device="cuda",
+ dtype=torch.bfloat16,
+ )
+ k = torch.randn_like(q)
+ v = torch.randn_like(q)
+
+ cu_fp8, max_s, seq_l = _build_meta(seq_len, num_heads, head_dim, fp8=True)
+ # we can reuse cu_fp8 for cu_bf16 since q, k, and v are contiguous
+ cu_bf16 = cu_fp8.clone()
+
+ def bf16_fn(q=q, k=k, v=v, cu=cu_bf16, ms=max_s, sl=seq_l):
+ attn_bf16._forward_flashinfer(q, k, v, cu, ms, sl)
+
+ def fp8_fn(q=q, k=k, v=v, cu=cu_fp8, ms=max_s, sl=seq_l):
+ attn_fp8._forward_flashinfer(q, k, v, cu, ms, sl)
+
+ # bench_fn returns List[float] of per-iteration times in ms
+ bf16_times = bench_fn(bf16_fn)
+ fp8_times = bench_fn(fp8_fn)
+
+ bf16_us = np.median(bf16_times) * 1e3 # ms -> us
+ fp8_us = np.median(fp8_times) * 1e3
+ speedup = bf16_us / fp8_us if fp8_us > 0 else float("inf")
+
+ print(f"{seq_len:>8} {bf16_us:>12.1f} {fp8_us:>12.1f} {speedup:>9.2f}x")
+
+
+def _make_trace_handler(output_dir: str, worker_name: str, label: str):
+ """Create a trace handler that saves to TensorBoard and prints summary."""
+
+ def handler(prof):
+ torch.profiler.tensorboard_trace_handler(output_dir, worker_name)(prof)
+ print(f"\n{'=' * 80}")
+ print(label)
+ print(f"{'=' * 80}")
+ print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))
+
+ return handler
+
+
+def run_profile(
+ seq_len: int,
+ num_heads: int,
+ head_dim: int,
+ warmup: int,
+ output_dir: str,
+):
+ """Profile FP8 vs BF16 attention with PyTorch profiler."""
+ attn_fp8, attn_bf16, workspace = _setup_fp8_attention(num_heads, head_dim)
+
+ torch.manual_seed(42)
+ q = torch.randn(
+ seq_len,
+ num_heads,
+ head_dim,
+ device="cuda",
+ dtype=torch.bfloat16,
+ )
+ k = torch.randn_like(q)
+ v = torch.randn_like(q)
+
+ cu_fp8, max_s, seq_l = _build_meta(seq_len, num_heads, head_dim, fp8=True)
+ # we can reuse cu_fp8 for cu_bf16 since q, k, and v are contiguous
+ cu_bf16 = cu_fp8.clone()
+
+ sched = torch.profiler.schedule(wait=0, warmup=warmup, active=1)
+
+ # Profile BF16 (warmup handled by profiler schedule)
+ with profile(
+ activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+ schedule=sched,
+ on_trace_ready=_make_trace_handler(
+ output_dir,
+ f"bf16_h{head_dim}_s{seq_len}",
+ f"BF16 Attention (seq_len={seq_len}, heads={num_heads}, "
+ f"head_dim={head_dim})",
+ ),
+ ) as prof_bf16:
+ for _ in range(warmup + 1):
+ with record_function("bf16_attention"):
+ attn_bf16._forward_flashinfer(
+ q.clone(), k.clone(), v.clone(), cu_bf16, max_s, seq_l
+ )
+ torch.accelerator.synchronize()
+ prof_bf16.step()
+
+ # Profile FP8 (warmup handled by profiler schedule)
+ with profile(
+ activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+ schedule=sched,
+ on_trace_ready=_make_trace_handler(
+ output_dir,
+ f"fp8_h{head_dim}_s{seq_len}",
+ f"FP8 Attention (seq_len={seq_len}, heads={num_heads}, "
+ f"head_dim={head_dim})",
+ ),
+ ) as prof_fp8:
+ for _ in range(warmup + 1):
+ with record_function("fp8_attention"):
+ attn_fp8._forward_flashinfer(
+ q.clone(), k.clone(), v.clone(), cu_fp8, max_s, seq_l
+ )
+ torch.accelerator.synchronize()
+ prof_fp8.step()
+
+ print(f"\nTensorBoard traces saved to: {output_dir}")
+ print(f"View with: tensorboard --logdir={output_dir}")
+
+
+if __name__ == "__main__":
+ parser = FlexibleArgumentParser(description="Benchmark FP8 vs BF16 ViT attention.")
+ parser.add_argument(
+ "--seq-lens",
+ type=int,
+ nargs="+",
+ default=DEFAULT_SEQ_LENS,
+ help="Sequence lengths to benchmark",
+ )
+ parser.add_argument(
+ "--num-heads",
+ type=int,
+ default=NUM_HEADS,
+ )
+ parser.add_argument(
+ "--head-dim",
+ type=int,
+ default=HEAD_DIM,
+ )
+ parser.add_argument(
+ "--method",
+ choices=["cupti", "cudagraph"],
+ default="cudagraph",
+ help="GPU timing method: cupti (CUPTI kernel timing) or "
+ "cudagraph (CUDA graph capture/replay). Default: cudagraph",
+ )
+ parser.add_argument(
+ "--warmup",
+ type=int,
+ default=10,
+ help="Warmup iterations (profile mode only)",
+ )
+ parser.add_argument(
+ "--profile",
+ action="store_true",
+ help="Run PyTorch profiler instead of benchmark",
+ )
+ parser.add_argument(
+ "--profile-seq-len",
+ type=int,
+ default=8192,
+ help="Sequence length for profiling (default: 8192)",
+ )
+ parser.add_argument(
+ "--profile-output-dir",
+ type=str,
+ default="./profile_traces",
+ help="Output directory for TensorBoard traces (default: ./profile_traces)",
+ )
+ args = parser.parse_args()
+
+ if args.profile:
+ run_profile(
+ args.profile_seq_len,
+ args.num_heads,
+ args.head_dim,
+ args.warmup,
+ args.profile_output_dir,
+ )
+ else:
+ run_benchmark(
+ args.seq_lens,
+ args.num_heads,
+ args.head_dim,
+ args.method,
+ )
diff --git a/benchmarks/kernels/cpu/benchmark_cpu_attn.py b/benchmarks/kernels/cpu/benchmark_cpu_attn.py
index 63d034278c7e..08afd693c333 100644
--- a/benchmarks/kernels/cpu/benchmark_cpu_attn.py
+++ b/benchmarks/kernels/cpu/benchmark_cpu_attn.py
@@ -12,7 +12,6 @@
cpu_attn_get_scheduler_metadata,
cpu_attn_reshape_and_cache,
)
-from vllm.platforms import CpuArchEnum, current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
from vllm.v1.attention.backends.cpu_attn import CPUAttentionBackend, _get_attn_isa
@@ -22,15 +21,14 @@ def get_attn_isa(
block_size: int | None = None,
dtype: torch.dtype | None = None,
):
- if block_size and dtype:
- return _get_attn_isa(dtype, block_size)
- else:
- if current_platform.get_cpu_architecture() == CpuArchEnum.ARM:
- return "neon"
- elif torch.cpu._is_amx_tile_supported():
- return "amx"
- else:
- return "vec"
+ # Delegate to _get_attn_isa so the fallback path applies the same arch
+ # gating (e.g. RISC-V RVV is only chosen when the build's hardcoded
+ # VLEN=128 kernel is actually present; on VLEN=256 / scalar hosts it
+ # correctly falls through to vec/vec16).
+ return _get_attn_isa(
+ dtype if dtype is not None else torch.bfloat16,
+ block_size if block_size else 32,
+ )
# rand number generation takes too much time, cache rand tensors
@@ -235,7 +233,7 @@ def rint(lo: int, hi: int) -> int:
)
parser.add_argument("--use-sink", action="store_true")
parser.add_argument(
- "--isa", type=str, choices=["vec", "neon", "amx", "vec16"], default=None
+ "--isa", type=str, choices=["vec", "neon", "amx", "vec16", "rvv"], default=None
)
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--iters", type=int, default=20)
diff --git a/tests/entrypoints/pooling/pooling/__init__.py b/benchmarks/kernels/ir/__init__.py
similarity index 100%
rename from tests/entrypoints/pooling/pooling/__init__.py
rename to benchmarks/kernels/ir/__init__.py
diff --git a/benchmarks/kernels/ir/bench_ir_ops.py b/benchmarks/kernels/ir/bench_ir_ops.py
new file mode 100644
index 000000000000..b23c4e8ae327
--- /dev/null
+++ b/benchmarks/kernels/ir/bench_ir_ops.py
@@ -0,0 +1,378 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Generic benchmark harness for vLLM IR ops.
+
+Usage:
+ python benchmarks/kernels/ir/bench_ir_ops.py
+ python benchmarks/kernels/ir/bench_ir_ops.py --ops rms_norm
+ python benchmarks/kernels/ir/bench_ir_ops.py --ops rms_norm,silu_mul
+ python benchmarks/kernels/ir/bench_ir_ops.py --no-cuda-graph
+ python benchmarks/kernels/ir/bench_ir_ops.py --ops rms_norm --save-path ./results/
+"""
+
+import argparse
+import contextlib
+import csv
+import dataclasses
+import datetime
+import math
+import os
+import subprocess
+import sys
+import tempfile
+
+# Ensure repo root is on sys.path so `benchmarks` is importable as a package.
+_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../.."))
+if _REPO_ROOT not in sys.path:
+ sys.path.insert(0, _REPO_ROOT)
+
+# Suppress noisy C++ warnings from vllm kernel registration (written to fd 2
+# directly by the dynamic linker, so Python-level sys.stderr redirect won't
+# catch them).
+_saved_fd = os.dup(2)
+try:
+ with open(os.devnull, "w") as _devnull:
+ os.dup2(_devnull.fileno(), 2)
+ import torch
+
+ import vllm.kernels # noqa: E402, F401
+finally:
+ os.dup2(_saved_fd, 2)
+ os.close(_saved_fd)
+
+from tqdm import tqdm # noqa: E402
+
+from benchmarks.kernels.ir.shapes import SHAPE_CONFIGS # noqa: E402 # isort: skip
+from vllm.ir.op import IrOp # noqa: E402
+from vllm.platforms import current_platform # noqa: E402
+from vllm.triton_utils import triton # noqa: E402
+
+
+@dataclasses.dataclass(frozen=True)
+class BenchConfig:
+ use_cuda_graph: bool = True
+ warmup: int = 25
+ rep: int = 100
+
+
+def _pkg_version(name: str) -> str:
+ from importlib.metadata import PackageNotFoundError, version
+
+ with contextlib.suppress(PackageNotFoundError):
+ return version(name)
+ return "not installed"
+
+
+_METADATA_LABELS = {
+ "timestamp": "Timestamp",
+ "git_commit": "Git commit",
+ "vllm": "vLLM",
+ "pytorch": "PyTorch",
+ "cuda_runtime": "CUDA runtime",
+ "triton": "Triton",
+ "cutlass": "CUTLASS",
+ "helion": "Helion",
+ "device": "Device",
+ "bench_mode": "Bench mode",
+ "warmup": "Warmup",
+ "rep": "Repetitions",
+}
+
+
+def collect_env_metadata(cfg: BenchConfig) -> dict[str, str]:
+ from vllm.collect_env import get_env_info
+
+ env = get_env_info()
+
+ git_sha = "unknown"
+ with contextlib.suppress(subprocess.CalledProcessError, FileNotFoundError):
+ git_sha = (
+ subprocess.check_output(
+ ["git", "rev-parse", "--short", "HEAD"], stderr=subprocess.DEVNULL
+ )
+ .decode()
+ .strip()
+ )
+
+ device_name = current_platform.get_device_name()
+
+ warmup_note = " ms" if not cfg.use_cuda_graph else " ms (ignored)"
+ rep_note = " replays" if cfg.use_cuda_graph else " ms"
+
+ return {
+ "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+ "git_commit": git_sha,
+ "vllm": str(env.vllm_version),
+ "pytorch": str(env.torch_version),
+ "cuda_runtime": str(env.cuda_runtime_version),
+ "triton": triton.__version__,
+ "cutlass": _pkg_version("nvidia-cutlass-dsl"),
+ "helion": _pkg_version("helion"),
+ "device": device_name,
+ "bench_mode": "cuda_graph" if cfg.use_cuda_graph else "eager",
+ "warmup": f"{cfg.warmup}{warmup_note}",
+ "rep": f"{cfg.rep}{rep_note}",
+ }
+
+
+def print_metadata(metadata: dict[str, str]):
+ print("=" * 60)
+ for key, val in metadata.items():
+ print(f"{_METADATA_LABELS.get(key, key) + ':':<16}{val}")
+ print("=" * 60)
+
+
+def _clone_args(args: tuple) -> tuple:
+ return tuple(a.clone() if isinstance(a, torch.Tensor) else a for a in args)
+
+
+# TODO(gmagogsfm): When the `maybe_inplace` PR lands, ops marked as
+# inplace=True will mutate bench_args across iterations. Both CUDA graph
+# and eager modes will accumulate drift from repeated in-place mutation.
+# We need to re-clone inputs per iteration for inplace ops.
+def _bench_one(fn, args, cfg: BenchConfig) -> float:
+ bench_args = _clone_args(args)
+ bench_fn = lambda: fn(*bench_args)
+
+ if cfg.use_cuda_graph:
+ ms = triton.testing.do_bench_cudagraph(bench_fn, rep=cfg.rep, quantiles=[0.5])
+ else:
+ ms = triton.testing.do_bench(
+ bench_fn, warmup=cfg.warmup, rep=cfg.rep, quantiles=[0.5]
+ )
+ return ms * 1000
+
+
+# TODO(gmagogsfm): Once compiled native implementation lands (#38775),
+# the benchmark baseline should be the compiled native (what vLLM runs by
+# default) rather than the uncompiled native implementation.
+def collect_timings(
+ op: IrOp, shape_configs: list[dict], cfg: BenchConfig
+) -> tuple[list[str], list[str], dict[str, dict[str, float]]]:
+ def fmt(v) -> str:
+ return str(v).split(".")[-1] if isinstance(v, torch.dtype) else str(v)
+
+ case_names = [
+ "_".join(f"{k}={fmt(v)}" for k, v in kwargs.items()) for kwargs in shape_configs
+ ]
+ providers = [n for n, impl in op.impls.items() if impl.supported]
+
+ results: dict[str, dict[str, float]] = {c: {} for c in case_names}
+ for provider in providers:
+ impl = op.impls[provider]
+ desc = f"{op.name} / {provider}"
+ for case_name, kwargs in tqdm(
+ zip(case_names, shape_configs),
+ desc=desc,
+ total=len(case_names),
+ unit=" cases",
+ ):
+ args = op.generate_inputs(**kwargs)
+ if impl.supports_args(*args):
+ results[case_name][provider] = _bench_one(impl.impl_fn, args, cfg)
+ else:
+ results[case_name][provider] = float("nan")
+
+ return case_names, providers, results
+
+
+def analyze_results(
+ op_name: str,
+ case_names: list[str],
+ providers: list[str],
+ results: dict[str, dict[str, float]],
+) -> tuple[list[dict[str, str]], list[dict[str, str]], list[str]]:
+ native_col = "native"
+ non_native = [p for p in providers if p != native_col]
+
+ header_cols = ["case"]
+ for p in providers:
+ header_cols.append(f"{p} (us)")
+ for p in non_native:
+ header_cols.append(f"{p} speedup")
+
+ detail_rows: list[dict[str, str]] = []
+ speedup_data: dict[str, list[tuple[float, str]]] = {p: [] for p in non_native}
+
+ for case_name in case_names:
+ timings = results[case_name]
+ row: dict[str, str] = {"case": case_name}
+
+ for p in providers:
+ val = timings.get(p, float("nan"))
+ row[f"{p} (us)"] = f"{val:.2f}" if not math.isnan(val) else "n/a"
+
+ native_us = timings.get(native_col, float("nan"))
+ for p in non_native:
+ p_us = timings.get(p, float("nan"))
+ if not math.isnan(native_us) and not math.isnan(p_us) and p_us > 0:
+ speedup = native_us / p_us
+ row[f"{p} speedup"] = f"{speedup:.2f}x"
+ speedup_data[p].append((speedup, case_name))
+ else:
+ row[f"{p} speedup"] = "n/a"
+
+ detail_rows.append(row)
+
+ summary_rows: list[dict[str, str]] = []
+ for p in non_native:
+ entries = speedup_data[p]
+ if not entries:
+ continue
+ speedups = [s for s, _ in entries]
+ geomean = math.exp(sum(math.log(s) for s in speedups) / len(speedups))
+ best_val, best_case = max(entries)
+ worst_val, worst_case = min(entries)
+ wins = sum(1 for s in speedups if s > 1.0)
+ losses = sum(1 for s in speedups if s < 1.0)
+ total = len(speedups)
+
+ print(f"\n{p} vs native ({wins}/{total} faster, {losses}/{total} slower):")
+ print(f" geomean speedup: {geomean:.2f}x")
+ print(f" best: {best_val:.2f}x ({best_case})")
+ print(f" worst: {worst_val:.2f}x ({worst_case})")
+
+ summary_rows.append(
+ {
+ "op": op_name,
+ "provider": p,
+ "geomean_speedup": f"{geomean:.2f}",
+ "best_speedup": f"{best_val:.2f}",
+ "best_case": best_case,
+ "worst_speedup": f"{worst_val:.2f}",
+ "worst_case": worst_case,
+ "wins": str(wins),
+ "losses": str(losses),
+ "total": str(total),
+ }
+ )
+
+ return detail_rows, summary_rows, header_cols
+
+
+def write_csv(path: str, rows: list[dict[str, str]], fieldnames: list[str]):
+ with open(path, "w", newline="") as f:
+ writer = csv.DictWriter(f, fieldnames=fieldnames)
+ writer.writeheader()
+ writer.writerows(rows)
+
+
+def save_results(
+ save_dir: str,
+ op_name: str,
+ detail_rows: list[dict[str, str]],
+ header_cols: list[str],
+ all_summary_rows: list[dict[str, str]],
+ metadata: dict[str, str],
+):
+ write_csv(
+ os.path.join(save_dir, f"{op_name}_detail.csv"),
+ detail_rows,
+ header_cols,
+ )
+ if all_summary_rows:
+ write_csv(
+ os.path.join(save_dir, "summary.csv"),
+ all_summary_rows,
+ list(all_summary_rows[0].keys()),
+ )
+ write_csv(
+ os.path.join(save_dir, "metadata.csv"),
+ [metadata],
+ list(metadata.keys()),
+ )
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description="Benchmark vLLM IR ops")
+ parser.add_argument(
+ "--ops",
+ type=str,
+ default=None,
+ help="Comma-separated list of op names to benchmark (substring match)",
+ )
+ parser.add_argument(
+ "--no-cuda-graph",
+ action="store_true",
+ help="Disable CUDA graph; use do_bench with L2 cache flushing instead",
+ )
+ parser.add_argument(
+ "--warmup",
+ type=int,
+ default=25,
+ help="Warmup time in ms (do_bench) or ignored with CUDA graph (default: 25)",
+ )
+ parser.add_argument(
+ "--rep",
+ type=int,
+ default=100,
+ help="Repetition time in ms (do_bench) or number of graph replays "
+ "(do_bench_cudagraph) (default: 100)",
+ )
+ parser.add_argument(
+ "--save-path",
+ type=str,
+ default=None,
+ help="Directory to save results (default: auto-created temp dir)",
+ )
+ return parser.parse_args()
+
+
+def main():
+ args = parse_args()
+ cfg = BenchConfig(
+ use_cuda_graph=not args.no_cuda_graph,
+ warmup=args.warmup,
+ rep=args.rep,
+ )
+
+ torch.set_default_device(current_platform.device_type)
+
+ metadata = collect_env_metadata(cfg)
+ print_metadata(metadata)
+
+ timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+ save_dir = args.save_path or os.path.join(
+ tempfile.gettempdir(), f"vllm_ir_bench_{timestamp}"
+ )
+ os.makedirs(save_dir, exist_ok=True)
+
+ op_filters = [f.strip() for f in args.ops.split(",")] if args.ops else None
+ all_summary_rows: list[dict[str, str]] = []
+
+ for op in IrOp.registry.values():
+ if op_filters and not any(f in op.name for f in op_filters):
+ continue
+ if not op.has_input_generator:
+ print(f"Skipping op '{op.name}': no input generator registered")
+ continue
+ if op.name not in SHAPE_CONFIGS:
+ raise RuntimeError(
+ f"No benchmark shape config for op '{op.name}'. "
+ f"Add it to benchmarks/kernels/ir/shapes.py"
+ )
+
+ case_names, providers, results = collect_timings(
+ op, SHAPE_CONFIGS[op.name], cfg
+ )
+ detail_rows, summary_rows, header_cols = analyze_results(
+ op.name, case_names, providers, results
+ )
+ all_summary_rows.extend(summary_rows)
+
+ save_results(
+ save_dir,
+ op.name,
+ detail_rows,
+ header_cols,
+ all_summary_rows,
+ metadata,
+ )
+
+ print(f"\nResults saved to: {save_dir}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/kernels/ir/shapes.py b/benchmarks/kernels/ir/shapes.py
new file mode 100644
index 000000000000..6cc44cf6cec1
--- /dev/null
+++ b/benchmarks/kernels/ir/shapes.py
@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Shape configurations for IR op benchmarks.
+"""
+
+import torch
+
+NUM_TOKENS = [1, 2, 4, 16, 64, 256, 1024, 4096, 16384]
+COMMON_HIDDEN_SIZES = [
+ 2048, # Llama 3.2 1B, Qwen 3 MoE 30B-A3B, Gemma 3n
+ 3072, # Gemma 7B/9B
+ 4096, # Llama 3 8B, Qwen 3 8B, Mistral 7B
+ 5120, # Llama 4 Scout 17B-16E
+ 7168, # DeepSeek V3
+ 8192, # Llama 3 70B
+ 16384, # Llama 3 405B
+]
+
+# Each entry maps an op name to a list of kwarg dicts that will be passed
+# to that op's registered input generator via op.generate_inputs(**kwargs).
+SHAPE_CONFIGS: dict[str, list[dict]] = {
+ "rms_norm": [
+ {"num_tokens": n, "hidden_size": d, "dtype": dtype}
+ for dtype in [torch.float16, torch.bfloat16, torch.float32]
+ for d in COMMON_HIDDEN_SIZES
+ for n in NUM_TOKENS
+ ],
+}
diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
index 881039f43f07..2f56099c66fd 100644
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@@ -217,6 +217,7 @@ async def send_request(
min_tokens: int | None = None,
max_tokens: int | None = None,
timeout_sec: int = 120,
+ conversation_id: str | None = None,
) -> ServerResponse:
payload = {
"model": model,
@@ -225,6 +226,9 @@ async def send_request(
"temperature": 0.0,
}
+ if conversation_id is not None:
+ payload["conversation_id"] = conversation_id
+
if stream:
payload["stream"] = True
payload["stream_options"] = {"include_usage": False}
@@ -419,6 +423,7 @@ async def send_turn(
min_tokens,
max_tokens,
req_args.timeout_sec,
+ conversation_id=conv_id,
)
if response.valid is False:
@@ -1468,6 +1473,12 @@ async def main() -> None:
"(for example: --warmup-percentages=0%%,50%%)",
)
+ parser.add_argument(
+ "--trust-remote-code",
+ action="store_true",
+ help="Trust remote code when loading the tokenizer.",
+ )
+
args = parser.parse_args()
logger.info(args)
@@ -1510,7 +1521,9 @@ async def main() -> None:
np.random.seed(args.seed)
logger.info("Loading tokenizer")
- tokenizer = AutoTokenizer.from_pretrained(args.model)
+ tokenizer = AutoTokenizer.from_pretrained(
+ args.model, trust_remote_code=args.trust_remote_code
+ )
await get_server_info(args.url)
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 8535186cc1ec..ffab4015f495 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -1,7 +1,7 @@
include(FetchContent)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_EXTENSIONS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -32,18 +32,23 @@ else()
"-DVLLM_CPU_EXTENSION")
# locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
- # and create a local shim dir with it
+ # and create a local shim dir with it. When PyTorch is built from source or packaged
+ # by a distro (common on RISC-V, s390x, Fedora/RHEL aarch64), no vendored libgomp
+ # exists and the shim dir is empty; fall back to the system libgomp in that case.
vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
- find_library(OPEN_MP
- NAMES gomp
- PATHS ${VLLM_TORCH_GOMP_SHIM_DIR}
- NO_DEFAULT_PATH
- REQUIRED
- )
- # Set LD_LIBRARY_PATH to include the shim dir at build time to use the same libgomp as PyTorch
- if (OPEN_MP)
+ if(VLLM_TORCH_GOMP_SHIM_DIR)
+ find_library(OPEN_MP
+ NAMES gomp
+ PATHS "${VLLM_TORCH_GOMP_SHIM_DIR}"
+ NO_DEFAULT_PATH
+ REQUIRED
+ )
+ # Use the same libgomp as PyTorch at runtime
set(ENV{LD_LIBRARY_PATH} "${VLLM_TORCH_GOMP_SHIM_DIR}:$ENV{LD_LIBRARY_PATH}")
+ else()
+ # Fall back to system / toolchain libgomp
+ find_library(OPEN_MP NAMES gomp REQUIRED)
endif()
endif()
@@ -190,10 +195,12 @@ elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
endif()
if(VLLM_RVV_VLEN AND VLLM_RVV_VLEN GREATER 0)
message(STATUS "RISC-V RVV VLEN=${VLLM_RVV_VLEN}")
+ # Sources gate FP16/BF16 paths on the compiler-provided
+ # __riscv_zvfh / __riscv_zvfbfmin macros, which GCC and clang
+ # define automatically when those extensions appear in -march.
if(RVV_BF16_FOUND)
message(STATUS "BF16 extension detected")
set(MARCH_FLAGS -march=rv64gcv_zvfh_zfbfmin_zvfbfmin_zvl${VLLM_RVV_VLEN}b -mrvv-vector-bits=zvl -mabi=lp64d)
- add_compile_definitions(RISCV_BF16_SUPPORT)
elseif(RVV_FP16_FOUND)
message(WARNING "BF16 functionality is not available")
set(MARCH_FLAGS -march=rv64gcv_zvfh_zvl${VLLM_RVV_VLEN}b -mrvv-vector-bits=zvl -mabi=lp64d)
@@ -321,14 +328,6 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND
set(ONEDNN_VERBOSE "ON")
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
- # TODO: Refactor this
- if (ENABLE_X86_ISA)
- # Note: only enable oneDNN for AVX512
- list(APPEND DNNL_COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512})
- else()
- list(APPEND DNNL_COMPILE_FLAGS ${CXX_COMPILE_FLAGS})
- endif()
-
set(VLLM_BUILD_TYPE ${CMAKE_BUILD_TYPE})
set(CMAKE_BUILD_TYPE "Release") # remove oneDNN debug symbols to reduce size
FetchContent_MakeAvailable(oneDNN)
@@ -341,8 +340,14 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND
PRIVATE ${oneDNN_SOURCE_DIR}/src
)
target_link_libraries(dnnl_ext dnnl torch)
- target_compile_options(dnnl_ext PRIVATE ${DNNL_COMPILE_FLAGS} -fPIC)
+ if (ENABLE_X86_ISA)
+ target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS_AVX2} -fPIC)
+ else()
+ target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC)
+ endif()
list(APPEND LIBS dnnl_ext)
+
+
set(USE_ONEDNN ON)
else()
set(USE_ONEDNN OFF)
@@ -406,12 +411,15 @@ endif()
if (ENABLE_X86_ISA)
set(VLLM_EXT_SRC_SGL
+ "csrc/cpu/sgl-kernels/fla.cpp"
+ "csrc/cpu/sgl-kernels/conv.cpp"
"csrc/cpu/sgl-kernels/gemm.cpp"
"csrc/cpu/sgl-kernels/gemm_int8.cpp"
"csrc/cpu/sgl-kernels/gemm_fp8.cpp"
"csrc/cpu/sgl-kernels/gemm_int4.cpp"
"csrc/cpu/sgl-kernels/moe.cpp"
"csrc/cpu/sgl-kernels/moe_int8.cpp"
+ "csrc/cpu/sgl-kernels/moe_int4.cpp"
"csrc/cpu/sgl-kernels/moe_fp8.cpp")
set(VLLM_EXT_SRC_AVX512
@@ -430,10 +438,11 @@ if (ENABLE_X86_ISA)
"csrc/cpu/pos_encoding.cpp"
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
- set(VLLM_EXT_SRC_AVX2
+ set(VLLM_EXT_SRC_AVX2
"csrc/cpu/utils.cpp"
"csrc/cpu/spec_decode_utils.cpp"
"csrc/cpu/cpu_attn.cpp"
+ "csrc/cpu/dnnl_kernels.cpp"
"csrc/cpu/torch_bindings.cpp"
# TODO: Remove these files
"csrc/cpu/activation.cpp"
@@ -448,7 +457,7 @@ if (ENABLE_X86_ISA)
set(_C_LIBS numa dnnl_ext)
set(_C_AVX512_LIBS numa dnnl_ext)
- set(_C_AVX2_LIBS numa)
+ set(_C_AVX2_LIBS numa dnnl_ext)
# AMX + AVX512F + AVX512BF16 + AVX512VNNI
define_extension_target(
diff --git a/cmake/external_projects/deepgemm.cmake b/cmake/external_projects/deepgemm.cmake
index c3a48a64fc77..183c42dc7953 100644
--- a/cmake/external_projects/deepgemm.cmake
+++ b/cmake/external_projects/deepgemm.cmake
@@ -20,7 +20,7 @@ else()
FetchContent_Declare(
deepgemm
GIT_REPOSITORY https://github.com/deepseek-ai/DeepGEMM.git
- GIT_TAG 477618cd51baffca09c4b0b87e97c03fe827ef03
+ GIT_TAG 891d57b4db1071624b5c8fa0d1e51cb317fa709f
GIT_SUBMODULES "third-party/cutlass" "third-party/fmt"
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
@@ -53,49 +53,80 @@ cuda_archs_loose_intersection(DEEPGEMM_ARCHS
if(DEEPGEMM_ARCHS)
message(STATUS "DeepGEMM CUDA architectures: ${DEEPGEMM_ARCHS}")
- find_package(CUDAToolkit REQUIRED)
-
#
- # Build the _C pybind11 extension from DeepGEMM's C++ source.
- # This is a CXX-only module — CUDA kernels are JIT-compiled at runtime.
+ # DeepGEMM integration notes
+ # --------------------------
+ # We vendor DeepGEMM into vllm/third_party/deep_gemm/ and bundle a
+ # `_C.cpython-X.Y-*.so` for every CPython in `requires-python`. The
+ # per-Python build is delegated to tools/build_deepgemm_C.py.
+ #
+ # Why per-Python: DeepGEMM's binding uses PYBIND11_MODULE, which links
+ # private CPython symbols — a single `_C.abi3.so` is not viable today
+ # (see #41476 / #41512 for the failed attempt).
+ #
+ # TODOs (tracked in vllm-project/vllm#42431):
+ # - Replace DeepGEMM's pybind11 binding with a TORCH_LIBRARY + shim
+ # binding (cf. vllm-flash-attention/csrc/common/pytorch_shim.h) to
+ # collapse to one `_C.abi3.so`. Needs either an upstream change or
+ # a maintained binding fork in vLLM.
+ # - AOT-compile DeepGEMM's CUDA kernels instead of runtime JIT to drop
+ # the vendored CUTLASS/CCCL headers and the CUDA-toolkit-at-runtime
+ # requirement.
#
- Python_add_library(_deep_gemm_C MODULE WITH_SOABI
- "${deepgemm_SOURCE_DIR}/csrc/python_api.cpp")
-
- # The pybind11 module name must be _C to match DeepGEMM's Python imports.
- set_target_properties(_deep_gemm_C PROPERTIES OUTPUT_NAME "_C")
-
- target_compile_definitions(_deep_gemm_C PRIVATE
- "-DTORCH_EXTENSION_NAME=_C")
-
- target_include_directories(_deep_gemm_C PRIVATE
- "${deepgemm_SOURCE_DIR}/csrc"
- "${deepgemm_SOURCE_DIR}/deep_gemm/include"
- "${deepgemm_SOURCE_DIR}/third-party/cutlass/include"
- "${deepgemm_SOURCE_DIR}/third-party/cutlass/tools/util/include"
- "${deepgemm_SOURCE_DIR}/third-party/fmt/include")
-
- target_compile_options(_deep_gemm_C PRIVATE
- $<$:-std=c++17>
- $<$:-O3>
- $<$:-Wno-psabi>
- $<$:-Wno-deprecated-declarations>)
-
- # torch_python is required because DeepGEMM uses pybind11 type casters
- # for at::Tensor (via PYBIND11_MODULE), unlike vLLM's own extensions which
- # use torch::Library custom ops.
- find_library(TORCH_PYTHON_LIBRARY torch_python
- PATHS "${TORCH_INSTALL_PREFIX}/lib"
- REQUIRED)
-
- target_link_libraries(_deep_gemm_C PRIVATE
- torch ${TORCH_LIBRARIES} "${TORCH_PYTHON_LIBRARY}"
- CUDA::cudart CUDA::nvrtc)
-
- # Install the shared library into the vendored package directory
- install(TARGETS _deep_gemm_C
- LIBRARY DESTINATION vllm/third_party/deep_gemm
- COMPONENT _deep_gemm_C)
+
+ # DEEPGEMM_PYTHON_INTERPRETERS: ":"-separated target Python paths.
+ # Empty/unset → fall back to the build interpreter (editable installs).
+ # (Empty-but-set env vars test as DEFINED in cmake — treat as unset.)
+ if(NOT "$ENV{DEEPGEMM_PYTHON_INTERPRETERS}" STREQUAL "")
+ string(REPLACE ":" ";" _dg_pythons "$ENV{DEEPGEMM_PYTHON_INTERPRETERS}")
+ else()
+ set(_dg_pythons "${Python_EXECUTABLE}")
+ endif()
+ message(STATUS "DeepGEMM _C will be built for: ${_dg_pythons}")
+
+ # add_custom_command does no implicit header scanning; glob explicitly so
+ # header-only edits in DeepGEMM/cutlass/fmt re-trigger the rebuild.
+ file(GLOB_RECURSE _dg_headers
+ "${deepgemm_SOURCE_DIR}/csrc/*.h"
+ "${deepgemm_SOURCE_DIR}/csrc/*.hpp"
+ "${deepgemm_SOURCE_DIR}/deep_gemm/include/*.h"
+ "${deepgemm_SOURCE_DIR}/deep_gemm/include/*.hpp"
+ "${deepgemm_SOURCE_DIR}/deep_gemm/include/*.cuh")
+
+ set(_dg_markers)
+ set(_dg_seen_soabis)
+ foreach(_pybin IN LISTS _dg_pythons)
+ execute_process(
+ COMMAND "${_pybin}" -c
+ "import sysconfig; print(sysconfig.get_config_var('SOABI'))"
+ OUTPUT_VARIABLE _dg_soabi
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ COMMAND_ERROR_IS_FATAL ANY)
+ # Dedup interpreters that resolve to the same CPython.
+ if(_dg_soabi IN_LIST _dg_seen_soabis)
+ continue()
+ endif()
+ list(APPEND _dg_seen_soabis "${_dg_soabi}")
+ set(_dg_dir "${CMAKE_CURRENT_BINARY_DIR}/deepgemm_C_${_dg_soabi}")
+ set(_dg_marker "${_dg_dir}/.built")
+ add_custom_command(
+ OUTPUT "${_dg_marker}"
+ COMMAND "${Python_EXECUTABLE}"
+ "${CMAKE_SOURCE_DIR}/tools/build_deepgemm_C.py"
+ "${deepgemm_SOURCE_DIR}" "${_dg_dir}" "${_pybin}"
+ COMMAND "${CMAKE_COMMAND}" -E touch "${_dg_marker}"
+ DEPENDS "${CMAKE_SOURCE_DIR}/tools/build_deepgemm_C.py"
+ "${deepgemm_SOURCE_DIR}/csrc/python_api.cpp"
+ ${_dg_headers}
+ COMMENT "Building DeepGEMM _C for ${_pybin}"
+ VERBATIM)
+ list(APPEND _dg_markers "${_dg_marker}")
+ install(DIRECTORY "${_dg_dir}/"
+ DESTINATION vllm/third_party/deep_gemm
+ COMPONENT _deep_gemm_C
+ FILES_MATCHING PATTERN "_C.cpython-*.so")
+ endforeach()
+ add_custom_target(_deep_gemm_C ALL DEPENDS ${_dg_markers})
#
# Vendor DeepGEMM Python package files
@@ -120,6 +151,11 @@ if(DEEPGEMM_ARCHS)
COMPONENT _deep_gemm_C
FILES_MATCHING PATTERN "*.py")
+ install(DIRECTORY "${deepgemm_SOURCE_DIR}/deep_gemm/mega/"
+ DESTINATION vllm/third_party/deep_gemm/mega
+ COMPONENT _deep_gemm_C
+ FILES_MATCHING PATTERN "*.py")
+
# Generate envs.py (normally generated by DeepGEMM's setup.py build step)
file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/deep_gemm_envs.py"
"# Pre-installed environment variables\npersistent_envs = dict()\n")
diff --git a/cmake/external_projects/flashmla.cmake b/cmake/external_projects/flashmla.cmake
index 0f16b9161fa3..65986df55012 100644
--- a/cmake/external_projects/flashmla.cmake
+++ b/cmake/external_projects/flashmla.cmake
@@ -19,7 +19,7 @@ else()
FetchContent_Declare(
flashmla
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
- GIT_TAG 692917b1cda61b93ac9ee2d846ec54e75afe87b1
+ GIT_TAG a6ec2ba7bd0a7dff98b3f4d3e6b52b159c48d78b
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake
index 4aef7282134b..b38917a7b0b5 100644
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -39,7 +39,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
- GIT_TAG f5bc33cfc02c744d24a2e9d50e6db656de40611c
+ GIT_TAG bce29425653ec0fbc579d329883030e832d15ada
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu
index 758a77795553..303433392c32 100644
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@@ -11,29 +11,74 @@
namespace vllm {
template
+ bool act_first, bool HAS_CLAMP>
__device__ __forceinline__ scalar_t compute(const scalar_t& x,
- const scalar_t& y) {
- return act_first ? ACT_FN(x) * y : x * ACT_FN(y);
+ const scalar_t& y,
+ const float limit) {
+ if constexpr (act_first) {
+ scalar_t gate = x;
+ scalar_t up = y;
+ if constexpr (HAS_CLAMP) {
+ gate = (scalar_t)fminf((float)gate, limit);
+ up = (scalar_t)fmaxf(fminf((float)up, limit), -limit);
+ }
+ return ACT_FN(gate) * up;
+ } else {
+ scalar_t gate = x;
+ scalar_t up = y;
+ if constexpr (HAS_CLAMP) {
+ gate = (scalar_t)fmaxf(fminf((float)gate, limit), -limit);
+ up = (scalar_t)fminf((float)up, limit);
+ }
+ return gate * ACT_FN(up);
+ }
}
template
+ bool act_first, bool HAS_CLAMP>
__device__ __forceinline__ packed_t packed_compute(const packed_t& x,
- const packed_t& y) {
- return act_first ? packed_mul(PACKED_ACT_FN(x), y)
- : packed_mul(x, PACKED_ACT_FN(y));
+ const packed_t& y,
+ const float limit) {
+ if constexpr (act_first) {
+ packed_t gate = x;
+ packed_t up = y;
+ if constexpr (HAS_CLAMP) {
+ float2 g = cast_to_float2(gate);
+ float2 u = cast_to_float2(up);
+ g.x = fminf(g.x, limit);
+ g.y = fminf(g.y, limit);
+ u.x = fmaxf(fminf(u.x, limit), -limit);
+ u.y = fmaxf(fminf(u.y, limit), -limit);
+ gate = cast_to_packed(g);
+ up = cast_to_packed(u);
+ }
+ return packed_mul(PACKED_ACT_FN(gate), up);
+ } else {
+ packed_t gate = x;
+ packed_t up = y;
+ if constexpr (HAS_CLAMP) {
+ float2 g = cast_to_float2(gate);
+ float2 u = cast_to_float2(up);
+ g.x = fmaxf(fminf(g.x, limit), -limit);
+ g.y = fmaxf(fminf(g.y, limit), -limit);
+ u.x = fminf(u.x, limit);
+ u.y = fminf(u.y, limit);
+ gate = cast_to_packed(g);
+ up = cast_to_packed(u);
+ }
+ return packed_mul(gate, PACKED_ACT_FN(up));
+ }
}
// Activation and gating kernel template.
template
+ bool use_vec, bool HAS_CLAMP, bool use_256b = false>
__global__ void act_and_mul_kernel(
scalar_t* __restrict__ out, // [..., d]
const scalar_t* __restrict__ input, // [..., 2, d]
- const int d) {
+ const int d, const float limit) {
const scalar_t* x_ptr = input + blockIdx.x * 2 * d;
const scalar_t* y_ptr = x_ptr + d;
scalar_t* out_ptr = out + blockIdx.x * d;
@@ -58,8 +103,9 @@ __global__ void act_and_mul_kernel(
}
#pragma unroll
for (int j = 0; j < pvec_t::NUM_ELTS; j++) {
- x.elts[j] = packed_compute(
- x.elts[j], y.elts[j]);
+ x.elts[j] =
+ packed_compute(
+ x.elts[j], y.elts[j], limit);
}
if constexpr (use_256b) {
st256(x, &out_vec[i]);
@@ -72,7 +118,8 @@ __global__ void act_and_mul_kernel(
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
const scalar_t x = VLLM_LDG(&x_ptr[idx]);
const scalar_t y = VLLM_LDG(&y_ptr[idx]);
- out_ptr[idx] = compute(x, y);
+ out_ptr[idx] =
+ compute(x, y, limit);
}
}
}
@@ -151,8 +198,11 @@ packed_gelu_tanh_kernel(const packed_t& val) {
// Launch activation and gating kernel.
// Use ACT_FIRST (bool) indicating whether to apply the activation function
-// first.
-#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL, PACKED_KERNEL, ACT_FIRST) \
+// first. HAS_CLAMP (bool) enables pre-activation clamping: gate input is
+// clamped (max only) and up input is clamped (both sides) before the
+// activation function is applied.
+#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL, PACKED_KERNEL, ACT_FIRST, \
+ HAS_CLAMP, LIMIT) \
auto dtype = input.scalar_type(); \
int d = input.size(-1) / 2; \
int64_t num_tokens = input.numel() / input.size(-1); \
@@ -177,8 +227,8 @@ packed_gelu_tanh_kernel(const packed_t& val) {
scalar_t, typename vllm::PackedTypeConverter::Type, \
KERNEL, \
PACKED_KERNEL::Type>, \
- ACT_FIRST, true, true><<>>( \
- out.data_ptr(), input.data_ptr(), d); \
+ ACT_FIRST, true, HAS_CLAMP, true><<>>( \
+ out.data_ptr(), input.data_ptr(), d, LIMIT); \
}); \
} else { \
VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] { \
@@ -186,8 +236,8 @@ packed_gelu_tanh_kernel(const packed_t& val) {
scalar_t, typename vllm::PackedTypeConverter::Type, \
KERNEL, \
PACKED_KERNEL::Type>, \
- ACT_FIRST, true, false><<>>( \
- out.data_ptr(), input.data_ptr(), d); \
+ ACT_FIRST, true, HAS_CLAMP, false><<>>( \
+ out.data_ptr(), input.data_ptr(), d, LIMIT); \
}); \
} \
} else { \
@@ -197,8 +247,8 @@ packed_gelu_tanh_kernel(const packed_t& val) {
scalar_t, typename vllm::PackedTypeConverter::Type, \
KERNEL, \
PACKED_KERNEL::Type>, \
- ACT_FIRST, false><<>>( \
- out.data_ptr(), input.data_ptr(), d); \
+ ACT_FIRST, false, HAS_CLAMP><<>>( \
+ out.data_ptr(), input.data_ptr(), d, LIMIT); \
}); \
}
@@ -206,7 +256,14 @@ void silu_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, vllm::packed_silu_kernel,
- true);
+ true, false, 0.0f);
+}
+
+void silu_and_mul_clamp(torch::Tensor& out, // [..., d]
+ torch::Tensor& input, // [..., 2 * d]
+ double limit) {
+ LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, vllm::packed_silu_kernel,
+ true, true, (float)limit);
}
void mul_and_silu(torch::Tensor& out, // [..., d]
@@ -215,21 +272,21 @@ void mul_and_silu(torch::Tensor& out, // [..., d]
// The difference between mul_and_silu and silu_and_mul is that mul_and_silu
// applies the silu to the latter half of the input.
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, vllm::packed_silu_kernel,
- false);
+ false, false, 0.0f);
}
void gelu_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel, vllm::packed_gelu_kernel,
- true);
+ true, false, 0.0f);
}
void gelu_tanh_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
{
- LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel,
- vllm::packed_gelu_tanh_kernel, true);
+ LAUNCH_ACTIVATION_GATE_KERNEL(
+ vllm::gelu_tanh_kernel, vllm::packed_gelu_tanh_kernel, true, false, 0.0f);
}
namespace vllm {
diff --git a/csrc/attention/vertical_slash_index.cu b/csrc/attention/vertical_slash_index.cu
deleted file mode 100644
index c1b45b143f4e..000000000000
--- a/csrc/attention/vertical_slash_index.cu
+++ /dev/null
@@ -1,401 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
-
-#include
-
-#include
-
-#include
-
-__device__ int64_t save_blocks(int* block_offset, int64_t range_start,
- int64_t range_end, int64_t block_size,
- int64_t input_block_count, int64_t kv_seqlen) {
- if (range_start >= kv_seqlen) {
- return input_block_count;
- }
- if (range_end > kv_seqlen) {
- range_end = kv_seqlen;
- }
- int64_t current_block_count = input_block_count;
- for (int idx = range_start; idx < range_end; idx += block_size) {
- block_offset[current_block_count++] = idx;
- }
- return current_block_count;
-}
-
-__global__ void convert_vertical_slash_indexes_kernel(
- const int* q_seqlens, // [BATCH, ]
- const int* kv_seqlens, // [BATCH, ]
- const int* vertical_indexes, // [BATCH, N_HEADS, NNZ_V]
- const int* slash_indexes, // [BATCH, N_HEADS, NNZ_S]
- int* block_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
- int* block_offset, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_S]
- int* column_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
- int* column_index, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_V]
- int64_t N_HEADS, int64_t N_ROWS, int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N,
- int64_t NNZ_V, int64_t NNZ_S,
- bool causal // True for intra, False for succ
-) {
- const int batch_idx = blockIdx.y;
- const int head_idx = blockIdx.x;
- const int group_idx = blockIdx.z;
-
- int64_t q_seqlen = q_seqlens[batch_idx];
- int64_t kv_seqlen = kv_seqlens[batch_idx];
- int64_t block_idx_m = group_idx * blockDim.x + threadIdx.x;
- int64_t start_m = block_idx_m * BLOCK_SIZE_M;
- if (start_m >= q_seqlen) {
- return;
- }
- int64_t end_m = start_m + BLOCK_SIZE_M;
- vertical_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_V;
- slash_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_S;
- int64_t row_offset = (batch_idx * N_HEADS + head_idx) * N_ROWS + block_idx_m;
- block_count += row_offset;
- block_offset += row_offset * NNZ_S;
- column_count += row_offset;
- column_index += row_offset * NNZ_V;
-
- bool has_slash = true;
- int64_t tmp_col_cnt = 0, tmp_blk_cnt = 0;
- int64_t s = 0, v = 0;
- int64_t v_idx = vertical_indexes[v++];
- int64_t s_idx = slash_indexes[s++];
- if (causal) {
- while (s_idx >= end_m + (kv_seqlen - q_seqlen) && s < NNZ_S) {
- s_idx = slash_indexes[s++];
- }
- if (s_idx > end_m + (kv_seqlen - q_seqlen)) has_slash = false;
- s_idx = max((kv_seqlen - q_seqlen) + end_m - s_idx, BLOCK_SIZE_M);
- } else {
- while (s_idx >= end_m + kv_seqlen && s < NNZ_S) {
- s_idx = slash_indexes[s++];
- }
- if (s_idx > end_m + kv_seqlen) has_slash = false;
- s_idx = max(kv_seqlen + end_m - s_idx, BLOCK_SIZE_M);
- }
-
- int64_t range_start = s_idx - BLOCK_SIZE_M, range_end = s_idx;
- if (!has_slash) {
- if (causal) {
- range_start = (kv_seqlen - q_seqlen) + end_m;
- range_end = (kv_seqlen - q_seqlen) + end_m + BLOCK_SIZE_N;
- } else {
- range_start = kv_seqlen;
- range_end = kv_seqlen + BLOCK_SIZE_N;
- }
- }
-
- bool slash_finished = false;
- while (1) {
- if (v_idx < range_end) {
- if (v_idx < range_start) {
- column_index[tmp_col_cnt++] = v_idx;
- }
- if (v < NNZ_V) {
- v_idx = vertical_indexes[v++];
- } else {
- if (causal)
- v_idx = end_m + BLOCK_SIZE_N + (kv_seqlen - q_seqlen);
- else
- v_idx = end_m + BLOCK_SIZE_N + kv_seqlen;
- }
- } else {
- if ((s < NNZ_S && causal) ||
- (s < NNZ_S && !causal && slash_indexes[s] >= start_m)) {
- if (causal)
- s_idx = max((kv_seqlen - q_seqlen) + end_m - slash_indexes[s++],
- BLOCK_SIZE_M);
- else
- s_idx = max(kv_seqlen + end_m - slash_indexes[s++], BLOCK_SIZE_M);
- } else {
- if (v == NNZ_V || (v_idx > range_start && causal)) {
- // add the last vertical if no more slash
- if (v == NNZ_V && !causal && v_idx < kv_seqlen) {
- column_index[tmp_col_cnt++] = v_idx;
- }
- tmp_blk_cnt = save_blocks(block_offset, range_start, range_end,
- BLOCK_SIZE_N, tmp_blk_cnt, kv_seqlen);
- break;
- } else {
- if (causal) {
- range_start = (kv_seqlen - q_seqlen) + end_m;
- range_end = (kv_seqlen - q_seqlen) + end_m + BLOCK_SIZE_N;
- } else {
- // if slash_finished but there are vertical left, save current
- // blocks
- tmp_blk_cnt = save_blocks(block_offset, range_start, range_end,
- BLOCK_SIZE_N, tmp_blk_cnt, kv_seqlen);
- range_start = kv_seqlen;
- range_end = kv_seqlen + BLOCK_SIZE_N;
- }
- slash_finished = true;
- }
- }
- if (!slash_finished) {
- if (s_idx > range_end + BLOCK_SIZE_M) {
- tmp_blk_cnt = save_blocks(block_offset, range_start, range_end,
- BLOCK_SIZE_N, tmp_blk_cnt, kv_seqlen);
- range_start = s_idx - BLOCK_SIZE_M;
- range_end = s_idx;
- } else if (s_idx > range_end) {
- range_end += BLOCK_SIZE_M;
- }
- }
- }
- }
-
- block_count[0] = tmp_blk_cnt;
- column_count[0] = tmp_col_cnt;
-}
-
-void convert_vertical_slash_indexes_64x64(
- const int* q_seqlens, // [BATCH, ]
- const int* kv_seqlens, // [BATCH, ]
- const int* vertical_indexes, // [BATCH, N_HEADS, NNZ_V]
- const int* slash_indexes, // [BATCH, N_HEADS, NNZ_S]
- int* block_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
- int* block_offset, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_S]
- int* column_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
- int* column_index, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_V]
- int64_t BATCH_SIZE, int64_t N_HEADS, int64_t N_ROWS, int64_t BLOCK_SIZE_M,
- int64_t BLOCK_SIZE_N, int64_t NNZ_V, int64_t NNZ_S, bool causal) {
- const int N_THREADS = 64;
- const dim3 dimBlock(N_THREADS);
- const dim3 dimGrid(N_HEADS, BATCH_SIZE, (N_ROWS + N_THREADS - 1) / N_THREADS);
- convert_vertical_slash_indexes_kernel<<>>(
- q_seqlens, kv_seqlens, vertical_indexes, slash_indexes, block_count,
- block_offset, column_count, column_index, N_HEADS, N_ROWS, BLOCK_SIZE_M,
- BLOCK_SIZE_N, NNZ_V, NNZ_S, causal);
-}
-
-/**
- * Implements the Algorithm 4 in paper https://arxiv.org/abs/2407.02490.
- *
- * This function builds the index of each row of blocks from vertical indices
- * and slash indices. The vertical indices are treated as points, while the
- * slash indices are converted as ranges. The output consists of the merged
- * ranges and separate column indices, where the ranges are represented by
- * block indices.
- *
- * The implementation is referenced from the original MInference repo:
- * https://github.com/microsoft/MInference/blob/main/csrc/vertical_slash_index.cu.
- */
-void convert_vertical_slash_indexes(
- torch::Tensor& block_count, // [BATCH, N_HEADS, NUM_ROWS]
- torch::Tensor& block_offset, // [BATCH, N_HEADS, NUM_ROWS, NNZ_S]
- torch::Tensor& column_count, // [BATCH, N_HEADS, NUM_ROWS]
- torch::Tensor& column_index, // [BATCH, N_HEADS, NUM_ROWS, NNZ_V]
- torch::Tensor q_seqlens, // [BATCH, ]
- torch::Tensor kv_seqlens, // [BATCH, ]
- torch::Tensor vertical_indexes, // [BATCH, N_HEADS, NNZ_V]
- torch::Tensor slash_indexes, // [BATCH, N_HEADS, NNZ_S]
- int64_t context_size, int64_t block_size_M, int64_t block_size_N,
- bool causal) {
- cudaSetDevice(q_seqlens.get_device());
-
- int batch_size = slash_indexes.size(0);
- int num_heads = slash_indexes.size(1);
- int nnz_slash = slash_indexes.size(2);
- int nnz_vertical = vertical_indexes.size(2);
- int num_rows = (context_size + block_size_M - 1) / block_size_M;
-
- convert_vertical_slash_indexes_64x64(
- q_seqlens.data_ptr(), kv_seqlens.data_ptr(),
- vertical_indexes.data_ptr(), slash_indexes.data_ptr(),
- block_count.data_ptr(), block_offset.data_ptr(),
- column_count.data_ptr(), column_index.data_ptr(), batch_size,
- num_heads, num_rows, block_size_M, block_size_N, nnz_vertical, nnz_slash,
- causal);
-}
-
-__global__ void convert_vertical_slash_indexes_kernel_mergehead(
- const int* q_seqlens, // [BATCH, ]
- const int* kv_seqlens, // [BATCH, ]
- const int* vertical_indexes, // [BATCH, N_HEADS, NNZ_V]
- const int* slash_indexes, // [BATCH, N_HEADS, NNZ_S]
- const int* per_head_vertical_topkv, const int* per_head_slash_topkv,
- int* block_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
- int* block_offset, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_S]
- int* column_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
- int* column_index, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_V]
- int64_t N_HEADS, int64_t N_ROWS, int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N,
- int64_t NNZ_V, int64_t NNZ_S,
- bool causal // True for intra, False for succ
-) {
- const int batch_idx = blockIdx.y;
- const int head_idx = blockIdx.x;
- const int group_idx = blockIdx.z;
-
- int64_t q_seqlen = q_seqlens[batch_idx];
- int64_t kv_seqlen = kv_seqlens[batch_idx];
- int64_t block_idx_m = group_idx * blockDim.x + threadIdx.x;
- int64_t start_m = block_idx_m * BLOCK_SIZE_M;
- if (start_m >= q_seqlen) {
- return;
- }
- int64_t end_m = start_m + BLOCK_SIZE_M;
- vertical_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_V;
- slash_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_S;
- int64_t row_offset = (batch_idx * N_HEADS + head_idx) * N_ROWS + block_idx_m;
- block_count += row_offset;
- block_offset += row_offset * NNZ_S;
- column_count += row_offset;
- column_index += row_offset * NNZ_V;
-
- // MergeHead: each head has it's unique max topk NNZ_V,NNZ_S. (NNZ_V,NNZ_S
- // above is buffer size, use to compute offset)
- NNZ_S = per_head_slash_topkv[head_idx];
- NNZ_V = per_head_vertical_topkv[head_idx];
-
- bool has_slash = true;
- int64_t tmp_col_cnt = 0, tmp_blk_cnt = 0;
- int64_t s = 0, v = 0;
- int64_t v_idx = vertical_indexes[v++];
- int64_t s_idx = slash_indexes[s++];
- if (causal) {
- while (s_idx >= end_m + (kv_seqlen - q_seqlen) && s < NNZ_S) {
- s_idx = slash_indexes[s++];
- }
- if (s_idx > end_m + (kv_seqlen - q_seqlen)) has_slash = false;
- s_idx = max((kv_seqlen - q_seqlen) + end_m - s_idx, BLOCK_SIZE_M);
- } else {
- while (s_idx >= end_m + kv_seqlen && s < NNZ_S) {
- s_idx = slash_indexes[s++];
- }
- if (s_idx > end_m + kv_seqlen) has_slash = false;
- s_idx = max(kv_seqlen + end_m - s_idx, BLOCK_SIZE_M);
- }
-
- int64_t range_start = s_idx - BLOCK_SIZE_M, range_end = s_idx;
- if (!has_slash) {
- if (causal) {
- range_start = (kv_seqlen - q_seqlen) + end_m;
- range_end = (kv_seqlen - q_seqlen) + end_m + BLOCK_SIZE_N;
- } else {
- range_start = kv_seqlen;
- range_end = kv_seqlen + BLOCK_SIZE_N;
- }
- }
-
- bool slash_finished = false;
- while (1) {
- if (v_idx < range_end) {
- if (v_idx < range_start) {
- column_index[tmp_col_cnt++] = v_idx;
- }
- if (v < NNZ_V) {
- v_idx = vertical_indexes[v++];
- } else {
- if (causal)
- v_idx = end_m + BLOCK_SIZE_N + (kv_seqlen - q_seqlen);
- else
- v_idx = end_m + BLOCK_SIZE_N + kv_seqlen;
- }
- } else {
- if ((s < NNZ_S && causal) ||
- (s < NNZ_S && !causal && slash_indexes[s] >= start_m)) {
- if (causal)
- s_idx = max((kv_seqlen - q_seqlen) + end_m - slash_indexes[s++],
- BLOCK_SIZE_M);
- else
- s_idx = max(kv_seqlen + end_m - slash_indexes[s++], BLOCK_SIZE_M);
- } else {
- if (v == NNZ_V || (v_idx > range_start && causal)) {
- // add the last vertical if no more slash
- if (v == NNZ_V && !causal && v_idx < kv_seqlen) {
- column_index[tmp_col_cnt++] = v_idx;
- }
- tmp_blk_cnt = save_blocks(block_offset, range_start, range_end,
- BLOCK_SIZE_N, tmp_blk_cnt, kv_seqlen);
- break;
- } else {
- if (causal) {
- range_start = (kv_seqlen - q_seqlen) + end_m;
- range_end = (kv_seqlen - q_seqlen) + end_m + BLOCK_SIZE_N;
- } else {
- // if slash_finished but there are vertical left, save current
- // blocks
- tmp_blk_cnt = save_blocks(block_offset, range_start, range_end,
- BLOCK_SIZE_N, tmp_blk_cnt, kv_seqlen);
- range_start = kv_seqlen;
- range_end = kv_seqlen + BLOCK_SIZE_N;
- }
- slash_finished = true;
- }
- }
- if (!slash_finished) {
- if (s_idx > range_end + BLOCK_SIZE_M) {
- tmp_blk_cnt = save_blocks(block_offset, range_start, range_end,
- BLOCK_SIZE_N, tmp_blk_cnt, kv_seqlen);
- range_start = s_idx - BLOCK_SIZE_M;
- range_end = s_idx;
- } else if (s_idx > range_end) {
- range_end += BLOCK_SIZE_M;
- }
- }
- }
- }
-
- block_count[0] = tmp_blk_cnt;
- column_count[0] = tmp_col_cnt;
-}
-
-void convert_vertical_slash_indexes_64x64_mergehead(
- const int* q_seqlens, // [BATCH, ]
- const int* kv_seqlens, // [BATCH, ]
- const int* vertical_indexes, // [BATCH, N_HEADS, NNZ_V]
- const int* slash_indexes, // [BATCH, N_HEADS, NNZ_S]
- int* per_head_vertical_topkv, int* per_head_slash_topkv,
- int* block_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
- int* block_offset, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_S]
- int* column_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
- int* column_index, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_V]
- int64_t BATCH_SIZE, int64_t N_HEADS, int64_t N_ROWS, int64_t BLOCK_SIZE_M,
- int64_t BLOCK_SIZE_N, int64_t NNZ_V, int64_t NNZ_S, bool causal) {
- const int N_THREADS = 64;
- const dim3 dimBlock(N_THREADS);
- const dim3 dimGrid(N_HEADS, BATCH_SIZE, (N_ROWS + N_THREADS - 1) / N_THREADS);
- convert_vertical_slash_indexes_kernel_mergehead<<>>(
- q_seqlens, kv_seqlens, vertical_indexes, slash_indexes,
- per_head_vertical_topkv, per_head_slash_topkv, block_count, block_offset,
- column_count, column_index, N_HEADS, N_ROWS, BLOCK_SIZE_M, BLOCK_SIZE_N,
- NNZ_V, NNZ_S, causal);
-}
-
-/**
- * Implements the Algorithm 4 in paper https://arxiv.org/abs/2407.02490.
- *
- * Like the above convert_vertical_slash_indexes, but with
- * pre-computed vertical and slash counts.
- */
-void convert_vertical_slash_indexes_mergehead(
- torch::Tensor& block_count, // [BATCH, N_HEADS, NUM_ROWS]
- torch::Tensor& block_offset, // [BATCH, N_HEADS, NUM_ROWS, NNZ_S]
- torch::Tensor& column_count, // [BATCH, N_HEADS, NUM_ROWS]
- torch::Tensor& column_index, // [BATCH, N_HEADS, NUM_ROWS, NNZ_V]
- torch::Tensor q_seqlens, // [BATCH, ]
- torch::Tensor kv_seqlens, // [BATCH, ]
- torch::Tensor vertical_indexes, // [BATCH, N_HEADS, NNZ_V]
- torch::Tensor slash_indexes, // [BATCH, N_HEADS, NNZ_S]
- torch::Tensor vertical_indices_count, // [N_HEADS, ]
- torch::Tensor slash_indices_count, // [N_HEADS, ]
- int64_t context_size, int64_t block_size_M, int64_t block_size_N,
- bool causal) {
- cudaSetDevice(q_seqlens.get_device());
-
- int batch_size = slash_indexes.size(0);
- int num_heads = slash_indexes.size(1);
- int nnz_slash = slash_indexes.size(2);
- int nnz_vertical = vertical_indexes.size(2);
- int num_rows = (context_size + block_size_M - 1) / block_size_M;
-
- convert_vertical_slash_indexes_64x64_mergehead(
- q_seqlens.data_ptr(), kv_seqlens.data_ptr(),
- vertical_indexes.data_ptr(), slash_indexes.data_ptr(),
- vertical_indices_count.data_ptr(),
- slash_indices_count.data_ptr(), block_count.data_ptr(),
- block_offset.data_ptr(), column_count.data_ptr(),
- column_index.data_ptr(), batch_size, num_heads, num_rows,
- block_size_M, block_size_N, nnz_vertical, nnz_slash, causal);
-}
diff --git a/csrc/cache.h b/csrc/cache.h
index 821d5e719a44..a9e74b0dc2df 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -12,7 +12,8 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
void swap_blocks_batch(const torch::Tensor& src_ptrs,
const torch::Tensor& dst_ptrs,
- const torch::Tensor& sizes);
+ const torch::Tensor& sizes,
+ bool is_src_access_order_any);
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
torch::Tensor& key_cache, torch::Tensor& value_cache,
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index 6bea5abc3dfb..9130dd2ccae7 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -77,7 +77,8 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
void swap_blocks_batch(const torch::Tensor& src_ptrs,
const torch::Tensor& dst_ptrs,
- const torch::Tensor& sizes) {
+ const torch::Tensor& sizes,
+ bool is_src_access_order_any) {
TORCH_CHECK(src_ptrs.device().is_cpu(), "src_ptrs must be on CPU");
TORCH_CHECK(dst_ptrs.device().is_cpu(), "dst_ptrs must be on CPU");
TORCH_CHECK(sizes.device().is_cpu(), "sizes must be on CPU");
@@ -97,13 +98,13 @@ void swap_blocks_batch(const torch::Tensor& src_ptrs,
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
- // Use cuMemcpyBatchAsync (CUDA 12.8+) to submit all copies in a single
- // driver call, amortizing per-copy submission overhead.
- // int64_t and CUdeviceptr/size_t are both 8 bytes on 64-bit platforms,
- // so we reinterpret_cast the tensor data directly to avoid copies.
- static_assert(sizeof(CUdeviceptr) == sizeof(int64_t));
+ // Use cuMemcpyBatchAsync / hipMemcpyBatchAsync to submit all copies in a
+ // single driver call, amortizing per-copy submission overhead. int64_t
+ // and CUdeviceptr/void*/size_t are all 8 bytes on 64-bit platforms, so we
+ // reinterpret_cast the tensor data directly to avoid copies.
static_assert(sizeof(size_t) == sizeof(int64_t));
#if !defined(USE_ROCM) && defined(CUDA_VERSION) && CUDA_VERSION >= 12080
+ static_assert(sizeof(CUdeviceptr) == sizeof(int64_t));
// Resolve cuMemcpyBatchAsync at runtime via cuGetProcAddress so that
// binaries compiled with CUDA 12.8+ still work on older drivers, and
// we avoid the CUDA 13.0 header remapping (#define to _v2 signature).
@@ -124,7 +125,12 @@ void swap_blocks_batch(const torch::Tensor& src_ptrs,
if (batch_fn != nullptr) {
CUmemcpyAttributes attr = {};
- attr.srcAccessOrder = CU_MEMCPY_SRC_ACCESS_ORDER_STREAM;
+ // ANY lets the DMA engine prefetch source bytes out of stream order,
+ // which is only safe when no GPU stream is concurrently writing the
+ // source.
+ attr.srcAccessOrder = is_src_access_order_any
+ ? CU_MEMCPY_SRC_ACCESS_ORDER_ANY
+ : CU_MEMCPY_SRC_ACCESS_ORDER_STREAM;
size_t attrs_idx = 0;
size_t fail_idx = 0;
CUresult result = batch_fn(reinterpret_cast(dst_data),
@@ -134,12 +140,30 @@ void swap_blocks_batch(const torch::Tensor& src_ptrs,
&fail_idx, static_cast(stream));
TORCH_CHECK(result == CUDA_SUCCESS, "cuMemcpyBatchAsync failed at index ",
fail_idx, " with error ", result);
- } else
+ return;
+ }
+#elif defined(USE_ROCM) && defined(HIP_VERSION) && HIP_VERSION >= 70100000
+ // ROCm 7.1+ exposes hipMemcpyBatchAsync. The 7.2.1 implementation early-
+ // returns hipErrorNotSupported whenever numAttrs > 0 (see ROCm/clr @
+ // rocm-7.2.1 hipamd/src/hip_memory.cpp:2819-2822), so call with
+ // numAttrs=0.
+ {
+ hipMemcpyAttributes attr = {};
+ size_t attrs_idx = 0;
+ size_t fail_idx = 0;
+ hipError_t result = hipMemcpyBatchAsync(
+ reinterpret_cast(dst_data), reinterpret_cast(src_data),
+ reinterpret_cast(size_data), static_cast(n), &attr,
+ &attrs_idx, 0, &fail_idx, static_cast(stream));
+ TORCH_CHECK(result == hipSuccess, "hipMemcpyBatchAsync failed at index ",
+ fail_idx, " with error ", result);
+ return;
+ }
#endif
{
- // Fallback for CUDA < 12.8, older drivers, and ROCm:
- // individual async copies.
- // cudaMemcpyDefault lets the driver infer direction from pointer types.
+ // Fallback for CUDA < 12.8, older CUDA drivers, and ROCm < 7.1:
+ // individual async copies. cudaMemcpyDefault lets the driver infer
+ // direction from pointer types.
for (int64_t i = 0; i < n; i++) {
cudaMemcpyAsync(reinterpret_cast(dst_data[i]),
reinterpret_cast(src_data[i]),
@@ -599,6 +623,11 @@ __global__ void cp_gather_indexer_k_quant_cache_kernel(
const int head_idx = (blockIdx.y * blockDim.x + threadIdx.x) * VEC_SIZE;
// Find batch index within a block
__shared__ int batch_idx[BLOCK_Y_SIZE];
+ if (threadIdx.x == 0) {
+ batch_idx[threadIdx.y] = -1;
+ }
+ __syncthreads();
+
for (int iter = 0; iter < cuda_utils::ceil_div(batch_size, int(blockDim.x));
iter++) {
int tid = iter * blockDim.x + threadIdx.x;
@@ -611,16 +640,18 @@ __global__ void cp_gather_indexer_k_quant_cache_kernel(
}
}
-#ifndef USE_ROCM
- __syncwarp();
-#endif
+ __syncthreads();
- if (head_idx >= head_dim || token_idx >= num_tokens) {
+ // num_tokens may be an allocation upper bound when Python avoids a D2H sync.
+ // Only tokens covered by the exact device-side cu_seq_lens are valid to
+ // gather.
+ const int batch = batch_idx[threadIdx.y];
+ if (head_idx >= head_dim || token_idx >= num_tokens || batch < 0) {
return;
}
- const int inbatch_seq_idx = token_idx - cu_seq_lens[batch_idx[threadIdx.y]];
- const int block_idx = block_table[batch_idx[threadIdx.y] * num_blocks +
- inbatch_seq_idx / cache_block_size];
+ const int inbatch_seq_idx = token_idx - cu_seq_lens[batch];
+ const int block_idx =
+ block_table[batch * num_blocks + inbatch_seq_idx / cache_block_size];
const int64_t src_block_offset = block_idx * block_stride;
const int64_t cache_inblock_offset =
(inbatch_seq_idx % cache_block_size) * head_dim + head_idx;
@@ -1490,6 +1521,9 @@ void concat_mla_q(torch::Tensor& ql_nope, // [num_tokens, num_heads, nope_dim]
TORCH_CHECK(ql_nope.stride(2) == 1, "ql_nope must have stride 1 in dim 2");
TORCH_CHECK(q_pe.stride(2) == 1, "q_pe must have stride 1 in dim 2");
TORCH_CHECK(q_out.stride(2) == 1, "q_out must have stride 1 in dim 2");
+ TORCH_CHECK(ql_nope.scalar_type() == at::ScalarType::Half ||
+ ql_nope.scalar_type() == at::ScalarType::BFloat16,
+ "ql_nope must be float16 or bfloat16 dtype");
if (num_tokens == 0) return;
@@ -1501,7 +1535,7 @@ void concat_mla_q(torch::Tensor& ql_nope, // [num_tokens, num_heads, nope_dim]
const at::cuda::OptionalCUDAGuard device_guard(device_of(ql_nope));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
- VLLM_DISPATCH_FLOATING_TYPES(ql_nope.scalar_type(), "concat_mla_q", [&] {
+ VLLM_DISPATCH_HALF_TYPES(ql_nope.scalar_type(), "concat_mla_q", [&] {
vllm::ConcatMLAQKernel<<>>(
q_out.data_ptr(), ql_nope.data_ptr(),
q_pe.data_ptr(), num_tokens, num_heads, q_out.stride(0),
diff --git a/csrc/cache_kernels_fused.cu b/csrc/cache_kernels_fused.cu
index be037b2fdec2..8687ebe1f14c 100644
--- a/csrc/cache_kernels_fused.cu
+++ b/csrc/cache_kernels_fused.cu
@@ -21,28 +21,33 @@ namespace vllm {
// NOTE Be EXTRA careful with raw_kv_scalar_t, for __half and __nv_bfloat16 it's
// using u16 as the backing type.
-template
+template
__global__ void concat_and_cache_mla_rope_fused_kernel(
const int64_t* __restrict__ positions, // [num_tokens]
qk_t* __restrict__ q_pe, // [num_tokens, num_q_heads, rot_dim]
qk_t* __restrict__ k_pe, // [num_tokens, rot_dim]
const qk_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank]
- const qk_t* __restrict__ rope_cos_sin_cache, // [max_position, 2,
- // rot_dim // 2]
+ const cos_sin_t* __restrict__ rope_cos_sin_cache, // [max_position, 2,
+ // rot_dim // 2]
const int rot_dim, const int64_t q_pe_stride_token,
const int64_t q_pe_stride_head, const int64_t k_pe_stride,
const int64_t kv_c_stride, const int num_q_heads,
cache_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank +
// rot_dim)]
- const int64_t* __restrict__ kv_cache_slot_mapping, // [num_tokens]
+ const int64_t* __restrict__ slot_mapping, // [num_tokens]
const int block_stride, const int entry_stride, const int kv_lora_rank,
const int block_size, const float* kv_cache_quant_scale) {
// Each thread block is responsible for one token.
const int64_t token_idx = blockIdx.x;
+ const int64_t slot_idx = slot_mapping[token_idx];
+ // NOTE: slot_idx can be -1 if the token is padded
+ if (slot_idx < 0) {
+ return;
+ }
const int64_t pos = positions[token_idx];
- const qk_t* cos_sin_ptr = rope_cos_sin_cache + pos * rot_dim;
+ const cos_sin_t* cos_sin_ptr = rope_cos_sin_cache + pos * rot_dim;
const int embed_dim = rot_dim / 2;
@@ -54,8 +59,8 @@ __global__ void concat_and_cache_mla_rope_fused_kernel(
// NOTE: Would be nice to have interleaved sin/cos so we could just load
// both at the same time.
- qk_t cos = VLLM_LDG(cos_sin_ptr + pair_idx);
- qk_t sin = VLLM_LDG(cos_sin_ptr + pair_idx + embed_dim);
+ qk_t cos = static_cast(VLLM_LDG(cos_sin_ptr + pair_idx));
+ qk_t sin = static_cast(VLLM_LDG(cos_sin_ptr + pair_idx + embed_dim));
qk_t* q_pe_head_ptr =
q_pe + token_idx * q_pe_stride_token + head_idx * q_pe_stride_head;
@@ -81,21 +86,15 @@ __global__ void concat_and_cache_mla_rope_fused_kernel(
q_pe_head_ptr[pair_idx_y] = y_dst;
}
- const int64_t slot_idx = kv_cache_slot_mapping[token_idx];
const int64_t block_idx = slot_idx / block_size;
const int64_t entry_idx = slot_idx % block_size;
- // NOTE: slot_idx can be -1 if the token is padded
- if (slot_idx < 0) {
- return;
- }
-
// K with 1 HEAD
for (int i = threadIdx.x; i < embed_dim; i += blockDim.x) {
int pair_idx = i;
- qk_t cos = VLLM_LDG(cos_sin_ptr + pair_idx);
- qk_t sin = VLLM_LDG(cos_sin_ptr + pair_idx + embed_dim);
+ qk_t cos = static_cast(VLLM_LDG(cos_sin_ptr + pair_idx));
+ qk_t sin = static_cast(VLLM_LDG(cos_sin_ptr + pair_idx + embed_dim));
qk_t* k_pe_head_ptr = k_pe + token_idx * k_pe_stride;
@@ -165,36 +164,43 @@ __global__ void concat_and_cache_mla_rope_fused_kernel(
} // namespace vllm
-#define CALL_CONCAT_AND_CACHE_MLA_ROPE_FUSED(RAW_KV_T, CACHE_T, KV_DTYPE) \
- do { \
- VLLM_DISPATCH_FLOATING_TYPES(q_pe.scalar_type(), "qk_scalar_type", [&] { \
- using qk_t = scalar_t; \
- if (rope_is_neox) { \
- vllm::concat_and_cache_mla_rope_fused_kernel \
- <<>>( \
- positions.data_ptr(), q_pe.data_ptr(), \
- k_pe.data_ptr(), kv_c.data_ptr(), \
- rope_cos_sin_cache.data_ptr(), rot_dim, \
- q_pe_stride_token, q_pe_stride_head, k_pe_stride, kv_c_stride, \
- num_q_heads, reinterpret_cast(kv_cache.data_ptr()), \
- kv_cache_slot_mapping.data_ptr(), block_stride, \
- entry_stride, kv_lora_rank, block_size, \
- kv_cache_quant_scale.data_ptr()); \
- } else { \
- vllm::concat_and_cache_mla_rope_fused_kernel \
- <<>>( \
- positions.data_ptr(), q_pe.data_ptr(), \
- k_pe.data_ptr(), kv_c.data_ptr(), \
- rope_cos_sin_cache.data_ptr(), rot_dim, \
- q_pe_stride_token, q_pe_stride_head, k_pe_stride, kv_c_stride, \
- num_q_heads, reinterpret_cast(kv_cache.data_ptr()), \
- kv_cache_slot_mapping.data_ptr(), block_stride, \
- entry_stride, kv_lora_rank, block_size, \
- kv_cache_quant_scale.data_ptr()); \
- } \
- }); \
+#define CALL_CONCAT_AND_CACHE_MLA_ROPE_FUSED(RAW_KV_T, CACHE_T, KV_DTYPE) \
+ do { \
+ VLLM_DISPATCH_FLOATING_TYPES(q_pe.scalar_type(), "qk_scalar_type", [&] { \
+ using qk_t = scalar_t; \
+ VLLM_DISPATCH_FLOATING_TYPES( \
+ rope_cos_sin_cache.scalar_type(), "rope_cos_sin_cache_scalar_type", \
+ [&] { \
+ using cos_sin_t = scalar_t; \
+ if (rope_is_neox) { \
+ vllm::concat_and_cache_mla_rope_fused_kernel< \
+ qk_t, cos_sin_t, true, RAW_KV_T, CACHE_T, KV_DTYPE> \
+ <<>>( \
+ positions.data_ptr(), q_pe.data_ptr(), \
+ k_pe.data_ptr(), kv_c.data_ptr(), \
+ rope_cos_sin_cache.data_ptr(), rot_dim, \
+ q_pe_stride_token, q_pe_stride_head, k_pe_stride, \
+ kv_c_stride, num_q_heads, \
+ reinterpret_cast(kv_cache.data_ptr()), \
+ slot_mapping.data_ptr(), block_stride, \
+ entry_stride, kv_lora_rank, block_size, \
+ kv_cache_quant_scale.data_ptr()); \
+ } else { \
+ vllm::concat_and_cache_mla_rope_fused_kernel< \
+ qk_t, cos_sin_t, false, RAW_KV_T, CACHE_T, KV_DTYPE> \
+ <<>>( \
+ positions.data_ptr(), q_pe.data_ptr(), \
+ k_pe.data_ptr(), kv_c.data_ptr(), \
+ rope_cos_sin_cache.data_ptr(), rot_dim, \
+ q_pe_stride_token, q_pe_stride_head, k_pe_stride, \
+ kv_c_stride, num_q_heads, \
+ reinterpret_cast(kv_cache.data_ptr()), \
+ slot_mapping.data_ptr(), block_stride, \
+ entry_stride, kv_lora_rank, block_size, \
+ kv_cache_quant_scale.data_ptr()); \
+ } \
+ }); \
+ }); \
} while (false)
// Executes RoPE on q_pe and k_pe, then writes k_pe and kv_c in the kv cache.
@@ -208,43 +214,52 @@ void concat_and_cache_mla_rope_fused(
torch::Tensor& kv_c, // [num_tokens, kv_lora_rank]
torch::Tensor& rope_cos_sin_cache, // [max_position, rot_dim]
bool rope_is_neox,
- torch::Tensor&
- kv_cache_slot_mapping, // [num_tokens] or [num_actual_tokens]
+ torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens]
torch::Tensor&
kv_cache, // [num_blocks, block_size, (kv_lora_rank + rot_dim)]
const std::string& kv_cache_dtype, torch::Tensor& kv_cache_quant_scale) {
- const int64_t num_tokens = q_pe.size(0);
+ // NOTE(woosuk): In vLLM V1, query/key/position.size(0) can be different from
+ // slot_mapping.size(0) because of padding for CUDA graphs.
+ // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
+ // both include padding.
+ // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0)
+ // since key includes padding for CUDA graphs, while slot_mapping does not.
+ // In this case, slot_mapping.size(0) represents the actual number of tokens
+ // before padding.
+ // For compatibility with both cases, we use slot_mapping.size(0) as the
+ // number of tokens.
+ int num_tokens = slot_mapping.size(0);
+ int num_padded_tokens = q_pe.size(0);
+ TORCH_CHECK_GE(num_padded_tokens, num_tokens);
const int num_q_heads = q_pe.size(1);
const int rot_dim = q_pe.size(2);
const int kv_lora_rank = kv_c.size(1);
- TORCH_CHECK(positions.size(0) >=
- num_tokens); // CUDA Graphs might pad this for us
+ TORCH_CHECK_EQ(positions.size(0), num_padded_tokens);
TORCH_CHECK_EQ(positions.dim(), 1);
TORCH_CHECK_EQ(positions.scalar_type(), c10::ScalarType::Long);
- TORCH_CHECK_EQ(q_pe.size(0), num_tokens);
+ TORCH_CHECK_EQ(q_pe.dim(), 3);
+ TORCH_CHECK_EQ(q_pe.size(0), num_padded_tokens);
TORCH_CHECK_EQ(q_pe.size(1), num_q_heads);
TORCH_CHECK_EQ(q_pe.size(2), rot_dim);
- TORCH_CHECK_EQ(q_pe.dim(), 3);
- TORCH_CHECK_EQ(k_pe.size(0), num_tokens);
- TORCH_CHECK_EQ(k_pe.size(1), rot_dim);
TORCH_CHECK_EQ(k_pe.dim(), 2);
+ TORCH_CHECK_EQ(k_pe.size(0), num_padded_tokens);
+ TORCH_CHECK_EQ(k_pe.size(1), rot_dim);
TORCH_CHECK_EQ(k_pe.scalar_type(), q_pe.scalar_type());
- TORCH_CHECK_EQ(kv_c.size(0), num_tokens);
- TORCH_CHECK_EQ(kv_c.size(1), kv_lora_rank);
TORCH_CHECK_EQ(kv_c.dim(), 2);
+ TORCH_CHECK_EQ(kv_c.size(0), num_padded_tokens);
+ TORCH_CHECK_EQ(kv_c.size(1), kv_lora_rank);
TORCH_CHECK_EQ(kv_c.scalar_type(), q_pe.scalar_type());
TORCH_CHECK_EQ(kv_c.dtype(), q_pe.dtype());
TORCH_CHECK_EQ(rope_cos_sin_cache.size(1), rot_dim);
- TORCH_CHECK_EQ(rope_cos_sin_cache.scalar_type(), q_pe.scalar_type());
- TORCH_CHECK_EQ(kv_cache_slot_mapping.size(0), num_tokens);
- TORCH_CHECK_EQ(kv_cache_slot_mapping.scalar_type(), c10::ScalarType::Long);
+ TORCH_CHECK_EQ(slot_mapping.size(0), num_tokens);
+ TORCH_CHECK_EQ(slot_mapping.scalar_type(), c10::ScalarType::Long);
TORCH_CHECK_EQ(kv_cache.size(2), kv_lora_rank + rot_dim);
TORCH_CHECK_EQ(kv_cache.dim(), 3);
diff --git a/csrc/core/batch_invariant.hpp b/csrc/core/batch_invariant.hpp
index fffe96b86857..8273bc74b1ef 100644
--- a/csrc/core/batch_invariant.hpp
+++ b/csrc/core/batch_invariant.hpp
@@ -1,7 +1,6 @@
#pragma once
#include
#include
-#include
namespace vllm {
diff --git a/csrc/core/scalar_type.hpp b/csrc/core/scalar_type.hpp
index 68a8750f583b..b6f39ed795f3 100644
--- a/csrc/core/scalar_type.hpp
+++ b/csrc/core/scalar_type.hpp
@@ -1,7 +1,13 @@
#pragma once
-// For TORCH_CHECK
-#include
+#include
+#include
+#include
+#include
+#include
+
+// For STD_TORCH_CHECK
+#include
namespace vllm {
@@ -45,7 +51,7 @@ class ScalarType {
// IEEE 754 compliant floating point type
static constexpr ScalarType float_IEEE754(uint8_t exponent,
uint8_t mantissa) {
- TORCH_CHECK(mantissa > 0 && exponent > 0);
+ STD_TORCH_CHECK(mantissa > 0 && exponent > 0);
return ScalarType(exponent, mantissa, true, 0, false, NAN_IEEE_754);
}
@@ -53,11 +59,12 @@ class ScalarType {
static constexpr ScalarType float_(uint8_t exponent, uint8_t mantissa,
bool finite_values_only,
NanRepr nan_repr) {
- TORCH_CHECK(nan_repr < NAN_REPR_ID_MAX, "Invalid NanRepr");
- TORCH_CHECK(mantissa > 0 && exponent > 0);
- TORCH_CHECK(nan_repr != NAN_IEEE_754,
- "use `float_IEEE754` constructor for floating point types that "
- "follow IEEE 754 conventions");
+ STD_TORCH_CHECK(nan_repr < NAN_REPR_ID_MAX, "Invalid NanRepr");
+ STD_TORCH_CHECK(mantissa > 0 && exponent > 0);
+ STD_TORCH_CHECK(
+ nan_repr != NAN_IEEE_754,
+ "use `float_IEEE754` constructor for floating point types that "
+ "follow IEEE 754 conventions");
return ScalarType(exponent, mantissa, true, 0, finite_values_only,
nan_repr);
}
@@ -176,8 +183,8 @@ class ScalarType {
private:
double _floating_point_max() const {
- TORCH_CHECK(mantissa <= 52 && exponent <= 11,
- "Cannot represent max/min as a double for type ", str());
+ STD_TORCH_CHECK(mantissa <= 52 && exponent <= 11,
+ "Cannot represent max/min as a double for type ", str());
uint64_t max_mantissa = (uint64_t(1) << mantissa) - 1;
if (nan_repr == NAN_EXTD_RANGE_MAX_MIN) {
@@ -186,8 +193,8 @@ class ScalarType {
uint64_t max_exponent = (uint64_t(1) << exponent) - 2;
if (nan_repr == NAN_EXTD_RANGE_MAX_MIN || nan_repr == NAN_NONE) {
- TORCH_CHECK(exponent < 11,
- "Cannot represent max/min as a double for type ", str());
+ STD_TORCH_CHECK(exponent < 11,
+ "Cannot represent max/min as a double for type ", str());
max_exponent += 1;
}
@@ -216,16 +223,17 @@ class ScalarType {
if (is_floating_point()) {
return {_floating_point_max()};
} else {
- TORCH_CHECK(size_bits() < 64 || size_bits() == 64 && is_signed(),
- "Cannot represent max as a int64_t");
+ STD_TORCH_CHECK(size_bits() < 64 || size_bits() == 64 && is_signed(),
+ "Cannot represent max as a int64_t");
return {(int64_t(1) << mantissa) - 1};
}
}
constexpr std::variant _raw_min() const {
if (is_floating_point()) {
- TORCH_CHECK(is_signed(),
- "We currently assume all floating point types are signed");
+ STD_TORCH_CHECK(
+ is_signed(),
+ "We currently assume all floating point types are signed");
constexpr uint64_t sign_bit_double = (uint64_t(1) << 63);
double max = _floating_point_max();
@@ -233,8 +241,8 @@ class ScalarType {
uint64_t min_raw = max_raw | sign_bit_double;
return {*reinterpret_cast(&min_raw)};
} else {
- TORCH_CHECK(!is_signed() || size_bits() <= 64,
- "Cannot represent min as a int64_t");
+ STD_TORCH_CHECK(!is_signed() || size_bits() <= 64,
+ "Cannot represent min as a int64_t");
if (is_signed()) {
// set the top bit to 1 (i.e. INT64_MIN) and the rest to 0
// then perform an arithmetic shift right to set all the bits above
diff --git a/csrc/cpu/cpu_arch_macros.h b/csrc/cpu/cpu_arch_macros.h
index c73b62ecdec9..53ae70497c0f 100644
--- a/csrc/cpu/cpu_arch_macros.h
+++ b/csrc/cpu/cpu_arch_macros.h
@@ -61,8 +61,23 @@
#endif
#ifdef __aarch64__
- // Implementation copied from Arm Optimized Routines (expf AdvSIMD)
+ // Implementation of neon_expf copied from Arm Optimized Routines (expf
+ // AdvSIMD)
// https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c
+ //
+ // Additional fast exponential intended for cases where outputs will be
+ // downcasted to FP16 / BF16 (e.g. attention softmax). Accurate within 1 ULP
+ // for FP16 Accurate within 1 ULP for BF16 for inputs in [-87.683, 88.376] &
+ // clamps inputs outside this range to 0 / inf. Implementation is similar to
+ // exp_u20, but:
+ // - uses a third degree polynomial approximation for exp(r) instead of a
+ // fifth degree one, with coefficients re-tuned.
+ // - does not split natural log (ln) into high / low parts
+ // - clamps exp(x) to 0 for x < -87.683113f and inf for x > 88.3762589f
+ // exp(x) = 2^n (exp(r))
+ // r = x - n*ln2, with n = round(x/ln2)
+ // exp(r) ~ poly(r) = 1 + r + r^2 * (c3 + c2 * r)
+ // n = round(x / ln2), r = x - n*ln2
#include
#define DEFINE_FAST_EXP \
const float32x4_t inv_ln2 = vdupq_n_f32(0x1.715476p+0f); \
@@ -106,8 +121,55 @@
result.val[2] = neon_expf(vec.reg.val[2]); \
result.val[3] = neon_expf(vec.reg.val[3]); \
return vec_op::FP32Vec16(result); \
- };
+ }; \
+ const float32x4_t lower_bound = vdupq_n_f32(-0x1.5ebb82p+6f); \
+ const float32x4_t upper_bound = vdupq_n_f32(0x1.61814ap+6f); \
+ constexpr float ln2 = 0x1.62e43p-1f; \
+ constexpr float f_c2 = 0x1.5592ecp-3f; \
+ const float32x4_t f_c3 = vdupq_n_f32(0x1.017d34p-1f); \
+ auto neon_expf_f16 = [&](float32x4_t values) __attribute__(( \
+ always_inline)) { \
+ const uint32x4_t lt_lower = vcltq_f32(values, lower_bound); \
+ const uint32x4_t gt_upper = vcgtq_f32(values, upper_bound); \
+ float32x4_t n = vrndaq_f32(vmulq_f32(values, inv_ln2)); \
+ float32x4_t r = vfmsq_n_f32(values, n, ln2); \
+ uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(n)), 23); \
+ float32x4_t r2 = vmulq_f32(r, r); \
+ float32x4_t q = vfmaq_n_f32(f_c3, r, f_c2); \
+ float32x4_t s = vaddq_f32(vdupq_n_f32(1.0f), r); \
+ float32x4_t p = vfmaq_f32(s, q, r2); \
+ float32x4_t y = \
+ vreinterpretq_f32_u32(vaddq_u32(vreinterpretq_u32_f32(p), e)); \
+ y = vbslq_f32(lt_lower, vdupq_n_f32(0.0f), y); \
+ y = vbslq_f32(gt_upper, vdupq_n_f32(INFINITY), y); \
+ return y; \
+ }; \
+ auto fast_exp_f16 = [&](const vec_op::FP32Vec16& vec) \
+ __attribute__((always_inline)) { \
+ float32x4x4_t result; \
+ result.val[0] = neon_expf_f16(vec.reg.val[0]); \
+ result.val[1] = neon_expf_f16(vec.reg.val[1]); \
+ result.val[2] = neon_expf_f16(vec.reg.val[2]); \
+ result.val[3] = neon_expf_f16(vec.reg.val[3]); \
+ return vec_op::FP32Vec16(result); \
+ };
#endif // __aarch64__
+// RISC-V RVV
+#ifdef __riscv_v
+ #include
+
+ #ifdef __riscv_zihintpause
+ #define FAST_SPINNING __riscv_pause();
+ #endif
+
+ // FP32Vec16::exp() in cpu_types_riscv.hpp already implements the full
+ // polynomial approximation for RVV, so we simply delegate to it.
+ #define DEFINE_FAST_EXP \
+ auto fast_exp = [&](const vec_op::FP32Vec16& vec) \
+ __attribute__((always_inline)) { return vec.exp(); };
+
+#endif // __riscv_v
+
#endif
diff --git a/csrc/cpu/cpu_attn.cpp b/csrc/cpu/cpu_attn.cpp
index a582b4b4d7cc..26b881f4f143 100644
--- a/csrc/cpu/cpu_attn.cpp
+++ b/csrc/cpu/cpu_attn.cpp
@@ -1,5 +1,16 @@
#include "cpu_attn_dispatch_generated.h"
+// Maps kv_cache_dtype string to Fp8KVCacheDataType enum.
+// "auto" -> kAuto(0); "fp8"/"fp8_e4m3" -> kFp8E4M3; "fp8_e5m2" -> kFp8E5M2.
+static inline cpu_attention::Fp8KVCacheDataType parse_fp8_kv_dtype(
+ const std::string& kv_cache_dtype) {
+ if (kv_cache_dtype == "fp8_e5m2")
+ return cpu_attention::Fp8KVCacheDataType::kFp8E5M2;
+ if (kv_cache_dtype == "fp8_e4m3" || kv_cache_dtype == "fp8")
+ return cpu_attention::Fp8KVCacheDataType::kFp8E4M3;
+ return cpu_attention::Fp8KVCacheDataType::kAuto;
+}
+
torch::Tensor get_scheduler_metadata(
const int64_t num_req, const int64_t num_heads_q,
const int64_t num_heads_kv, const int64_t head_dim,
@@ -18,6 +29,10 @@ torch::Tensor get_scheduler_metadata(
isa = cpu_attention::ISA::NEON;
} else if (isa_hint == "vxe") {
isa = cpu_attention::ISA::VXE;
+ } else if (isa_hint == "rvv") {
+ isa = cpu_attention::ISA::RVV;
+ } else if (isa_hint == "vsx") {
+ isa = cpu_attention::ISA::VSX;
} else {
TORCH_CHECK(false, "Unsupported CPU attention ISA hint: " + isa_hint);
}
@@ -49,7 +64,7 @@ torch::Tensor get_scheduler_metadata(
input.enable_kv_split = enable_kv_split;
VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
- CPU_ATTN_DISPATCH(head_dim, isa, [&]() {
+ CPU_ATTN_DISPATCH(head_dim, isa, 0, [&]() {
input.elem_size = sizeof(scalar_t);
input.q_buffer_elem_size = sizeof(attn_impl::q_buffer_t);
input.logits_buffer_elem_size = sizeof(attn_impl::logits_buffer_t);
@@ -72,7 +87,9 @@ void cpu_attn_reshape_and_cache(
key_cache, // [num_blocks, num_kv_heads, block_size, head_size]
torch::Tensor&
value_cache, // [num_blocks, num_kv_heads, block_size, head_size]
- const torch::Tensor& slot_mapping, const std::string& isa) {
+ const torch::Tensor& slot_mapping, const std::string& isa,
+ const double k_scale = 1.0, const double v_scale = 1.0,
+ const std::string& kv_cache_dtype = "auto") {
TORCH_CHECK_EQ(key.dim(), 3);
TORCH_CHECK_EQ(value.dim(), 3);
TORCH_CHECK_EQ(key_cache.dim(), 4);
@@ -80,18 +97,30 @@ void cpu_attn_reshape_and_cache(
TORCH_CHECK_EQ(key.stride(2), 1);
TORCH_CHECK_EQ(value.stride(2), 1);
+ const int64_t kv_cache_idx =
+ static_cast(parse_fp8_kv_dtype(kv_cache_dtype));
+ const bool is_fp8 = (kv_cache_idx != 0);
+
+ if (is_fp8) {
+ TORCH_CHECK(key_cache.scalar_type() == at::ScalarType::Byte,
+ "key_cache must be uint8 for FP8 path");
+ TORCH_CHECK(value_cache.scalar_type() == at::ScalarType::Byte,
+ "value_cache must be uint8 for FP8 path");
+ TORCH_CHECK(k_scale > 0, "k_scale must be positive for FP8 path");
+ TORCH_CHECK(v_scale > 0, "v_scale must be positive for FP8 path");
+ }
+
+ const float k_inv = is_fp8 ? 1.0f / static_cast(k_scale) : 0.0f;
+ const float v_inv = is_fp8 ? 1.0f / static_cast(v_scale) : 0.0f;
+
const int64_t token_num = key.size(0);
- const int64_t key_token_num_stride = key.stride(0);
- const int64_t value_token_num_stride = value.stride(0);
- const int64_t head_num = value.size(1);
- const int64_t key_head_num_stride = key.stride(1);
- const int64_t value_head_num_stride = value.stride(1);
+ const int64_t head_num = key.size(1);
+ const int64_t head_dim = key.size(2);
const int64_t num_blocks = key_cache.size(0);
const int64_t num_blocks_stride = key_cache.stride(0);
const int64_t cache_head_num_stride = key_cache.stride(1);
const int64_t block_size = key_cache.size(2);
const int64_t block_size_stride = key_cache.stride(2);
- const int64_t head_dim = key.size(-1);
cpu_attention::ISA isa_tag = [&]() {
if (isa == "amx") {
@@ -104,21 +133,33 @@ void cpu_attn_reshape_and_cache(
return cpu_attention::ISA::NEON;
} else if (isa == "vxe") {
return cpu_attention::ISA::VXE;
+ } else if (isa == "rvv") {
+ return cpu_attention::ISA::RVV;
+ } else if (isa == "vsx") {
+ return cpu_attention::ISA::VSX;
} else {
TORCH_CHECK(false, "Invalid ISA type: " + isa);
}
}();
+ if (is_fp8) {
+ TORCH_CHECK(isa_tag == cpu_attention::ISA::AMX ||
+ isa_tag == cpu_attention::ISA::VEC,
+ "FP8 KV cache is only supported on x86 (AMX/VEC) ISA");
+ }
+
VLLM_DISPATCH_FLOATING_TYPES(
key.scalar_type(), "cpu_attn_reshape_and_cache", [&]() {
- CPU_ATTN_DISPATCH(head_dim, isa_tag, [&]() {
+ CPU_ATTN_DISPATCH(head_dim, isa_tag, kv_cache_idx, [&]() {
+ using kv_t = typename attn_impl::kv_cache_t;
attn_impl::reshape_and_cache(
key.data_ptr(), value.data_ptr(),
- key_cache.data_ptr(), value_cache.data_ptr(),
- slot_mapping.data_ptr(), token_num, key_token_num_stride,
- value_token_num_stride, head_num, key_head_num_stride,
- value_head_num_stride, num_blocks, num_blocks_stride,
- cache_head_num_stride, block_size, block_size_stride);
+ reinterpret_cast(key_cache.data_ptr()),
+ reinterpret_cast(value_cache.data_ptr()),
+ slot_mapping.data_ptr(), token_num, key.stride(0),
+ value.stride(0), head_num, key.stride(1), value.stride(1),
+ num_blocks, num_blocks_stride, cache_head_num_stride, block_size,
+ block_size_stride, k_inv, v_inv);
});
});
}
@@ -137,13 +178,26 @@ void cpu_attention_with_kv_cache(
const int64_t sliding_window_left, const int64_t sliding_window_right,
const torch::Tensor& block_table, // [num_tokens, max_block_num]
const double softcap, const torch::Tensor& scheduler_metadata,
- const std::optional& s_aux // [num_heads]
-) {
+ const std::optional& s_aux, // [num_heads]
+ const double k_scale = 1.0, const double v_scale = 1.0,
+ const std::string& kv_cache_dtype = "auto") {
TORCH_CHECK_EQ(query.dim(), 3);
TORCH_CHECK_EQ(query.stride(2), 1);
TORCH_CHECK_EQ(key_cache.dim(), 4);
TORCH_CHECK_EQ(value_cache.dim(), 4);
+ const int64_t kv_cache_idx =
+ static_cast(parse_fp8_kv_dtype(kv_cache_dtype));
+ const bool is_fp8 = (kv_cache_idx != 0);
+ if (is_fp8) {
+ TORCH_CHECK(key_cache.scalar_type() == at::ScalarType::Byte,
+ "key_cache must be uint8 for FP8 path");
+ TORCH_CHECK(value_cache.scalar_type() == at::ScalarType::Byte,
+ "value_cache must be uint8 for FP8 path");
+ TORCH_CHECK(k_scale > 0, "k_scale must be positive for FP8 path");
+ TORCH_CHECK(v_scale > 0, "v_scale must be positive for FP8 path");
+ }
+
cpu_attention::AttentionInput input;
input.metadata = reinterpret_cast(
scheduler_metadata.data_ptr());
@@ -165,25 +219,32 @@ void cpu_attention_with_kv_cache(
input.block_table = block_table.data_ptr();
input.alibi_slopes =
alibi_slopes.has_value() ? alibi_slopes->data_ptr() : nullptr;
- // For now sink must be bf16
input.s_aux = s_aux.has_value() ? s_aux->data_ptr() : nullptr;
input.scale = scale;
input.causal = causal;
input.sliding_window_left = sliding_window_left;
input.sliding_window_right = sliding_window_right;
if (input.causal) {
- // to make boundary calculation easier
input.sliding_window_right = 0;
}
- float softcap_fp32 = softcap;
- input.softcap = softcap_fp32;
+ input.softcap = static_cast(softcap);
+
+ if (is_fp8) {
+ input.k_scale_fp8 = static_cast(k_scale);
+ input.v_scale_fp8 = static_cast(v_scale);
+ TORCH_CHECK(input.metadata->isa == cpu_attention::ISA::AMX ||
+ input.metadata->isa == cpu_attention::ISA::VEC,
+ "FP8 KV cache is only supported on x86 (AMX/VEC) ISA");
+ }
VLLM_DISPATCH_FLOATING_TYPES(
query.scalar_type(), "cpu_attention_with_kv_cache", [&]() {
- CPU_ATTN_DISPATCH(query.size(2), input.metadata->isa, [&]() {
- TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, 0);
- cpu_attention::AttentionMainLoop mainloop;
- mainloop(&input);
- });
+ CPU_ATTN_DISPATCH(
+ query.size(2), input.metadata->isa, kv_cache_idx, [&]() {
+ TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment,
+ 0);
+ cpu_attention::AttentionMainLoop mainloop;
+ mainloop(&input);
+ });
});
}
diff --git a/csrc/cpu/cpu_attn_amx.hpp b/csrc/cpu/cpu_attn_amx.hpp
index 1c8644d52329..6a0341085dce 100644
--- a/csrc/cpu/cpu_attn_amx.hpp
+++ b/csrc/cpu/cpu_attn_amx.hpp
@@ -1,6 +1,7 @@
#ifndef CPU_ATTN_AMX_HPP
#define CPU_ATTN_AMX_HPP
+#include "cpu_attn_fp8.hpp"
#include "cpu_attn_impl.hpp"
namespace cpu_attention {
@@ -21,9 +22,10 @@ typedef struct __tile_config {
// 2-2-4 pattern, for 16 < m <= 32
// TILE 0, 1: load A matrix, row num should be 16, m - 16
// TILE 2, 3: load B matrix, row num should be 16
-// TILE 4, 5, 6, 7: store results C matrix, row num should be 16, 16, m - 16, m
-// - 16
-template
+// TILE 4, 5, 6, 7: store results C matrix, row num should be 16, 16,
+// m - 16, m - 16
+// q_buffer_t: A (Q/P) tile type; kv_cache_t: B (K/V cache) tile type.
+template
class TileGemm224 {
public:
template
@@ -42,13 +44,56 @@ class TileGemm224 {
}
};
-template <>
-class TileGemm224 {
+// Dequantize one FP8 tile (AMX_TILE_ROW_NUM rows x 32 cols) to BF16.
+template
+FORCE_INLINE void deq_tile_amx(const uint8_t* src, c10::BFloat16* dst) {
+ for (int r = 0; r < AMX_TILE_ROW_NUM; ++r) {
+ if constexpr (std::is_same_v) {
+ vec_op::BF16Vec32(src + r * 32, vec_op::fp8_bf16_e4m3_tag{})
+ .save(dst + r * 32);
+ } else {
+ vec_op::BF16Vec32(src + r * 32, vec_op::fp8_bf16_e5m2_tag{})
+ .save(dst + r * 32);
+ }
+ }
+}
+
+// For FP8: dequant src into scratch and return scratch.
+// For BF16: return src directly (scratch is unused; the compiler elides it).
+template
+FORCE_INLINE const c10::BFloat16* prepare_b_tile(const kv_cache_t* src,
+ c10::BFloat16* scratch) {
+ if constexpr (std::is_same_v ||
+ std::is_same_v) {
+ deq_tile_amx(reinterpret_cast(src), scratch);
+ return scratch;
+ } else {
+ return reinterpret_cast(src);
+ }
+}
+
+// Handles both BF16 and FP8 KV cache (2-2-4 pattern).
+template
+class TileGemm224 {
+ static_assert(std::is_same_v ||
+ std::is_same_v ||
+ std::is_same_v,
+ "kv_cache_t must be BFloat16, Float8_e4m3fn, or Float8_e5m2");
+
+ static constexpr bool fp8_kv =
+ std::is_same_v ||
+ std::is_same_v;
+
+ static constexpr int64_t tile_elems = AMX_TILE_BYTES / sizeof(c10::BFloat16);
+ // BF16 path: scratch_elems=1 so the scratch array is eliminated by the
+ // compiler.
+ static constexpr int64_t scratch_elems = fp8_kv ? tile_elems : 1;
+
public:
template
FORCE_INLINE static void gemm(const int32_t m_size,
c10::BFloat16* __restrict__ a_tile,
- c10::BFloat16* __restrict__ b_tile,
+ kv_cache_t* __restrict__ b_tile,
float* __restrict__ c_tile, const int64_t lda,
const int64_t ldb, const int64_t ldc,
const int32_t block_size,
@@ -56,6 +101,7 @@ class TileGemm224 {
const bool accum_c) {
const int32_t k_times =
dynamic_k_size / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16));
+
c10::BFloat16* __restrict__ a_tile_0 = a_tile;
c10::BFloat16* __restrict__ a_tile_1 = a_tile + lda * AMX_TILE_ROW_NUM;
const int64_t a_tile_stride = [&]() {
@@ -70,8 +116,8 @@ class TileGemm224 {
}
}();
- c10::BFloat16* __restrict__ b_tile_2 = b_tile;
- c10::BFloat16* __restrict__ b_tile_3 = [&]() {
+ kv_cache_t* __restrict__ b_tile_2 = b_tile;
+ kv_cache_t* __restrict__ b_tile_3 = [&]() {
if constexpr (phase == AttentionGemmPhase::QK) {
// k_cache is prepacked
return b_tile + (k_size * AMX_TILE_ROW_BYTES / 4);
@@ -106,11 +152,16 @@ class TileGemm224 {
_tile_zero(7);
}
+ alignas(64) c10::BFloat16 scratch_2[scratch_elems];
+ alignas(64) c10::BFloat16 scratch_3[scratch_elems];
for (int32_t k = 0; k < k_times; ++k) {
+ const c10::BFloat16* load_2 = prepare_b_tile(b_tile_2, scratch_2);
+ const c10::BFloat16* load_3 = prepare_b_tile(b_tile_3, scratch_3);
+
_tile_loadd(0, a_tile_0, a_tile_stride);
- _tile_stream_loadd(2, b_tile_2, b_tile_stride);
+ _tile_stream_loadd(2, const_cast(load_2), b_tile_stride);
_tile_dpbf16ps(4, 0, 2);
- _tile_stream_loadd(3, b_tile_3, b_tile_stride);
+ _tile_stream_loadd(3, const_cast(load_3), b_tile_stride);
_tile_dpbf16ps(5, 0, 3);
_tile_loadd(1, a_tile_1, a_tile_stride);
_tile_dpbf16ps(6, 1, 2);
@@ -154,13 +205,13 @@ class TileGemm224 {
};
// 1-2-2 pattern, for 0 < m <= 16
-// TILE 0, (1): load A matrix, use extra 1 tile for prefetch, row num should be
-// m, m
-// TILE 2, 3, (4, 5): load B matrix, use extra 2 tiles for prefetch, row
-// num should be 16
-// TILE 6, 7, (6, 7): store results C matrix, row num should be
-// m
-template
+// TILE 0, (1): load A matrix, use extra 1 tile for prefetch, row num should
+// be m, m
+// TILE 2, 3, (4, 5): load B matrix, use extra 2 tiles for prefetch, row num
+// should be 16
+// TILE 6, 7: store results C matrix, row num should be m
+// q_buffer_t: A (Q/P) tile type; kv_cache_t: B (K/V cache) tile type.
+template
class TileGemm122 {
public:
template
@@ -179,13 +230,26 @@ class TileGemm122 {
}
};
-template <>
-class TileGemm122 {
+// Handles both BF16 and FP8 KV cache (1-2-2 pattern).
+template
+class TileGemm122 {
+ static_assert(std::is_same_v ||
+ std::is_same_v ||
+ std::is_same_v,
+ "kv_cache_t must be BFloat16, Float8_e4m3fn, or Float8_e5m2");
+
+ static constexpr bool fp8_kv =
+ std::is_same_v ||
+ std::is_same_v;
+
+ static constexpr int64_t tile_elems = AMX_TILE_BYTES / sizeof(c10::BFloat16);
+ static constexpr int64_t scratch_elems = fp8_kv ? tile_elems : 1;
+
public:
template
FORCE_INLINE static void gemm(const int32_t m_size,
c10::BFloat16* __restrict__ a_tile,
- c10::BFloat16* __restrict__ b_tile,
+ kv_cache_t* __restrict__ b_tile,
float* __restrict__ c_tile, const int64_t lda,
const int64_t ldb, const int64_t ldc,
const int32_t block_size,
@@ -215,21 +279,19 @@ class TileGemm122 {
}
}();
- c10::BFloat16* __restrict__ b_tile_2 = b_tile;
- c10::BFloat16* __restrict__ b_tile_3 = [&]() {
+ kv_cache_t* __restrict__ b_tile_2 = b_tile;
+ kv_cache_t* __restrict__ b_tile_3 = [&]() {
if constexpr (phase == AttentionGemmPhase::QK) {
- // k_cache is prepacked
return b_tile + (k_size * AMX_TILE_ROW_BYTES / 4);
} else if constexpr (phase == AttentionGemmPhase::PV) {
- // v_cache is prepacked
return b_tile + (block_size * AMX_TILE_ROW_BYTES / 4);
} else {
TORCH_CHECK(false, "Unreachable");
}
}();
- c10::BFloat16* __restrict__ b_tile_4 =
+ kv_cache_t* __restrict__ b_tile_4 =
b_tile_2 + AMX_TILE_BYTES / sizeof(c10::BFloat16);
- c10::BFloat16* __restrict__ b_tile_5 =
+ kv_cache_t* __restrict__ b_tile_5 =
b_tile_3 + AMX_TILE_BYTES / sizeof(c10::BFloat16);
int64_t b_stride = AMX_TILE_ROW_BYTES;
@@ -250,16 +312,25 @@ class TileGemm122 {
_tile_zero(7);
}
+ alignas(64) c10::BFloat16 scratch_2[scratch_elems];
+ alignas(64) c10::BFloat16 scratch_3[scratch_elems];
+ alignas(64) c10::BFloat16 scratch_4[scratch_elems];
+ alignas(64) c10::BFloat16 scratch_5[scratch_elems];
for (int32_t k = 0; k < k_group_times; ++k) {
+ const c10::BFloat16* load_2 = prepare_b_tile(b_tile_2, scratch_2);
+ const c10::BFloat16* load_3 = prepare_b_tile(b_tile_3, scratch_3);
+ const c10::BFloat16* load_4 = prepare_b_tile(b_tile_4, scratch_4);
+ const c10::BFloat16* load_5 = prepare_b_tile(b_tile_5, scratch_5);
+
_tile_loadd(0, a_tile_0, a_tile_stride);
- _tile_stream_loadd(2, b_tile_2, b_stride);
+ _tile_stream_loadd(2, const_cast(load_2), b_stride);
_tile_dpbf16ps(6, 0, 2);
- _tile_stream_loadd(3, b_tile_3, b_stride);
+ _tile_stream_loadd(3, const_cast(load_3), b_stride);
_tile_dpbf16ps(7, 0, 3);
_tile_loadd(1, a_tile_1, a_tile_stride);
- _tile_stream_loadd(4, b_tile_4, b_stride);
+ _tile_stream_loadd(4, const_cast(load_4), b_stride);
_tile_dpbf16ps(6, 1, 4);
- _tile_stream_loadd(5, b_tile_5, b_stride);
+ _tile_stream_loadd(5, const_cast(load_5), b_stride);
_tile_dpbf16ps(7, 1, 5);
// update ptrs
@@ -279,10 +350,13 @@ class TileGemm122 {
}
if (has_tail) {
+ const c10::BFloat16* load_2 = prepare_b_tile(b_tile_2, scratch_2);
+ const c10::BFloat16* load_3 = prepare_b_tile(b_tile_3, scratch_3);
+
_tile_loadd(0, a_tile_0, a_tile_stride);
- _tile_stream_loadd(2, b_tile_2, b_stride);
+ _tile_stream_loadd(2, const_cast(load_2), b_stride);
_tile_dpbf16ps(6, 0, 2);
- _tile_stream_loadd(3, b_tile_3, b_stride);
+ _tile_stream_loadd(3, const_cast(load_3), b_stride);
_tile_dpbf16ps(7, 0, 3);
}
@@ -302,21 +376,25 @@ class TileGemm122 {
_tile_loadconfig(&config);
}
};
+
} // namespace
-template
-class AttentionImpl {
+template
+class AttentionImpl {
+ static constexpr bool fp8_kv =
+ std::is_same_v ||
+ std::is_same_v;
+
public:
using query_t = scalar_t;
using q_buffer_t = scalar_t;
- using kv_cache_t = scalar_t;
+ using kv_cache_t = kv_cache_scalar_t;
using logits_buffer_t = float;
using partial_output_buffer_t = float;
using prob_buffer_t = scalar_t;
constexpr static int64_t BlockSizeAlignment =
- AMX_TILE_ROW_BYTES /
- sizeof(kv_cache_t); // KV token num unit of QK and PV phases
+ 32; // AMX_TILE_ROW_NUM = 16 tokens/tile; 32 = 2 tiles
constexpr static int64_t HeadDimAlignment =
2 * (AMX_TILE_ROW_BYTES / 4); // headdim num unit of PV phase
constexpr static int64_t MaxQHeadNumPerIteration = 32;
@@ -324,6 +402,9 @@ class AttentionImpl {
constexpr static ISA ISAType = ISA::AMX;
constexpr static bool scale_on_logits = true;
+ float k_scale = 1.0f;
+ float v_scale = 1.0f;
+
public:
AttentionImpl() : current_q_head_num_(0) {
// Use all columns in AMX tiles
@@ -332,21 +413,50 @@ class AttentionImpl {
~AttentionImpl() { _tile_release(); }
+ void init_from_input(const AttentionInput* input) {
+ if constexpr (fp8_kv) {
+ k_scale = input->k_scale_fp8;
+ v_scale = input->v_scale_fp8;
+ }
+ }
+
+ float get_output_v_scale() const noexcept {
+ if constexpr (fp8_kv) {
+ // AMX dequant places FP8 payload into a BF16 field (exponent bias 127).
+ // Correction = 2^(127 - FP8_bias): E4M3 bias=7 → 2^120, E5M2 bias=15 →
+ // 2^112.
+ constexpr float bias =
+ std::is_same_v ? 0x1p112f : 0x1p120f;
+ return v_scale * bias;
+ }
+ return 1.0f;
+ }
+
template typename attention>
FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
+ if constexpr (fp8_kv) {
+ // Same bias correction as get_output_v_scale: AMX FP8→BF16 dequant
+ // shifts the exponent bias from FP8 to BF16 (127), so we multiply by
+ // 2^(127-FP8_bias) to recover the true value. E4M3: 2^120, E5M2: 2^112.
+ const float bias =
+ std::is_same_v ? 0x1p112f : 0x1p120f;
+ scale *= k_scale * bias;
+ }
if (q_head_num > AMX_TILE_ROW_NUM) {
if (q_head_num != current_q_head_num_) {
current_q_head_num_ = q_head_num;
- TileGemm224::init_tile_config(q_head_num, amx_tile_config_);
+ TileGemm224::init_tile_config(q_head_num,
+ amx_tile_config_);
}
- attention> attention_iteration;
+ attention> attention_iteration;
attention_iteration(CPU_ATTENTION_PARAMS);
} else {
if (q_head_num != current_q_head_num_) {
current_q_head_num_ = q_head_num;
- TileGemm122::init_tile_config(q_head_num, amx_tile_config_);
+ TileGemm122::init_tile_config(q_head_num,
+ amx_tile_config_);
}
- attention> attention_iteration;
+ attention> attention_iteration;
attention_iteration(CPU_ATTENTION_PARAMS);
}
}
@@ -411,13 +521,26 @@ class AttentionImpl {
// reshape KV to AMX friendly layout
static void reshape_and_cache(
const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
- scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
+ kv_cache_t* __restrict__ key_cache, kv_cache_t* __restrict__ value_cache,
const int64_t* __restrict__ slot_mapping, const int64_t token_num,
const int64_t key_token_num_stride, const int64_t value_token_num_stride,
const int64_t head_num, const int64_t key_head_num_stride,
const int64_t value_head_num_stride, const int64_t num_blocks,
const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
- const int64_t block_size, const int64_t block_size_stride) {
+ const int64_t block_size, const int64_t block_size_stride,
+ const float k_inv = 0.0f, const float v_inv = 0.0f) {
+ if constexpr (fp8_kv) {
+ constexpr auto qfn = select_fp8_quant_fn();
+ reshape_and_cache_fp8_amx_impl(
+ key, value, reinterpret_cast(key_cache),
+ reinterpret_cast(value_cache), slot_mapping, token_num,
+ head_num, head_dim, block_size, key_token_num_stride,
+ key_head_num_stride, value_token_num_stride, value_head_num_stride,
+ num_blocks_stride, cache_head_num_stride, num_blocks_stride,
+ cache_head_num_stride, k_inv, v_inv);
+ return;
+ }
+
// For AMX 2D tiles, size of each line is 64 bytes
constexpr int64_t amx_tile_row_size = AMX_TILE_ROW_BYTES;
// For AMX B matrix, N always is 16
@@ -426,6 +549,9 @@ class AttentionImpl {
// For now suppose block_size is divisible by amx_tile_column_num
TORCH_CHECK_EQ(block_size % amx_b_tile_k_size, 0);
+ scalar_t* __restrict__ kc = reinterpret_cast(key_cache);
+ scalar_t* __restrict__ vc = reinterpret_cast(value_cache);
+
#pragma omp parallel for collapse(2)
for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
@@ -453,8 +579,7 @@ class AttentionImpl {
constexpr int64_t quadword_num_per_group =
token_num_per_group * quadword_num;
int32_t* key_cache_start_ptr =
- reinterpret_cast(key_cache +
- block_idx * num_blocks_stride +
+ reinterpret_cast(kc + block_idx * num_blocks_stride +
head_idx * cache_head_num_stride) +
group_idx * quadword_num_per_group + group_offset;
@@ -483,7 +608,7 @@ class AttentionImpl {
token_idx * value_token_num_stride +
head_idx * value_head_num_stride;
scalar_t* value_cache_start_ptr =
- value_cache + block_idx * num_blocks_stride +
+ vc + block_idx * num_blocks_stride +
head_idx * cache_head_num_stride +
sub_group_idx * token_num_per_sub_group * amx_b_tile_n_size +
sub_group_offset;
diff --git a/csrc/cpu/cpu_attn_fp8.hpp b/csrc/cpu/cpu_attn_fp8.hpp
new file mode 100644
index 000000000000..764b6ed7f84a
--- /dev/null
+++ b/csrc/cpu/cpu_attn_fp8.hpp
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#pragma once
+#include
+#include
+#include
+#include
+#include
+
+#include "cpu/utils.hpp"
+
+typedef uint32_t __attribute__((__may_alias__)) u32_alias_t;
+typedef uint16_t __attribute__((__may_alias__)) u16_alias_t;
+typedef float __attribute__((__may_alias__)) f32_alias_t;
+
+// Reference scalar dequant — used to verify vectorized AMX dequant.
+inline float fp8e4m3_to_float_scalar(uint8_t b, float scale) noexcept {
+ // NaN encoding in E4M3
+ if ((b & 0x7F) == 0x7F) return std::numeric_limits::quiet_NaN();
+ uint32_t b_u32 = static_cast(b);
+ uint32_t sign = (b_u32 & 0x80) << 24;
+ uint32_t payload = (b_u32 & 0x7F) << 20;
+ uint32_t bits = sign | payload;
+ float b_f32_unscaled = *reinterpret_cast(&bits);
+ float b_f32_scaled = b_f32_unscaled * scale * 0x1p120f;
+ return b_f32_scaled;
+}
+
+inline uint8_t float_to_fp8e4m3_scalar(float v, float inv_scale) noexcept {
+ v *= inv_scale;
+ constexpr float fp8_max = 448.0f;
+ v = std::max(-fp8_max, std::min(fp8_max, v));
+ if (v == 0.0f) return 0;
+
+ // Inverse mapping of fp8e4m3_to_float_scalar: shift the effective exponent
+ // bias from fp32 (127) back to fp8 e4m3 (7), then pack sign|payload.
+ float v_f32_unscaled = v * 0x1p-120f;
+ uint32_t bits = *reinterpret_cast(&v_f32_unscaled);
+ uint8_t sign = static_cast((bits >> 24) & 0x80);
+ uint8_t payload = static_cast((bits >> 20) & 0x7F);
+ if (payload == 0) return sign;
+ payload = std::min(payload, 0x7E); // keep 0x7F as NaN encoding
+ return static_cast(sign | payload);
+}
+
+// ---------------------------------------------------------------------------
+// AMX reshape impl — parameterised on the quantisation function.
+// Writes key/value into uint8 FP8 KV cache using the AMX tile-friendly layout.
+// K: halfword-packed (2 FP8 per uint16, token_num_per_group=16).
+// V: sub-group packing (token_num_per_sub_group=2, head_elems_per_group=16).
+// block_size must be divisible by 32.
+// ---------------------------------------------------------------------------
+template
+inline void reshape_and_cache_fp8_amx_impl(
+ const scalar_t* key_ptr, const scalar_t* value_ptr, uint8_t* key_cache_ptr,
+ uint8_t* value_cache_ptr, const int64_t* slot_ptr, int64_t token_num,
+ int64_t head_num, int64_t head_dim, int64_t block_size, int64_t k_stride0,
+ int64_t k_stride1, int64_t v_stride0, int64_t v_stride1, int64_t kc_stride0,
+ int64_t kc_stride1, int64_t vc_stride0, int64_t vc_stride1, float k_inv,
+ float v_inv) {
+ constexpr int64_t token_num_per_group = 16; // AMX_TILE_ROW_NUM
+ const int64_t halfword_num = head_dim / 2; // 2 FP8 per uint16
+ const int64_t halfword_num_per_group = token_num_per_group * halfword_num;
+ constexpr int64_t head_elems_per_group = 16;
+ constexpr int64_t token_num_per_sub_group = 2; // = 4 / sizeof(BF16)
+ const int64_t group_num = head_dim / head_elems_per_group;
+ const int64_t group_size = block_size * head_elems_per_group;
+
+#pragma omp parallel for collapse(2) schedule(static)
+ for (int64_t tok = 0; tok < token_num; ++tok) {
+ for (int64_t h = 0; h < head_num; ++h) {
+ const int64_t slot = slot_ptr[tok];
+ if (slot < 0) continue;
+ const int64_t block_idx = slot / block_size;
+ const int64_t block_offset = slot % block_size;
+
+ // Key: halfword-packed, 2 FP8 per uint16
+ {
+ const scalar_t* ksrc = key_ptr + tok * k_stride0 + h * k_stride1;
+ const int64_t group_idx = block_offset / token_num_per_group;
+ const int64_t group_offset = block_offset % token_num_per_group;
+ uint16_t* kdst =
+ reinterpret_cast