diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml index a60a4194e9b3..21ffa1b9b8d7 100644 --- a/.buildkite/ci_config.yaml +++ b/.buildkite/ci_config.yaml @@ -8,6 +8,7 @@ run_all_patterns: - "CMakeLists.txt" - "requirements/common.txt" - "requirements/cuda.txt" + - "requirements/kv_connectors.txt" - "requirements/build/cuda.txt" - "requirements/test/cuda.txt" - "setup.py" diff --git a/.buildkite/hardware_tests/cpu.yaml b/.buildkite/hardware_tests/cpu.yaml index 9b1044443780..6189d5d61e6a 100644 --- a/.buildkite/hardware_tests/cpu.yaml +++ b/.buildkite/hardware_tests/cpu.yaml @@ -12,15 +12,19 @@ steps: - vllm/_custom_ops.py - tests/kernels/attention/test_cpu_attn.py - tests/kernels/moe/test_cpu_fused_moe.py + - tests/kernels/moe/test_cpu_quant_fused_moe.py - tests/kernels/test_onednn.py - tests/kernels/test_awq_int4_to_int8.py + - tests/kernels/quantization/test_cpu_fp8_scaled_mm.py commands: - | - bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m " + bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m " pytest -x -v -s tests/kernels/attention/test_cpu_attn.py pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py + pytest -x -v -s tests/kernels/moe/test_cpu_quant_fused_moe.py pytest -x -v -s tests/kernels/test_onednn.py - pytest -x -v -s tests/kernels/test_awq_int4_to_int8.py" + pytest -x -v -s tests/kernels/test_awq_int4_to_int8.py + pytest -x -v -s tests/kernels/quantization/test_cpu_fp8_scaled_mm.py" - label: CPU-Compatibility Tests depends_on: [] @@ -57,23 +61,24 @@ steps: source_file_dependencies: - csrc/cpu/ - vllm/model_executor/layers/quantization/cpu_wna16.py - - vllm/model_executor/layers/quantization/gptq_marlin.py + - vllm/model_executor/layers/quantization/auto_gptq.py - vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py - vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py - vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py + - vllm/model_executor/layers/fused_moe/experts/cpu_moe.py - tests/quantization/test_compressed_tensors.py - tests/quantization/test_cpu_wna16.py commands: - | - bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m " + bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m " pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs pytest -x -v -s tests/quantization/test_cpu_wna16.py" -- label: CPU-Distributed Tests +- label: CPU-Distributed Tests (PP+TP) depends_on: [] device: intel_cpu no_plugin: true - source_file_dependencies: + source_file_dependencies: &cpu_distributed_deps - csrc/cpu/shm.cpp - vllm/v1/worker/cpu_worker.py - vllm/v1/worker/gpu_worker.py @@ -82,10 +87,21 @@ steps: - vllm/platforms/cpu.py - vllm/distributed/parallel_state.py - vllm/distributed/device_communicators/cpu_communicator.py + - .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh + commands: + - | + bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m " + bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh tp_pp" + +- label: CPU-Distributed Tests (DP+TP) + depends_on: [] + device: intel_cpu + no_plugin: true + source_file_dependencies: *cpu_distributed_deps commands: - | bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m " - bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh" + bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh dp_tp" - label: CPU-Multi-Modal Model Tests %N depends_on: [] diff --git a/.buildkite/hardware_tests/intel.yaml b/.buildkite/hardware_tests/intel.yaml index ba0088b3af62..d70ce28428d4 100644 --- a/.buildkite/hardware_tests/intel.yaml +++ b/.buildkite/hardware_tests/intel.yaml @@ -8,10 +8,3 @@ steps: commands: - bash .buildkite/scripts/hardware_ci/run-hpu-test.sh - - label: "Intel GPU Test" - depends_on: [] - soft_fail: true - device: intel_gpu - no_plugin: true - commands: - - bash .buildkite/scripts/hardware_ci/run-xpu-test.sh diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh index 00ae34bba6d7..10c03c3e1773 100755 --- a/.buildkite/image_build/image_build.sh +++ b/.buildkite/image_build/image_build.sh @@ -192,6 +192,7 @@ export BUILDKITE_COMMIT export PARENT_COMMIT export IMAGE_TAG export IMAGE_TAG_LATEST +export COMMIT="${COMMIT:-${BUILDKITE_COMMIT}}" export CACHE_FROM export CACHE_FROM_BASE_BRANCH export CACHE_FROM_MAIN diff --git a/.buildkite/image_build/image_build_torch_nightly.sh b/.buildkite/image_build/image_build_torch_nightly.sh index a23c658d46b9..cbd08aa7bd0b 100755 --- a/.buildkite/image_build/image_build_torch_nightly.sh +++ b/.buildkite/image_build/image_build_torch_nightly.sh @@ -46,7 +46,7 @@ echo "Image not found, proceeding with build..." # --- CUDA 13.0 for nightly builds --- # Nightly CI uses CUDA 13.0 while regular CI stays on CUDA 12.9 -NIGHTLY_CUDA_VERSION="13.0.0" +NIGHTLY_CUDA_VERSION="13.0.2" NIGHTLY_BUILD_BASE_IMAGE="nvidia/cuda:${NIGHTLY_CUDA_VERSION}-devel-ubuntu22.04" NIGHTLY_FINAL_BASE_IMAGE="nvidia/cuda:${NIGHTLY_CUDA_VERSION}-base-ubuntu22.04" diff --git a/.buildkite/intel_jobs/engine_intel.yaml b/.buildkite/intel_jobs/engine_intel.yaml new file mode 100644 index 000000000000..c66576d40991 --- /dev/null +++ b/.buildkite/intel_jobs/engine_intel.yaml @@ -0,0 +1,21 @@ +group: Engine Intel +depends_on: + - image-build-xpu +steps: +- label: Engine (1 GPU) + timeout_in_minutes: 30 + device: intel_gpu + no_plugin: true + working_dir: "." + env: + REGISTRY: "public.ecr.aws/q9t5s3a7" + REPO: "vllm-ci-test-repo" + VLLM_TEST_DEVICE: "xpu" + source_file_dependencies: + - vllm/v1/engine/ + - tests/v1/engine/ + commands: + - >- + bash .buildkite/scripts/hardware_ci/run-intel-test.sh + 'cd tests && + pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py' diff --git a/.buildkite/intel_jobs/kernels_intel.yaml b/.buildkite/intel_jobs/kernels_intel.yaml new file mode 100644 index 000000000000..66a8db25f02e --- /dev/null +++ b/.buildkite/intel_jobs/kernels_intel.yaml @@ -0,0 +1,21 @@ +group: Kernels Intel +depends_on: + - image-build-xpu +steps: +- label: vLLM IR Tests + timeout_in_minutes: 30 + device: intel_gpu + no_plugin: true + working_dir: "." + env: + REGISTRY: "public.ecr.aws/q9t5s3a7" + REPO: "vllm-ci-test-repo" + VLLM_TEST_DEVICE: "xpu" + source_file_dependencies: + - vllm/ir + - vllm/kernels + commands: + - >- + bash .buildkite/scripts/hardware_ci/run-intel-test.sh + 'cd tests && + pytest -v -s kernels/ir' diff --git a/.buildkite/intel_jobs/lora_intel.yaml b/.buildkite/intel_jobs/lora_intel.yaml new file mode 100644 index 000000000000..32a56ef59b3f --- /dev/null +++ b/.buildkite/intel_jobs/lora_intel.yaml @@ -0,0 +1,135 @@ +group: LoRA Intel +depends_on: + - image-build-xpu +steps: +- label: LoRA Runtime + Utils + timeout_in_minutes: 45 + device: intel_gpu + no_plugin: true + working_dir: "." + env: + REGISTRY: "public.ecr.aws/q9t5s3a7" + REPO: "vllm-ci-test-repo" + VLLM_TEST_DEVICE: "xpu" + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - >- + bash .buildkite/scripts/hardware_ci/run-intel-test.sh + 'cd tests && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && + pytest -v -s lora/test_layers.py && + pytest -v -s lora/test_lora_checkpoints.py && + pytest -v -s lora/test_lora_functions.py && + pytest -v -s lora/test_lora_huggingface.py && + pytest -v -s lora/test_lora_manager.py && + pytest -v -s lora/test_lora_utils.py && + pytest -v -s lora/test_peft_helper.py && + pytest -v -s lora/test_resolver.py && + pytest -v -s lora/test_utils.py && + pytest -v -s lora/test_add_lora.py && + pytest -v -s lora/test_worker.py' + +- label: LoRA Fused/MoE Kernels + timeout_in_minutes: 45 + device: intel_gpu + no_plugin: true + working_dir: "." + env: + REGISTRY: "public.ecr.aws/q9t5s3a7" + REPO: "vllm-ci-test-repo" + VLLM_TEST_DEVICE: "xpu" + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - >- + bash .buildkite/scripts/hardware_ci/run-intel-test.sh + 'cd tests && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && + pytest -v -s lora/test_fused_moe_lora_kernel.py && + pytest -v -s lora/test_moe_lora_align_sum.py --deselect="tests/lora/test_moe_lora_align_sum.py::test_moe_lora_align_block_size_mixed_base_and_lora[1]"' + +- label: LoRA Punica Kernels + timeout_in_minutes: 45 + device: intel_gpu + no_plugin: true + working_dir: "." + env: + REGISTRY: "public.ecr.aws/q9t5s3a7" + REPO: "vllm-ci-test-repo" + VLLM_TEST_DEVICE: "xpu" + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - >- + bash .buildkite/scripts/hardware_ci/run-intel-test.sh + 'cd tests && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && + set -o pipefail && + pytest -v -s lora/test_punica_ops.py --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[expand-0-xpu:0-dtype0-3-43264-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype1-1-2049-64-128-16]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-128-1-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-256-1-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-256-8-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[expand-0-xpu:0-dtype0-3-2049-128-8-16]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-128-8-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels[expand-0-xpu:0-dtype1-1-2049-256-128-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype0-3-64256-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype1-2-29696-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype1-3-49408-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype0-2-16384-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[expand-0-xpu:0-dtype0-2-51328-32-4-4]"' + +- label: LoRA Punica FP8/XPU Ops + timeout_in_minutes: 45 + device: intel_gpu + no_plugin: true + working_dir: "." + env: + REGISTRY: "public.ecr.aws/q9t5s3a7" + REPO: "vllm-ci-test-repo" + VLLM_TEST_DEVICE: "xpu" + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - >- + bash .buildkite/scripts/hardware_ci/run-intel-test.sh + 'cd tests && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && + pytest -v -s lora/test_punica_ops_fp8.py && + pytest -v -s lora/test_punica_xpu_ops.py' + +- label: LoRA Models + timeout_in_minutes: 45 + device: intel_gpu + no_plugin: true + working_dir: "." + env: + REGISTRY: "public.ecr.aws/q9t5s3a7" + REPO: "vllm-ci-test-repo" + VLLM_TEST_DEVICE: "xpu" + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - >- + bash .buildkite/scripts/hardware_ci/run-intel-test.sh + 'cd tests && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && + (pytest -v -s lora/test_mixtral.py --deselect="tests/lora/test_mixtral.py::test_mixtral_lora[4]" || true) && + pytest -v -s lora/test_quant_model.py --deselect="tests/lora/test_quant_model.py::test_quant_model_lora[model0]" --deselect="tests/lora/test_quant_model.py::test_quant_model_lora[model1]" --deselect="tests/lora/test_quant_model.py::test_quant_model_tp_equality[model0]" && + pytest -v -s lora/test_transformers_model.py && + pytest -v -s lora/test_chatglm3_tp.py && + pytest -s -v lora/test_minicpmv_tp.py' + +- label: LoRA Multimodal + timeout_in_minutes: 45 + device: intel_gpu + no_plugin: true + working_dir: "." + env: + REGISTRY: "public.ecr.aws/q9t5s3a7" + REPO: "vllm-ci-test-repo" + VLLM_TEST_DEVICE: "xpu" + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - >- + bash .buildkite/scripts/hardware_ci/run-intel-test.sh + 'cd tests && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && + pytest -v -s lora/test_default_mm_loras.py && + pytest -v -s lora/test_whisper.py' diff --git a/.buildkite/intel_jobs/misc_intel.yaml b/.buildkite/intel_jobs/misc_intel.yaml new file mode 100644 index 000000000000..864128bb5338 --- /dev/null +++ b/.buildkite/intel_jobs/misc_intel.yaml @@ -0,0 +1,55 @@ +group: Miscellaneous Intel +depends_on: + - image-build-xpu +steps: +- label: V1 Core + KV + Metrics + timeout_in_minutes: 30 + device: intel_gpu + no_plugin: true + working_dir: "." + env: + REGISTRY: "public.ecr.aws/q9t5s3a7" + REPO: "vllm-ci-test-repo" + VLLM_TEST_DEVICE: "xpu" + source_file_dependencies: + - vllm/ + - tests/v1/core + - tests/v1/executor + - tests/v1/kv_offload + - tests/v1/worker + - tests/v1/kv_connector/unit + - tests/v1/metrics + - tests/entrypoints/openai/correctness/test_lmeval.py + commands: + - >- + bash .buildkite/scripts/hardware_ci/run-intel-test.sh + 'pip install -r requirements/kv_connectors.txt && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && + cd tests && + pytest -v -s v1/executor' + +- label: V1 Sample + Logits + timeout_in_minutes: 30 + device: intel_gpu + no_plugin: true + working_dir: "." + env: + REGISTRY: "public.ecr.aws/q9t5s3a7" + REPO: "vllm-ci-test-repo" + VLLM_TEST_DEVICE: "xpu" + source_file_dependencies: + - vllm/ + - tests/v1/sample + - tests/v1/logits_processors + - tests/v1/test_oracle.py + - tests/v1/test_request.py + - tests/v1/test_outputs.py + commands: + - >- + bash .buildkite/scripts/hardware_ci/run-intel-test.sh + 'export VLLM_WORKER_MULTIPROC_METHOD=spawn && + cd tests && + pytest -v -s v1/logits_processors --ignore=v1/logits_processors/test_custom_online.py --ignore=v1/logits_processors/test_custom_offline.py && + pytest -v -s v1/test_oracle.py && + pytest -v -s v1/test_request.py && + pytest -v -s v1/test_outputs.py' diff --git a/.buildkite/intel_jobs/test-intel.yaml b/.buildkite/intel_jobs/test-intel.yaml index c59be699502f..c14a6f0f4f81 100644 --- a/.buildkite/intel_jobs/test-intel.yaml +++ b/.buildkite/intel_jobs/test-intel.yaml @@ -36,9 +36,12 @@ steps: python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN && python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8 && python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --kv-cache-dtype fp8 && + python3 examples/basic/offline_inference/generate.py --model nvidia/Llama-3.1-8B-Instruct-FP8 --block-size 64 --enforce-eager --quantization modelopt --kv-cache-dtype fp8 --attention-backend TRITON_ATTN --max-model-len 4096 && python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager --max-model-len 8192 && python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 && - python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel' + python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel && + python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --max-model-len 8192 + ' - label: "XPU V1 test" depends_on: - image-build-xpu @@ -61,5 +64,5 @@ steps: pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py --ignore=v1/worker/test_worker_memory_snapshot.py && pytest -v -s v1/structured_output && pytest -v -s v1/test_serial_utils.py && - pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py && - pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py --ignore=v1/kv_connector/unit/test_hf3fs_client.py --ignore=v1/kv_connector/unit/test_hf3fs_connector.py --ignore=v1/kv_connector/unit/test_hf3fs_metadata_server.py' + pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py && + pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py --ignore=v1/kv_connector/unit/test_hf3fs_client.py --ignore=v1/kv_connector/unit/test_hf3fs_connector.py --ignore=v1/kv_connector/unit/test_hf3fs_metadata_server.py --ignore=v1/kv_connector/unit/test_offloading_connector.py' diff --git a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh index 518af9a66018..b495c0d123a6 100755 --- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh @@ -2,7 +2,7 @@ # We can use this script to compute baseline accuracy on chartqa for vllm. # # Make sure you have lm-eval-harness installed: -# pip install "lm-eval[api]>=0.4.11" +# pip install "lm-eval[api]>=0.4.12" usage() { echo`` diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh index f010ffe6752d..e430e6183b2d 100755 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh @@ -2,7 +2,7 @@ # We can use this script to compute baseline accuracy on GSM for transformers. # # Make sure you have lm-eval-harness installed: -# pip install "lm-eval[api]>=0.4.11" +# pip install "lm-eval[api]>=0.4.12" usage() { echo`` diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh index fec4a94e63e4..f1a541ddbefc 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -3,7 +3,7 @@ # We use this for fp8, which HF does not support. # # Make sure you have lm-eval-harness installed: -# pip install "lm-eval[api]>=0.4.11" +# pip install "lm-eval[api]>=0.4.12" usage() { echo`` diff --git a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh index e3c6e16bd6b3..ba8da9fc3f55 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh @@ -3,7 +3,7 @@ # We use this for fp8, which HF does not support. # # Make sure you have lm-eval-harness installed: -# pip install "lm-eval[api]>=0.4.11" +# pip install "lm-eval[api]>=0.4.12" usage() { echo`` diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json index 6c4591f05b3b..34c2cc82d395 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json @@ -31,30 +31,9 @@ } }, "tests": [ - { - "test_name": "serving_llama8B_tp1_sharegpt", - "server_parameters": { - "tensor_parallel_size": 1 - }, - "client_parameters": { - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json" - } - }, - { - "test_name": "serving_llama8B_tp2_sharegpt", - "server_parameters": { - "tensor_parallel_size": 2 - }, - "client_parameters": { - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json" - } - }, { "test_name": "serving_llama8B_tp1_random_128_128", "server_parameters": { - "tensor_parallel_size": 1 }, "client_parameters": { "dataset_name": "random", @@ -63,290 +42,244 @@ } }, { - "test_name": "serving_llama8B_tp2_random_128_128", + "test_name": "serving_llama8B_int4_tp1_random_128_128", "server_parameters": { - "tensor_parallel_size": 2 + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4" }, "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_llama8B_tp4_random_128_128", + "test_name": "serving_llama8B_int8_tp1_random_128_128", "server_parameters": { - "tensor_parallel_size": 4 + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8" }, "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_llama8B_tp1_random_128_2048", - "server_parameters": { - "tensor_parallel_size": 1 - }, - "client_parameters": { - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 2048 - } - }, - { - "test_name": "serving_llama8B_tp2_random_128_2048", + "test_name": "serving_llama1B_tp1_random_128_128", "server_parameters": { - "tensor_parallel_size": 2 + "model": "meta-llama/Llama-3.2-1B" }, "client_parameters": { + "model": "meta-llama/Llama-3.2-1B", "dataset_name": "random", "random-input-len": 128, - "random-output-len": 2048 + "random-output-len": 128 } }, { - "test_name": "serving_llama8B_tp4_random_128_2048", + "test_name": "serving_llama3B_tp1_random_128_128", "server_parameters": { - "tensor_parallel_size": 4 + "model": "meta-llama/Llama-3.2-3B-Instruct" }, "client_parameters": { + "model": "meta-llama/Llama-3.2-3B-Instruct", "dataset_name": "random", "random-input-len": 128, - "random-output-len": 2048 - } - }, - { - "test_name": "serving_llama8B_tp1_random_2048_128", - "server_parameters": { - "tensor_parallel_size": 1 - }, - "client_parameters": { - "dataset_name": "random", - "random-input-len": 2048, "random-output-len": 128 } }, { - "test_name": "serving_llama8B_tp2_random_2048_128", + "test_name": "serving_llama70B_tp1_random_128_128", "server_parameters": { - "tensor_parallel_size": 2 + "model": "meta-llama/Llama-3.3-70B-Instruct" }, "client_parameters": { + "model": "meta-llama/Llama-3.3-70B-Instruct", "dataset_name": "random", - "random-input-len": 2048, + "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_llama8B_tp4_random_2048_128", + "test_name": "serving_granite2B_tp1_random_128_128", "server_parameters": { - "tensor_parallel_size": 4 + "model": "ibm-granite/granite-3.2-2b-instruct" }, "client_parameters": { + "model": "ibm-granite/granite-3.2-2b-instruct", "dataset_name": "random", - "random-input-len": 2048, + "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_llama8B_tp1_random_2048_2048", - "server_parameters": { - "tensor_parallel_size": 1 - }, - "client_parameters": { - "dataset_name": "random", - "random-input-len": 2048, - "random-output-len": 2048 - } - }, - { - "test_name": "serving_llama8B_tp2_random_2048_2048", - "server_parameters": { - "tensor_parallel_size": 2 - }, - "client_parameters": { - "dataset_name": "random", - "random-input-len": 2048, - "random-output-len": 2048 - } - }, - { - "test_name": "serving_llama8B_tp4_random_2048_2048", - "server_parameters": { - "tensor_parallel_size": 4 - }, - "client_parameters": { - "dataset_name": "random", - "random-input-len": 2048, - "random-output-len": 2048 - } - }, - { - "test_name": "serving_llama8B_int4_tp1_random_128_128", + "test_name": "serving_qwen1.7B_tp1_random_128_128", "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "tensor_parallel_size": 1 + "model": "Qwen/Qwen3-1.7B" }, "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "model": "Qwen/Qwen3-1.7B", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_llama8B_int4_tp2_random_128_128", + "test_name": "serving_qwen4B_tp1_random_128_128", "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "tensor_parallel_size": 2 + "model": "Qwen/Qwen3-4B" }, "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "model": "Qwen/Qwen3-4B", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_llama8B_int4_tp4_random_128_128", + "test_name": "serving_qwen8B_tp1_random_128_128", "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "tensor_parallel_size": 4 + "model": "Qwen/Qwen3-8B" }, "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "model": "Qwen/Qwen3-8B", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_llama8B_int8_tp1_random_128_128", + "test_name": "serving_qwen14B_tp1_random_128_128", "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 1 + "model": "Qwen/Qwen3-14B" }, "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "model": "Qwen/Qwen3-14B", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_llama8B_int8_tp2_random_128_128", + "test_name": "serving_qwen30B_tp1_random_128_128", "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 2 + "model": "Qwen/Qwen3-30B-A3B" }, "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "model": "Qwen/Qwen3-30B-A3B", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_llama8B_int8_tp4_random_128_128", + "test_name": "serving_glm9B_tp1_random_128_128", "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 4 + "model": "zai-org/glm-4-9b-hf" }, "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "model": "zai-org/glm-4-9b-hf", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_llama3B_tp1_random_128_128", + "test_name": "serving_gemma7B_tp1_random_128_128", "server_parameters": { - "model": "meta-llama/Llama-3.2-3B-Instruct", - "tensor_parallel_size": 1 + "model": "google/gemma-7b" }, "client_parameters": { - "model": "meta-llama/Llama-3.2-3B-Instruct", + "model": "google/gemma-7b", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_granite2B_tp1_random_128_128", + "test_name": "serving_gemma3-4b_tp1_random_128_128", + "server_environment_variables": { + "VLLM_CPU_SGL_KERNEL": 0 + }, "server_parameters": { - "model": "ibm-granite/granite-3.2-2b-instruct", - "tensor_parallel_size": 1 + "model": "google/gemma-3-4b-it" }, "client_parameters": { - "model": "ibm-granite/granite-3.2-2b-instruct", + "model": "google/gemma-3-4b-it", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_qwen1.7B_tp1_random_128_128", + "test_name": "serving_gemma3-12b_tp1_random_128_128", + "server_environment_variables": { + "VLLM_CPU_SGL_KERNEL": 0 + }, "server_parameters": { - "model": "Qwen/Qwen3-1.7B", - "tensor_parallel_size": 1 + "model": "google/gemma-3-12b-it" }, "client_parameters": { - "model": "Qwen/Qwen3-1.7B", + "model": "google/gemma-3-12b-it", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_qwen4B_tp1_random_128_128", + "test_name": "serving_gemma4-4b_tp1_random_128_128", + "server_environment_variables": { + "VLLM_CPU_SGL_KERNEL": 0 + }, "server_parameters": { - "model": "Qwen/Qwen3-4B", - "tensor_parallel_size": 1 + "model": "google/gemma-4-E4B-it" }, "client_parameters": { - "model": "Qwen/Qwen3-4B", + "model": "google/gemma-4-E4B-it", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_qwen8B_tp1_random_128_128", + "test_name": "serving_gemma4-2b_tp1_random_128_128", + "server_environment_variables": { + "VLLM_CPU_SGL_KERNEL": 0 + }, "server_parameters": { - "model": "Qwen/Qwen3-8B", - "tensor_parallel_size": 1 + "model": "google/gemma-4-E2B-it" }, "client_parameters": { - "model": "Qwen/Qwen3-8B", + "model": "google/gemma-4-E2B-it", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_glm9B_tp1_random_128_128", + "test_name": "serving_gemma4-26b_tp1_random_128_128", + "server_environment_variables": { + "VLLM_CPU_SGL_KERNEL": 0, + "VLLM_CPU_ATTN_SPLIT_KV": 0 + }, "server_parameters": { - "model": "zai-org/glm-4-9b-hf", - "tensor_parallel_size": 1 + "model": "google/gemma-4-26B-A4B-it" }, "client_parameters": { - "model": "zai-org/glm-4-9b-hf", + "model": "google/gemma-4-26B-A4B-it", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_gemma7B_tp1_random_128_128", + "test_name": "serving_phi4_tp1_random_128_128", "server_parameters": { - "model": "google/gemma-7b", - "tensor_parallel_size": 1 + "model": "microsoft/Phi-4-reasoning" }, "client_parameters": { - "model": "google/gemma-7b", + "model": "microsoft/Phi-4-reasoning", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index b3a6bb8ed4cf..df9b80f7f9a8 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -1,3 +1,16 @@ +# CUDA architecture lists — following PyTorch RELEASE.md +# (https://github.com/pytorch/pytorch/blob/main/RELEASE.md) +# SM86 included for broader Ampere coverage; SM89 for marlin fp8 support +env: + CUDA_ARCH_X86: "7.5 8.0 8.6 8.9 9.0 10.0 12.0+PTX" + # aarch64 only architectures: 8.7 for Orin, 11.0 for Thor (since CUDA 13) + CUDA_ARCH_AARCH64: "8.0 8.7 8.9 9.0 10.0 11.0 12.0+PTX" + CUDA_ARCH_X86_CU129: "7.5 8.0 8.6 8.9 9.0 10.0 12.0" + CUDA_ARCH_AARCH64_CU129: "8.0 8.7 8.9 9.0 10.0 12.0" + MOONCAKE_WHEEL_AARCH64_2_35: "https://vllm-wheels.s3.amazonaws.com/mooncake/mooncake_transfer_engine-0.3.10.post2-0da9dfea3-cp312-cp312-manylinux_2_35_aarch64.whl" + MOONCAKE_WHEEL_AARCH64_2_39: "https://vllm-wheels.s3.amazonaws.com/mooncake/mooncake_transfer_engine-0.3.10.post2-0da9dfea3-cp312-cp312-manylinux_2_39_aarch64.whl" + MOONCAKE_WHEEL_X86_64: "https://vllm-wheels.s3.amazonaws.com/mooncake/mooncake_transfer_engine-0.3.10.post2-0da9dfea3-cp312-cp312-manylinux_2_35_x86_64.whl" + steps: - input: "Provide Release version here" id: input-release-version @@ -14,12 +27,11 @@ steps: agents: queue: arm64_cpu_queue_release commands: - # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: - # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64_CU129}\" --build-arg BUILD_OS=manylinux --build-arg BUILD_BASE_IMAGE=pytorch/manylinuxaarch64-builder:cuda12.9 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-nightly-wheels.sh" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "s3://vllm-wheels/$$BUILDKITE_COMMIT/$(cd artifacts/dist && echo *.whl)"' env: DOCKER_BUILDKIT: "1" @@ -29,12 +41,11 @@ steps: agents: queue: arm64_cpu_queue_release commands: - # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: - # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64}\" --build-arg BUILD_OS=manylinux --build-arg BUILD_BASE_IMAGE=pytorch/manylinuxaarch64-builder:cuda13.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35" + - "bash .buildkite/scripts/upload-nightly-wheels.sh" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "s3://vllm-wheels/$$BUILDKITE_COMMIT/$(cd artifacts/dist && echo *.whl)"' env: DOCKER_BUILDKIT: "1" @@ -47,7 +58,8 @@ steps: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35" + - "bash .buildkite/scripts/upload-nightly-wheels.sh" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "s3://vllm-wheels/$$BUILDKITE_COMMIT/$(cd artifacts/dist && echo *.whl)"' env: DOCKER_BUILDKIT: "1" @@ -57,10 +69,11 @@ steps: agents: queue: cpu_queue_release commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86_CU129}\" --build-arg BUILD_OS=manylinux --build-arg BUILD_BASE_IMAGE=pytorch/manylinux2_28-builder:cuda12.9 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31" + - "bash .buildkite/scripts/upload-nightly-wheels.sh" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "s3://vllm-wheels/$$BUILDKITE_COMMIT/$(cd artifacts/dist && echo *.whl)"' env: DOCKER_BUILDKIT: "1" @@ -70,10 +83,11 @@ steps: agents: queue: cpu_queue_release commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86}\" --build-arg BUILD_OS=manylinux --build-arg BUILD_BASE_IMAGE=pytorch/manylinux2_28-builder:cuda13.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35" + - "bash .buildkite/scripts/upload-nightly-wheels.sh" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "s3://vllm-wheels/$$BUILDKITE_COMMIT/$(cd artifacts/dist && echo *.whl)"' env: DOCKER_BUILDKIT: "1" @@ -86,7 +100,8 @@ steps: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35" + - "bash .buildkite/scripts/upload-nightly-wheels.sh" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "s3://vllm-wheels/$$BUILDKITE_COMMIT/$(cd artifacts/dist && echo *.whl)"' env: DOCKER_BUILDKIT: "1" @@ -108,102 +123,226 @@ steps: depends_on: block-build-release-images allow_dependency_failure: true steps: - - label: "Build release image - x86_64 - CUDA 12.9" + - label: "Build release image - x86_64 - CUDA 13.0" depends_on: ~ id: build-release-image-x86 agents: queue: cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." + - | + DOCKER_BUILDKIT=1 docker build \ + $(bash .buildkite/scripts/docker-build-metadata-args.sh) \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg CUDA_VERSION=13.0.2 \ + --build-arg torch_cuda_arch_list="${CUDA_ARCH_X86}" \ + --build-arg INSTALL_KV_CONNECTORS=true \ + --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_35}" \ + --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \ + --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile . - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" # re-tag to default image tag and push, just in case arm64 build fails - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"' - - label: "Build release image - aarch64 - CUDA 12.9" + - label: "Build release image - aarch64 - CUDA 13.0" depends_on: ~ id: build-release-image-arm64 agents: queue: arm64_cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." + - | + DOCKER_BUILDKIT=1 docker build \ + $(bash .buildkite/scripts/docker-build-metadata-args.sh) \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg CUDA_VERSION=13.0.2 \ + --build-arg torch_cuda_arch_list="${CUDA_ARCH_AARCH64}" \ + --build-arg INSTALL_KV_CONNECTORS=true \ + --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_35}" \ + --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \ + --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile . - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"' - - label: "Build release image - x86_64 - CUDA 13.0" + - label: "Build release image - x86_64 - CUDA 12.9" depends_on: ~ - id: build-release-image-x86-cuda-13-0 + id: build-release-image-x86-cuda-12-9 agents: queue: cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ." - - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130" + - | + DOCKER_BUILDKIT=1 docker build \ + $(bash .buildkite/scripts/docker-build-metadata-args.sh cu129) \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg CUDA_VERSION=12.9.1 \ + --build-arg torch_cuda_arch_list="${CUDA_ARCH_X86_CU129}" \ + --build-arg INSTALL_KV_CONNECTORS=true \ + --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_35}" \ + --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile . + - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129" # re-tag to default image tag and push, just in case arm64 build fails - - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130" - - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130" + - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129" + - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129"' - - label: "Build release image - aarch64 - CUDA 13.0" + - label: "Build release image - aarch64 - CUDA 12.9" depends_on: ~ - id: build-release-image-arm64-cuda-13-0 + id: build-release-image-arm64-cuda-12-9 agents: queue: arm64_cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - # compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ." - - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130" + - | + DOCKER_BUILDKIT=1 docker build \ + $(bash .buildkite/scripts/docker-build-metadata-args.sh cu129) \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg CUDA_VERSION=12.9.1 \ + --build-arg torch_cuda_arch_list="${CUDA_ARCH_AARCH64_CU129}" \ + --build-arg INSTALL_KV_CONNECTORS=true \ + --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_35}" \ + --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile . + - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129"' - - label: "Build release image - x86_64 - CUDA 12.9 - Ubuntu 24.04" + - label: "Build release image - x86_64 - CUDA 13.0 - Ubuntu 24.04" depends_on: ~ id: build-release-image-x86-ubuntu2404 agents: queue: cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ." + - | + DOCKER_BUILDKIT=1 docker build \ + $(bash .buildkite/scripts/docker-build-metadata-args.sh ubuntu2404) \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg CUDA_VERSION=13.0.2 \ + --build-arg UBUNTU_VERSION=24.04 \ + --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \ + --build-arg torch_cuda_arch_list="${CUDA_ARCH_X86}" \ + --build-arg INSTALL_KV_CONNECTORS=true \ + --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_39}" \ + --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \ + --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu24.04 \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile . - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404" - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404" - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404"' - - label: "Build release image - aarch64 - CUDA 12.9 - Ubuntu 24.04" + - label: "Build release image - aarch64 - CUDA 13.0 - Ubuntu 24.04" depends_on: ~ id: build-release-image-arm64-ubuntu2404 agents: queue: arm64_cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ." + - | + DOCKER_BUILDKIT=1 docker build \ + $(bash .buildkite/scripts/docker-build-metadata-args.sh ubuntu2404) \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg CUDA_VERSION=13.0.2 \ + --build-arg UBUNTU_VERSION=24.04 \ + --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \ + --build-arg torch_cuda_arch_list="${CUDA_ARCH_AARCH64}" \ + --build-arg INSTALL_KV_CONNECTORS=true \ + --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_39}" \ + --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \ + --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu24.04 \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile . - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404"' - - label: "Build release image - x86_64 - CUDA 13.0 - Ubuntu 24.04" + - label: "Build release image - x86_64 - CUDA 12.9 - Ubuntu 24.04" depends_on: ~ - id: build-release-image-x86-cuda-13-0-ubuntu2404 + id: build-release-image-x86-cuda-12-9-ubuntu2404 agents: queue: cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu24.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ." - - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404" - - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404" - - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404" + - | + DOCKER_BUILDKIT=1 docker build \ + $(bash .buildkite/scripts/docker-build-metadata-args.sh cu129-ubuntu2404) \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg CUDA_VERSION=12.9.1 \ + --build-arg UBUNTU_VERSION=24.04 \ + --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \ + --build-arg torch_cuda_arch_list="${CUDA_ARCH_X86_CU129}" \ + --build-arg INSTALL_KV_CONNECTORS=true \ + --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_39}" \ + --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile . + - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404" + - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129-ubuntu2404" + - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129-ubuntu2404" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404"' - - label: "Build release image - aarch64 - CUDA 13.0 - Ubuntu 24.04" + - label: "Build release image - aarch64 - CUDA 12.9 - Ubuntu 24.04" depends_on: ~ - id: build-release-image-arm64-cuda-13-0-ubuntu2404 + id: build-release-image-arm64-cuda-12-9-ubuntu2404 agents: queue: arm64_cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu24.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ." - - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404" + - | + DOCKER_BUILDKIT=1 docker build \ + $(bash .buildkite/scripts/docker-build-metadata-args.sh cu129-ubuntu2404) \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg CUDA_VERSION=12.9.1 \ + --build-arg UBUNTU_VERSION=24.04 \ + --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \ + --build-arg torch_cuda_arch_list="${CUDA_ARCH_AARCH64_CU129}" \ + --build-arg INSTALL_KV_CONNECTORS=true \ + --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_39}" \ + --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile . + - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404"' - block: "Build release image for x86_64 CPU" key: block-cpu-release-image-build depends_on: ~ - label: "Build release image - x86_64 - CPU" + key: build-cpu-release-image-x86 depends_on: - block-cpu-release-image-build - input-release-version @@ -214,6 +353,7 @@ steps: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest" - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"' env: DOCKER_BUILDKIT: "1" @@ -222,7 +362,8 @@ steps: depends_on: ~ - label: "Build release image - arm64 - CPU" - depends_on: + key: build-cpu-release-image-arm64 + depends_on: - block-arm64-cpu-release-image-build - input-release-version agents: @@ -232,13 +373,14 @@ steps: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest" - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"' env: DOCKER_BUILDKIT: "1" - group: "Publish release images" key: "publish-release-images" steps: - - label: "Create multi-arch manifest - CUDA 12.9" + - label: "Create multi-arch manifest - CUDA 13.0" depends_on: - build-release-image-x86 - build-release-image-arm64 @@ -249,29 +391,22 @@ steps: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend" - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "Manifest: CUDA 13.0" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"' - - label: "Annotate release workflow - CUDA 12.9" - depends_on: - - create-multi-arch-manifest - id: annotate-release-workflow - agents: - queue: small_cpu_queue_release - commands: - - "bash .buildkite/scripts/annotate-release.sh" - - - label: "Create multi-arch manifest - CUDA 13.0" + - label: "Create multi-arch manifest - CUDA 12.9" depends_on: - - build-release-image-x86-cuda-13-0 - - build-release-image-arm64-cuda-13-0 - id: create-multi-arch-manifest-cuda-13-0 + - build-release-image-x86-cuda-12-9 + - build-release-image-arm64-cuda-12-9 + id: create-multi-arch-manifest-cuda-12-9 agents: queue: small_cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend" - - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130" + - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu129 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu129 --amend" + - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "Manifest: CUDA 12.9" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129"' - - label: "Create multi-arch manifest - CUDA 12.9 - Ubuntu 24.04" + - label: "Create multi-arch manifest - CUDA 13.0 - Ubuntu 24.04" depends_on: - build-release-image-x86-ubuntu2404 - build-release-image-arm64-ubuntu2404 @@ -282,18 +417,20 @@ steps: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-ubuntu2404 --amend" - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "Manifest: CUDA 13.0 Ubuntu 24.04" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"' - - label: "Create multi-arch manifest - CUDA 13.0 - Ubuntu 24.04" + - label: "Create multi-arch manifest - CUDA 12.9 - Ubuntu 24.04" depends_on: - - build-release-image-x86-cuda-13-0-ubuntu2404 - - build-release-image-arm64-cuda-13-0-ubuntu2404 - id: create-multi-arch-manifest-cuda-13-0-ubuntu2404 + - build-release-image-x86-cuda-12-9-ubuntu2404 + - build-release-image-arm64-cuda-12-9-ubuntu2404 + id: create-multi-arch-manifest-cuda-12-9-ubuntu2404 agents: queue: small_cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130-ubuntu2404 --amend" - - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404" + - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu129-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu129-ubuntu2404 --amend" + - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129-ubuntu2404" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "Manifest: CUDA 12.9 Ubuntu 24.04" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129-ubuntu2404"' - label: "Publish nightly multi-arch image to DockerHub" depends_on: @@ -313,16 +450,16 @@ steps: DOCKER_BUILDKIT: "1" DOCKERHUB_USERNAME: "vllmbot" - - label: "Publish nightly multi-arch image to DockerHub - CUDA 13.0" + - label: "Publish nightly multi-arch image to DockerHub - CUDA 12.9" depends_on: - - create-multi-arch-manifest-cuda-13-0 + - create-multi-arch-manifest-cuda-12-9 if: build.env("NIGHTLY") == "1" agents: queue: small_cpu_queue_release commands: - - "bash .buildkite/scripts/push-nightly-builds.sh cu130" + - "bash .buildkite/scripts/push-nightly-builds.sh cu129" # Clean up old nightly builds (keep only last 14) - - "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-" + - "bash .buildkite/scripts/cleanup-nightly-builds.sh cu129-nightly-" plugins: - docker-login#v3.0.0: username: vllmbot @@ -331,24 +468,6 @@ steps: DOCKER_BUILDKIT: "1" DOCKERHUB_USERNAME: "vllmbot" - - group: "Publish wheels" - key: "publish-wheels" - steps: - - block: "Confirm update release wheels to PyPI (experimental, use with caution)?" - key: block-upload-release-wheels - depends_on: - - input-release-version - - build-wheels - - - label: "Upload release wheels to PyPI" - depends_on: - - block-upload-release-wheels - id: upload-release-wheels - agents: - queue: small_cpu_queue_release - commands: - - "bash .buildkite/scripts/upload-release-wheels-pypi.sh" - # ============================================================================= # ROCm Release Pipeline (x86_64 only) # ============================================================================= @@ -462,7 +581,7 @@ steps: echo "" echo " Build complete - Image and wheels cached" fi - + artifact_paths: - "artifacts/rocm-base-wheels/*.whl" env: @@ -618,7 +737,7 @@ steps: - "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh" env: S3_BUCKET: "vllm-wheels" - VARIANT: "rocm721" + VARIANT: "rocm722" # ROCm Job 6: Build ROCm Release Docker Image - label: ":docker: Build release image - x86_64 - ROCm" @@ -678,7 +797,7 @@ steps: # Push to ECR docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm - + echo "" echo " Successfully built and pushed ROCm release image" echo " Image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm" @@ -705,3 +824,60 @@ steps: env: DOCKER_BUILDKIT: "1" DOCKERHUB_USERNAME: "vllmbot" + + # ============================================================================= + # Publish to DockerHub and PyPI (at the end so all builds complete first) + # ============================================================================= + + - block: "Publish release images to DockerHub" + key: block-publish-release-images + depends_on: + - create-multi-arch-manifest + - create-multi-arch-manifest-cuda-12-9 + - create-multi-arch-manifest-ubuntu2404 + - create-multi-arch-manifest-cuda-12-9-ubuntu2404 + - build-rocm-release-image + - input-release-version + # Wait for CPU builds if their block steps were unblocked, so publish + # doesn't race the in-progress CPU build. allow_failure lets publish + # proceed when the operator legitimately leaves the CPU block steps + # unblocked or the CPU build fails. + - step: build-cpu-release-image-x86 + allow_failure: true + - step: build-cpu-release-image-arm64 + allow_failure: true + if: build.env("NIGHTLY") != "1" + + - label: "Publish release images to DockerHub" + depends_on: + - block-publish-release-images + key: publish-release-images-dockerhub + agents: + queue: small_cpu_queue_release + commands: + - "bash .buildkite/scripts/publish-release-images.sh" + plugins: + - docker-login#v3.0.0: + username: vllmbot + password-env: DOCKERHUB_TOKEN + env: + DOCKER_BUILDKIT: "1" + DOCKERHUB_USERNAME: "vllmbot" + + - group: "Publish wheels" + key: "publish-wheels" + steps: + - block: "Confirm update release wheels to PyPI (experimental, use with caution)?" + key: block-upload-release-wheels + depends_on: + - input-release-version + - build-wheels + + - label: "Upload release wheels to PyPI" + depends_on: + - block-upload-release-wheels + id: upload-release-wheels + agents: + queue: small_cpu_queue_release + commands: + - "bash .buildkite/scripts/upload-release-wheels-pypi.sh" diff --git a/.buildkite/scripts/annotate-build-artifact.sh b/.buildkite/scripts/annotate-build-artifact.sh new file mode 100755 index 000000000000..67cdf7923658 --- /dev/null +++ b/.buildkite/scripts/annotate-build-artifact.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# +# Append a build artifact line to the Buildkite annotation. +# Usage: annotate-build-artifact.sh