From c219c11096cb119690e9d375f33e33596b89e012 Mon Sep 17 00:00:00 2001 From: Micah Williamson Date: Wed, 6 May 2026 14:27:31 +0000 Subject: [PATCH 1/3] check distributed test groups without TORCH_NCCL_BLOCKING_WAIT=1 Signed-off-by: Micah Williamson --- .buildkite/test-amd.yaml | 3071 ++------------------------------------ 1 file changed, 159 insertions(+), 2912 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 5b3eb4f79c5d..a6afcd5bac8d 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -114,101 +114,12 @@ steps: #----------------------------------------------------- mi250 · basic_correctness -----------------------------------------------------# -- label: Distributed Model Tests (2 GPUs) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_2 - num_gpus: 2 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/model_executor/model_loader/sharded_state_loader.py - - vllm/model_executor/models/ - - vllm/model_executor/layers/ - - vllm/v1/attention/backends/ - - vllm/v1/attention/selector.py - - tests/basic_correctness/ - - tests/model_executor/model_loader/test_sharded_state_loader.py - - tests/models/ - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py -m '(not slow_test)' - - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - - pytest models/language -v -s -m 'distributed(num_gpus=2)' - - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py - - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' #-------------------------------------------------------- mi250 · benchmarks ---------------------------------------------------------# -- label: Benchmarks # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - working_dir: "/vllm-workspace/.buildkite" - source_file_dependencies: - - benchmarks/ - - vllm/platforms/rocm.py - commands: - - bash scripts/run-benchmarks.sh #---------------------------------------------------------- mi250 · compile ----------------------------------------------------------# -- label: PyTorch Compilation Unit Tests # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - torch_nightly: true - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/compilation/ - - vllm/model_executor/layers/ - - vllm/v1/worker/ - - vllm/v1/attention/ - - vllm/v1/cudagraph_dispatcher.py - - vllm/config/compilation.py - - csrc/ - - tests/compile - - vllm/platforms/rocm.py - commands: - - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" - -- label: PyTorch Fullgraph # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - optional: true - torch_nightly: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/compilation/ - - vllm/model_executor/ - - vllm/v1/attention/ - - vllm/config/compilation.py - - csrc/ - - tests/compile - - vllm/platforms/rocm.py - commands: - - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' - -- label: PyTorch Fullgraph Smoke Test # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - optional: true - torch_nightly: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/compilation/ - - vllm/model_executor/ - - vllm/v1/attention/ - - vllm/config/compilation.py - - csrc/ - - tests/compile - - vllm/platforms/rocm.py - commands: - - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" - label: Distributed Compile + RPC Tests (2 GPUs) # TBD timeout_in_minutes: 180 @@ -230,28 +141,12 @@ steps: - tests/entrypoints/llm/test_collective_rpc.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py #-------------------------------------------------------- mi250 · distributed --------------------------------------------------------# -- label: Distributed Comm Ops # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_2 - num_gpus: 2 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed - - tests/distributed - - vllm/platforms/rocm.py - commands: - - pytest -v -s distributed/test_comm_ops.py - - pytest -v -s distributed/test_shm_broadcast.py - - pytest -v -s distributed/test_shm_buffer.py - - pytest -v -s distributed/test_shm_storage.py - label: Distributed Torchrun + Shutdown Tests (2 GPUs) # TBD timeout_in_minutes: 180 @@ -272,303 +167,32 @@ steps: - tests/v1/worker/test_worker_memory_snapshot.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s v1/worker/test_worker_memory_snapshot.py -- label: Elastic EP Scaling Test # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_4 - num_gpus: 4 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/compilation/ - - tests/distributed/ - - vllm/platforms/rocm.py - commands: - - pytest -v -s distributed/test_elastic_ep.py - -- label: EPLB Execution # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_4 - num_gpus: 4 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/eplb - - tests/distributed/test_eplb_execute.py - - tests/distributed/test_eplb_spec_decode.py - - vllm/platforms/rocm.py - commands: - - pytest -v -s distributed/test_eplb_execute.py - - pytest -v -s distributed/test_eplb_spec_decode.py - -- label: Pipeline + Context Parallelism (4 GPUs) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_4 - num_gpus: 4 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/model_executor/models/ - - vllm/model_executor/layers/ - - vllm/v1/attention/backends/ - - vllm/v1/attention/selector.py - - tests/distributed/ - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - pytest -v -s distributed/test_pp_cudagraph.py - - pytest -v -s distributed/test_pipeline_parallel.py #---------------------------------------------------------- mi250 · engine -----------------------------------------------------------# -- label: Engine # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/engine - - tests/test_sequence - - tests/test_config - - tests/test_logger - - tests/test_vllm_port - commands: - - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py #----------------------------------------------------------- mi250 · evals -----------------------------------------------------------# -- label: Multi-Modal Accuracy Eval (Small Models) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - optional: true - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - vllm/multimodal/ - - vllm/inputs/ - - vllm/v1/core/ - - vllm/platforms/rocm.py - - vllm/model_executor/model_loader/ - commands: - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 #--------------------------------------------------------- mi250 · examples ----------------------------------------------------------# -- label: Examples # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - optional: true - working_dir: "/vllm-workspace/examples" - source_file_dependencies: - - vllm/entrypoints - - vllm/multimodal - - examples/ - - vllm/platforms/rocm.py - commands: - - pip install tensorizer - # Basic - - python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN - - python3 basic/offline_inference/generate.py --model facebook/opt-125m - - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 basic/offline_inference/classify.py - - python3 basic/offline_inference/embed.py - - python3 basic/offline_inference/score.py - # Multi-modal models - - python3 generate/multimodal/audio_language_offline.py --seed 0 - - python3 generate/multimodal/vision_language_offline.py --seed 0 - - python3 generate/multimodal/vision_language_multi_image_offline.py --seed 0 - - python3 generate/multimodal/encoder_decoder_multimodal_offline.py --model-type whisper --seed 0 - # Pooling models - - python3 pooling/embed/vision_embedding_offline.py --seed 0 - # Features demo - - python3 features/automatic_prefix_caching/prefix_caching_offline.py - - python3 offline_inference/llm_engine_example.py - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 #---------------------------------------------------------- mi250 · kernels ----------------------------------------------------------# -- label: Kernels Core Operation Test # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - csrc/ - - tests/kernels/core - - tests/kernels/test_top_k_per_row.py - - tests/kernels/test_concat_mla_q.py - - vllm/model_executor/layers/rotary_embedding/ - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - pytest -v -s kernels/core --ignore=kernels/core/test_minimax_reduce_rms.py kernels/test_concat_mla_q.py kernels/test_top_k_per_row.py - -- label: Kernels Helion Test # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/utils/import_utils.py - - tests/kernels/helion/ - - vllm/platforms/rocm.py - commands: - - pip install helion==1.0.0 - - pytest -v -s kernels/helion/ - -- label: Kernels Mamba Test # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - csrc/mamba/ - - tests/kernels/mamba - - vllm/model_executor/layers/mamba/ops - - vllm/platforms/rocm.py - commands: - - pytest -v -s kernels/mamba #----------------------------------------------------------- mi250 · lora ------------------------------------------------------------# -- label: LoRA %N # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - parallelism: 4 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/lora - - tests/lora - - vllm/platforms/rocm.py - commands: - - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py --ignore=lora/test_qwen35_densemodel_lora.py #------------------------------------------------------ mi250 · model_executor -------------------------------------------------------# -- label: Model Executor # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - optional: true - torch_nightly: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/engine/arg_utils.py - - vllm/config/model.py - - vllm/model_executor - - tests/model_executor - - tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - apt-get update && apt-get install -y curl libsodium23 - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s model_executor -m '(not slow_test)' - - pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py #---------------------------------------------------------- mi250 · models -----------------------------------------------------------# -- label: Basic Models Test (Other CPU) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - no_gpu: true - optional: true - torch_nightly: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/test_utils.py - - tests/models/test_vision.py - commands: - - pytest -v -s models/test_utils.py models/test_vision.py - -- label: Basic Models Tests (Extra Initialization) %N # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - torch_nightly: true - parallelism: 2 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/model_executor/models/ - - vllm/model_executor/layers/ - - tests/models/test_initialization.py - - tests/models/registry.py - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB - -- label: Basic Models Tests (Initialization) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - torch_nightly: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/test_initialization.py - - tests/models/registry.py - commands: - - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset - -- label: Basic Models Tests (Other) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - torch_nightly: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/test_terratorch.py - - tests/models/test_transformers.py - - tests/models/test_registry.py - commands: - - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py - -- label: Language Models Test (MTEB) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/language/pooling_mteb_test - commands: - - pytest -v -s models/language/pooling_mteb_test - -- label: Language Models Test (PPL) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/language/generation_ppl_test - commands: - - pytest -v -s models/language/generation_ppl_test - label: Language Models Tests (Extra Standard) %N # TBD timeout_in_minutes: 180 @@ -590,264 +214,16 @@ steps: - vllm/platforms/rocm.py commands: - pip freeze | grep -E 'torch' - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB -- label: Multi-Modal Models (Extended Generation 2) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/multimodal/generation - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' -- label: Multi-Modal Models (Extended Pooling) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/multimodal/pooling - commands: - - pytest -v -s models/multimodal/pooling -m 'not core_model' +#---------------------------------------------------------- mi250 · plugins ----------------------------------------------------------# -- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl" # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - torch_nightly: true - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/multimodal - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma" - - pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model -#---------------------------------------------------------- mi250 · plugins ----------------------------------------------------------# +#------------------------------------------------------------ mi250 · v1 -------------------------------------------------------------# -- label: Plugin Tests (2 GPUs) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_2 - num_gpus: 2 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/plugins/ - - tests/plugins/ - - vllm/platforms/rocm.py - commands: - # BEGIN: platform plugin and general plugin tests, all the code in-between runs on dummy platform - - pip install -e ./plugins/vllm_add_dummy_platform - - pytest -v -s plugins_tests/test_platform_plugins.py - - pip uninstall vllm_add_dummy_platform -y - # END: platform plugin tests - # BEGIN: `io_processor` plugins test, all the code in between uses the `prithvi_io_processor` plugin - - pip install -e ./plugins/prithvi_io_processor_plugin - - pytest -v -s plugins_tests/test_io_processor_plugins.py - - pytest -v -s plugins_tests/test_terratorch_io_processor_plugins.py - - pip uninstall prithvi_io_processor_plugin -y - # END: `io_processor` plugins test - # BEGIN: `bge_m3_sparse io_processor` test - - pip install -e ./plugins/bge_m3_sparse_plugin - - pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py - - pip uninstall bge_m3_sparse_plugin -y - # END: `bge_m3_sparse io_processor` test - # BEGIN: `stat_logger` plugins test - - pip install -e ./plugins/vllm_add_dummy_stat_logger - - pytest -v -s plugins_tests/test_stats_logger_plugins.py - - pip uninstall dummy_stat_logger -y - # END: `stat_logger` plugins test - # BEGIN: other tests - - pytest -v -s plugins_tests/test_scheduler_plugins.py - - pip install -e ./plugins/vllm_add_dummy_model - - pytest -v -s distributed/test_distributed_oot.py - - pytest -v -s entrypoints/openai/chat_completion/test_oot_registration.py - - pytest -v -s models/test_oot_registration.py - - pytest -v -s plugins/lora_resolvers - -#------------------------------------------------------------ mi250 · v1 -------------------------------------------------------------# - -- label: Batch Invariance (H100-MI250) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/v1/attention - - vllm/model_executor/layers - - tests/v1/determinism/ - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pip install pytest-timeout pytest-forked - - pytest -v -s v1/determinism/test_batch_invariance.py - - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py - -- label: Cudagraph # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - tests/v1/cudagraph - - vllm/v1/cudagraph_dispatcher.py - - vllm/config/compilation.py - - vllm/compilation - - vllm/platforms/rocm.py - commands: - - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py - - pytest -v -s v1/cudagraph/test_cudagraph_mode.py - -- label: e2e Core (1 GPU) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/v1/ - - tests/v1/e2e/general/ - - vllm/platforms/rocm.py - commands: - - pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py - -- label: e2e Scheduling (1 GPU) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/v1/ - - tests/v1/e2e/general/ - - vllm/platforms/rocm.py - commands: - - pytest -v -s v1/e2e/general/test_async_scheduling.py - -- label: Engine (1 GPU) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/v1/ - - tests/v1/engine/ - - vllm/platforms/rocm.py - commands: - - pytest -v -s v1/engine/test_preprocess_error_handling.py - - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py - -- label: Spec Decode Draft Model # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/v1/spec_decode/ - - vllm/v1/worker/gpu/spec_decode/ - - vllm/model_executor/model_loader/ - - vllm/v1/sample/ - - vllm/model_executor/layers/ - - tests/v1/e2e/spec_decode/ - - vllm/platforms/rocm.py - commands: - - pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference" - -- label: Spec Decode Speculators + MTP # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/v1/spec_decode/ - - vllm/v1/worker/gpu/spec_decode/ - - vllm/model_executor/model_loader/ - - vllm/v1/sample/ - - vllm/model_executor/layers/ - - vllm/transformers_utils/configs/speculators/ - - tests/v1/e2e/spec_decode/ - - vllm/platforms/rocm.py - commands: - - pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness" - -- label: V1 attention (H100-MI250) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/config/attention.py - - vllm/model_executor/layers/attention - - vllm/v1/attention - - tests/v1/attention - - vllm/_aiter_ops.py - - vllm/envs.py - - vllm/platforms/rocm.py - commands: - - pytest -v -s v1/attention - -- label: V1 Core + KV + Metrics # TBD - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/v1/core - - tests/v1/executor - - tests/v1/kv_offload - - tests/v1/worker - - tests/v1/kv_connector/unit - - tests/v1/metrics - - tests/entrypoints/openai/correctness/test_lmeval.py - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - pytest -v -s -m 'not cpu_test' v1/core - - pytest -v -s v1/executor - - pytest -v -s v1/kv_offload - - pytest -v -s v1/worker - - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'not cpu_test' v1/metrics - - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - -- label: V1 Sample + Logits # TBD - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/v1/sample - - tests/v1/logits_processors - - tests/v1/test_oracle.py - - tests/v1/test_request.py - - tests/v1/test_outputs.py - commands: - - pytest -v -s v1/sample - - pytest -v -s v1/logits_processors - - pytest -v -s v1/test_oracle.py - - pytest -v -s v1/test_request.py - - pytest -v -s v1/test_outputs.py - -- label: Distributed DP Tests (2 GPUs) # TBD + +- label: Distributed DP Tests (2 GPUs) # TBD timeout_in_minutes: 180 mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_2 @@ -865,88 +241,14 @@ steps: - tests/entrypoints/openai/test_multi_api_servers.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py -- label: NixlConnector PD + Spec Decode acceptance (2 GPUs) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_2 - num_gpus: 2 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - vllm/v1/worker/kv_connector_model_runner_mixin.py - - tests/v1/kv_connector/nixl_integration/ - - vllm/platforms/rocm.py - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh - -- label: V1 e2e (2 GPUs) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_2 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/v1/e2e - commands: - - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism" - -- label: Distributed NixlConnector PD accuracy (4 GPUs) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_4 - num_gpus: 4 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - tests/v1/kv_connector/nixl_integration/ - - vllm/platforms/rocm.py - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh #------------------------------------------------------------- mi250 · misc ------------------------------------------------------------# -- label: Async Engine, Inputs, Utils, Worker, Config (CPU) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - optional: true - no_gpu: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/test_inputs.py - - tests/test_outputs.py - - tests/test_pooling_params.py - - tests/test_ray_env.py - - tests/multimodal - - tests/renderers - - tests/standalone_tests/lazy_imports.py - - tests/tokenizers_ - - tests/tool_parsers - - tests/transformers_utils - - tests/config - commands: - - python3 standalone_tests/lazy_imports.py - - pytest -v -s test_inputs.py - - pytest -v -s test_outputs.py - - pytest -v -s test_pooling_params.py - - pytest -v -s test_ray_env.py - - pytest -v -s -m 'cpu_test' multimodal - - pytest -v -s renderers - - pytest -v -s tokenizers_ - - pytest -v -s reasoning --ignore=reasoning/test_seedoss_reasoning_parser.py --ignore=reasoning/test_glm4_moe_reasoning_parser.py --ignore=reasoning/test_gemma4_reasoning_parser.py - - pytest -v -s tool_parsers - - pytest -v -s transformers_utils - - pytest -v -s config ######################################################################################################################################### # # @@ -956,206 +258,21 @@ steps: #----------------------------------------------------- mi300 · basic_correctness -----------------------------------------------------# -- label: Basic Correctness # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - fast_check: true - torch_nightly: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/basic_correctness/test_basic_correctness - - tests/basic_correctness/test_cpu_offload - - tests/basic_correctness/test_cumem.py - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s basic_correctness/test_cumem.py - - pytest -v -s basic_correctness/test_basic_correctness.py - - pytest -v -s basic_correctness/test_cpu_offload.py - -- label: Distributed Model Tests (2 GPUs) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_2 - num_gpus: 2 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/model_executor/model_loader/sharded_state_loader.py - - vllm/model_executor/models/ - - vllm/model_executor/layers/ - - vllm/v1/attention/backends/ - - vllm/v1/attention/selector.py - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - - tests/basic_correctness/ - - tests/model_executor/model_loader/test_sharded_state_loader.py - - tests/models/ - commands: - - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py -m '(not slow_test)' - - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - - pytest models/language -v -s -m 'distributed(num_gpus=2)' - - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/generation/test_phi4siglip.py - - pytest models/multimodal/generation/test_phi4siglip.py -v -s -m 'distributed(num_gpus=2)' - - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' #-------------------------------------------------------- mi300 · benchmarks ---------------------------------------------------------# -- label: Benchmarks # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - optional: true - working_dir: "/vllm-workspace/.buildkite" - source_file_dependencies: - - benchmarks/ - - vllm/platforms/rocm.py - commands: - - bash scripts/run-benchmarks.sh - -- label: Benchmarks CLI Test # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/benchmarks/ - commands: - - pytest -v -s benchmarks/ #---------------------------------------------------------- mi300 · compile ----------------------------------------------------------# -- label: Fusion E2E Config Sweep (H100-MI300) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - num_gpus: 1 - working_dir: "/vllm-workspace/" - source_file_dependencies: - - csrc/quantization/ - - vllm/compilation/ - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/attention/attention.py - - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/fusions_e2e/ - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - rocm-smi - - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3" - -- label: Fusion E2E Quick (H100-MI300) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - num_gpus: 1 - working_dir: "/vllm-workspace/" - source_file_dependencies: - - csrc/quantization/ - - vllm/model_executor/ - - vllm/v1/attention/ - - vllm/compilation/ - - tests/compile/fusions_e2e/ - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - rocm-smi - # Run all models and attn backends but only Inductor partition and native custom ops - - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" - # Different from CUDA, Qwen requires +rms_norm and +quant_fp8 as rms+quant fusion is only supported on AITER - - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and +rms_norm and +quant_fp8 and qwen3'" - -- label: PyTorch Compilation Passes Unit Tests # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/compile/passes - commands: - - pytest -s -v compile/passes --ignore compile/passes/distributed - -- label: Pytorch Nightly Dependency Override Check # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - optional: true - soft_fail: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - requirements/test/nightly-torch.txt - - vllm/platforms/rocm.py - commands: - - bash standalone_tests/pytorch_nightly_dependency.sh - -- label: Distributed Compile Unit Tests (2xH100-2xMI300) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_2 - num_gpus: 2 - working_dir: "/vllm-workspace/" - source_file_dependencies: - - vllm/compilation/ - - vllm/model_executor/layers - - tests/compile/passes/distributed/ - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py - - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py - - pytest -v -s tests/compile/passes/distributed/test_tp2_ar_rms.py::test_tp2_ar_rms_fusions #----------------------------------------------------------- mi300 · cuda ------------------------------------------------------------# -- label: Platform Tests (CUDA) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/cuda - commands: - - pytest -v -s cuda/test_cuda_context.py - - pytest -v -s cuda/test_platform_no_cuda_init.py #-------------------------------------------------------- mi300 · detokenizer --------------------------------------------------------# -- label: Async Engine, Inputs, Utils, Worker # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/detokenizer - - tests/multimodal - - tests/utils_ - commands: - - pytest -v -s detokenizer - - pytest -v -s -m 'not cpu_test' multimodal - - pytest -v -s utils_ #-------------------------------------------------------- mi300 · distributed --------------------------------------------------------# -- label: EPLB Algorithm # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/eplb - - tests/distributed/test_eplb_algo.py - - vllm/platforms/rocm.py - commands: - - pytest -v -s distributed/test_eplb_algo.py - - pytest -v -s distributed/test_eplb_utils.py - label: Distributed Tests (2xH100-2xMI250) # TBD timeout_in_minutes: 180 @@ -1174,7 +291,6 @@ steps: - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s tests/distributed/test_context_parallel.py - VLLM_LOGGING_LEVEL=DEBUG python3 examples/features/data_parallel/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization @@ -1188,7 +304,6 @@ steps: source_file_dependencies: - vllm/ commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s distributed/test_custom_all_reduce.py - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' @@ -1208,7 +323,6 @@ steps: - tests/examples/features/data_parallel/data_parallel_offline.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py @@ -1220,22 +334,6 @@ steps: - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 ../examples/rl/rlhf_nccl.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 ../examples/rl/rlhf_ipc.py -- label: Elastic EP Scaling Test # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_4 - num_gpus: 4 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/compilation/ - - tests/distributed/ - - vllm/platforms/rocm.py - commands: - - pytest -v -s distributed/test_elastic_ep.py - label: RayExecutorV2 (4 GPUs) # TBD timeout_in_minutes: 180 @@ -1254,7 +352,6 @@ steps: - vllm/platforms/rocm.py commands: - export VLLM_USE_RAY_V2_EXECUTOR_BACKEND=1 - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s distributed/test_ray_v2_executor.py - pytest -v -s distributed/test_ray_v2_executor_e2e.py - pytest -v -s distributed/test_pipeline_parallel.py -k "ray" @@ -1276,1947 +373,251 @@ steps: - vllm/v1/worker/gpu_worker.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - torchrun --nproc-per-node=8 ../examples/features/torchrun/torchrun_dp_example_offline.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep #-------------------------------------------------------- mi300 · entrypoints --------------------------------------------------------# -- label: Entrypoints Integration (API Server 2) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - fast_check: true - torch_nightly: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/entrypoints/rpc - - tests/entrypoints/serve/instrumentator - - tests/tool_use - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/serve/instrumentator - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - - pytest -v -s tool_use -- label: Entrypoints Integration (API Server openai - Part 1) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - fast_check: true - torch_nightly: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/entrypoints/openai - - tests/entrypoints/test_chat_utils - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py - -- label: Entrypoints Integration (API Server openai - Part 2) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - fast_check: true - torch_nightly: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/entrypoints/openai - - tests/entrypoints/test_chat_utils - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py - - pytest -v -s entrypoints/openai/speech_to_text/ - - pytest -v -s entrypoints/test_chat_utils.py - -- label: Entrypoints Integration (API Server openai - Part 3) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - fast_check: true - torch_nightly: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/entrypoints/openai - - tests/entrypoints/test_chat_utils - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion --ignore=entrypoints/openai/completion --ignore=entrypoints/openai/speech_to_text/ --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py - -- label: Entrypoints Integration (LLM) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - optional: true - fast_check: true - torch_nightly: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/entrypoints/llm - - tests/entrypoints/offline_mode - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - - pytest -v -s entrypoints/llm/test_generate.py - - pytest -v -s entrypoints/offline_mode - -- label: Entrypoints Integration (Pooling) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - fast_check: true - torch_nightly: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/entrypoints/pooling - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/pooling - -- label: Entrypoints Integration (Responses API) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - fast_check: true - torch_nightly: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/entrypoints/openai/responses - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai/responses - -- label: Entrypoints Unit Tests # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - fast_check: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/entrypoints - - tests/entrypoints/ - - vllm/platforms/rocm.py - commands: - - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - -- label: OpenAI API correctness # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - csrc/ - - vllm/entrypoints/openai/ - - vllm/model_executor/models/whisper.py - - vllm/model_executor/layers/ - - vllm/v1/attention/backends/ - - vllm/v1/attention/selector.py - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - - vllm/model_executor/model_loader/ - commands: - - bash ../tools/install_torchcodec_rocm.sh || exit 1 - - pytest -s entrypoints/openai/correctness/ - -#----------------------------------------------------------- mi300 · evals -----------------------------------------------------------# - -- label: DeepSeek V2-Lite Prefetch Offload Accuracy (H100-MI300) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - num_gpus: 1 - working_dir: "/vllm-workspace" - source_file_dependencies: - - vllm/model_executor/models/ - - vllm/model_executor/model_loader/ - - vllm/model_executor/layers/fused_moe/ - - vllm/model_executor/layers/quantization/ - - vllm/v1/attention/backends/ - - vllm/v1/attention/backends/mla/ - - vllm/v1/attention/selector.py - - .buildkite/scripts/scheduled_integration_test/ - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh 0.25 200 8030 - -- label: LM Eval Small Models # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - - vllm/model_executor/models/ - - vllm/model_executor/model_loader/ - - vllm/v1/attention/backends/ - - vllm/v1/attention/selector.py - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - -- label: LM Eval Small Models (MI300) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - - vllm/model_executor/models/ - - vllm/model_executor/model_loader/ - - vllm/v1/attention/backends/ - - vllm/v1/attention/selector.py - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small-rocm.txt - -- label: GPQA Eval (GPT-OSS) (2xH100-2xMI300) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_2 - num_gpus: 2 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - - vllm/model_executor/models/ - - vllm/model_executor/model_loader/ - - vllm/v1/attention/backends/ - - vllm/v1/attention/selector.py - - vllm/model_executor/layers/fused_moe/ - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - - tests/evals/gpt_oss/ - commands: - - uv pip install --system 'gpt-oss[eval]==0.0.5' - - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-gfx942.txt - -- label: LM Eval Small Models (2xB200-2xMI300) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_2 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - - vllm/model_executor/models/ - - vllm/model_executor/model_loader/ - - vllm/v1/attention/backends/ - - vllm/v1/attention/selector.py - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8-and-mixed.txt - -- label: DeepSeek V2-Lite Accuracy (4xH100-4xMI300) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_4 - num_gpus: 4 - optional: true - working_dir: "/vllm-workspace" - source_file_dependencies: - - vllm/model_executor/models/ - - vllm/model_executor/model_loader/ - - vllm/distributed/eplb - - vllm/model_executor/layers/fused_moe/ - - vllm/model_executor/layers/quantization/ - - vllm/v1/attention/backends/ - - vllm/v1/attention/backends/mla/ - - vllm/v1/attention/selector.py - - .buildkite/scripts/scheduled_integration_test/ - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 - -- label: LM Eval Large Models (4xA100-4xMI300) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_4 - num_gpus: 4 - optional: true - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - - vllm/model_executor/models/ - - vllm/model_executor/model_loader/ - - vllm/v1/attention/backends/ - - vllm/v1/attention/selector.py - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - -- label: Qwen3-30B-A3B-FP8-block Accuracy (4xH100-4xMI300) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_4 - optional: true - working_dir: "/vllm-workspace" - source_file_dependencies: - - vllm/model_executor/models/ - - vllm/model_executor/model_loader/ - - vllm/model_executor/layers/quantization/ - - vllm/distributed/eplb - - vllm/model_executor/layers/fused_moe/ - - vllm/v1/attention/backends/ - - vllm/v1/attention/selector.py - - .buildkite/scripts/scheduled_integration_test/ - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 - -- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_4 - num_gpus: 4 - optional: true - working_dir: "/vllm-workspace" - source_file_dependencies: - - vllm/model_executor/models/ - - vllm/model_executor/model_loader/ - - vllm/v1/spec_decode/ - - vllm/distributed/eplb - - vllm/model_executor/layers/fused_moe/ - - vllm/model_executor/layers/quantization/ - - vllm/v1/attention/backends/ - - vllm/v1/attention/selector.py - - .buildkite/scripts/scheduled_integration_test/ - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 - -- label: LM Eval Large Models (8xH200-8xMI300) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_8 - optional: true - num_gpus: 8 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/model_executor/models/ - - vllm/model_executor/model_loader/ - - vllm/model_executor/layers/quantization/ - - vllm/v1/attention/backends/ - - vllm/v1/attention/selector.py - - vllm/model_executor/layers/layernorm.py - - csrc/ - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - - tests/evals/ - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx.txt - -#--------------------------------------------------------- mi300 · examples ----------------------------------------------------------# - -- label: Examples # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - optional: true - working_dir: "/vllm-workspace/examples" - source_file_dependencies: - - vllm/entrypoints - - vllm/multimodal - - examples/ - - vllm/platforms/rocm.py - commands: - - pip install tensorizer - # Basic - - python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN - - python3 basic/offline_inference/generate.py --model facebook/opt-125m - - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 basic/offline_inference/classify.py - - python3 basic/offline_inference/embed.py - - python3 basic/offline_inference/score.py - # Multi-modal models - - python3 generate/multimodal/audio_language_offline.py --seed 0 - - python3 generate/multimodal/vision_language_offline.py --seed 0 - - python3 generate/multimodal/vision_language_multi_image_offline.py --seed 0 - - python3 generate/multimodal/encoder_decoder_multimodal_offline.py --model-type whisper --seed 0 - # Pooling models - - python3 pooling/embed/vision_embedding_offline.py --seed 0 - # Features demo - - python3 features/automatic_prefix_caching/prefix_caching_offline.py - - python3 offline_inference/llm_engine_example.py - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - -#---------------------------------------------------------- mi300 · kernels ----------------------------------------------------------# - -- label: Kernels Attention Test %N # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - parallelism: 2 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - csrc/attention/ - - vllm/v1/attention - - vllm/model_executor/layers/attention - - tests/kernels/attention - - vllm/_aiter_ops.py - - vllm/envs.py - - vllm/platforms/rocm.py - commands: - - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - -- label: Kernels Core Operation Test # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - csrc/ - - tests/kernels/core - - tests/kernels/test_top_k_per_row.py - - tests/kernels/test_concat_mla_q.py - - vllm/model_executor/layers/rotary_embedding/ - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - pytest -v -s kernels/core --ignore=kernels/core/test_minimax_reduce_rms.py kernels/test_concat_mla_q.py kernels/test_top_k_per_row.py - -- label: Kernels MoE Test %N # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - parallelism: 4 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - csrc/quantization/cutlass_w8a8/moe/ - - csrc/moe/ - - tests/kernels/moe - - vllm/model_executor/layers/fused_moe/ - - vllm/distributed/device_communicators/ - - vllm/envs.py - - vllm/config - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - -- label: Kernels Quantization Test %N # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - parallelism: 2 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - csrc/quantization/ - - vllm/model_executor/layers/quantization - - tests/kernels/quantization - - tests/kernels/quantization/test_rocm_skinny_gemms.py - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - - vllm/model_executor/kernels/ - commands: - - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - -- label: Kernels FP8 MoE Test (2xH100-2xMI300) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_2 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - csrc/moe/ - - csrc/quantization/w8a8/cutlass/moe/ - - vllm/model_executor/layers/fused_moe/ - - tests/kernels/moe/test_deepep_moe.py - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - - vllm/envs.py - commands: - - pytest -v -s kernels/moe/test_deepep_moe.py - -#----------------------------------------------------------- mi300 · lora ------------------------------------------------------------# - -- label: LoRA TP (Distributed) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_4 - num_gpus: 4 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/lora - - tests/lora - - vllm/platforms/rocm.py - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True - - pytest -v -s -x lora/test_chatglm3_tp.py - - pytest -v -s -x lora/test_llama_tp.py - - pytest -v -s -x lora/test_llm_with_multi_loras.py - - pytest -v -s -x lora/test_olmoe_tp.py - - pytest -v -s -x lora/test_gptoss_tp.py - - pytest -v -s -x lora/test_qwen35_densemodel_lora.py - -#---------------------------------------------------------- mi300 · models -----------------------------------------------------------# - -- label: Language Models Test (Extended Pooling) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/language/pooling - commands: - - pytest -v -s models/language/pooling -m 'not core_model' - -- label: Language Models Tests (Standard) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - optional: true - torch_nightly: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/language - commands: - - pip freeze | grep -E 'torch' - - pytest -v -s models/language -m 'core_model and (not slow_test)' - -- label: Multi-Modal Models (Extended Generation 1) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/multimodal/generation - - tests/models/multimodal/test_mapping.py - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py - - pytest -v -s models/multimodal/test_mapping.py - -- label: Multi-Modal Models (Extended Generation 2) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/multimodal/generation - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' - -- label: Multi-Modal Models (Extended Generation 3) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/multimodal/generation - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' - -- label: "Multi-Modal Models (Standard) 1: qwen2" # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - torch_nightly: true - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/multimodal - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2" - - pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model - -- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl" # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - torch_nightly: true - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/multimodal/generation - - tests/models/multimodal/test_mapping.py - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma" - - pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model - -- label: "Multi-Modal Models (Standard) 4: other + whisper" # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - torch_nightly: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/multimodal/generation - - tests/models/multimodal/test_mapping.py - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/generation/test_memory_leak.py --ignore models/multimodal/processing - - pytest -v -s models/multimodal/generation/test_memory_leak.py -m core_model - - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model - -- label: Multi-Modal Processor # 1h 42m - timeout_in_minutes: 138 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/multimodal - - tests/models/registry.py - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing/test_tensor_schema.py - -- label: Multi-Modal Processor (CPU) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - no_gpu: true - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/multimodal - - tests/models/registry.py - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py - -- label: Quantized Models Test # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/model_executor/layers/quantization - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - - tests/models/quantization - - vllm/model_executor/model_loader/ - commands: - - pytest -v -s models/quantization - -- label: Transformers Nightly Models # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - optional: true - working_dir: "/vllm-workspace/" - source_file_dependencies: - - vllm/model_executor/models/ - - vllm/model_executor/model_loader/ - - vllm/multimodal/ - - vllm/model_executor/layers/ - - vllm/v1/attention/backends/ - - vllm/v1/attention/selector.py - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - - tests/models/ - - examples/ - commands: - - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py - - pytest -v -s tests/models/test_transformers.py - - pytest -v -s tests/models/multimodal/processing/ - - pytest -v -s tests/models/multimodal/test_mapping.py - - python3 examples/basic/offline_inference/chat.py - - python3 examples/generate/multimodal/vision_language_offline.py --model-type qwen2_5_vl - - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/generate/multimodal/audio_language_offline.py --model-type whisper - -#------------------------------------------------------- mi300 · quantization --------------------------------------------------------# - -- label: Quantization # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - - tests/quantization - commands: - - # temporary install here since we need nightly, will move to requirements/test.in - # after torchao 0.12 release, and pin a working version of torchao nightly here - - # since torchao nightly is only compatible with torch nightly currently - # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now - # we can only upgrade after this is resolved - # TODO(jerryzh168): resolve the above comment - - uv pip install --system torchao==0.17.0 - - uv pip install --system conch-triton-kernels - - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - -#----------------------------------------------------------- mi300 · rocm ------------------------------------------------------------# - -- label: ROCm AITER Ops Test # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/_aiter_ops.py - - vllm/envs.py - - vllm/platforms/rocm.py - - tests/rocm/aiter/ - - vllm/v1/attention/backends/mla/rocm_aiter_mla.py - - vllm/v1/attention/selector.py - commands: - - pytest -v -s rocm/aiter/ - -#--------------------------------------------------------- mi300 · samplers ----------------------------------------------------------# - -- label: Samplers Test # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/model_executor/layers - - vllm/sampling_metadata.py - - vllm/v1/sample/ - - vllm/beam_search.py - - tests/samplers - - tests/conftest.py - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - pytest -v -s samplers - -#------------------------------------------------------------ mi300 · misc ------------------------------------------------------------# - -- label: Python-only Installation # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - tests/standalone_tests/python_only_compile.sh - - setup.py - - vllm/platforms/rocm.py - commands: - - bash standalone_tests/python_only_compile.sh - -- label: Regression # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/test_regression - commands: - - pip install modelscope - - pytest -v -s test_regression.py - -#--------------------------------------------------------- mi300 · ray_compat ---------------------------------------------------------# - -- label: Ray Dependency Compatibility Check # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - working_dir: "/" - source_file_dependencies: - - requirements/ - - setup.py - - vllm/platforms/rocm.py - commands: - - bash /vllm-workspace/.buildkite/scripts/check-ray-compatibility.sh - -#------------------------------------------------------------ mi300 · v1 -------------------------------------------------------------# - -- label: Acceptance Length Test (Large Models) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/v1/spec_decode/ - - vllm/model_executor/models/mlp_speculator.py - - tests/v1/spec_decode/test_acceptance_length.py - - vllm/platforms/rocm.py - commands: - - export VLLM_ALLOW_INSECURE_SERIALIZATION=1 - - pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test - -- label: e2e Core (1 GPU) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/v1/ - - tests/v1/e2e/ - - vllm/platforms/rocm.py - commands: - - pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py - -- label: e2e Scheduling (1 GPU) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/v1/ - - tests/v1/e2e/general/ - - vllm/platforms/rocm.py - commands: - - pytest -v -s v1/e2e/general/test_async_scheduling.py - -- label: Engine (1 GPU) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/v1/engine/ - - tests/v1/engine/ - - vllm/platforms/rocm.py - commands: - - pytest -v -s v1/engine/test_preprocess_error_handling.py - - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py - -- label: Spec Decode Draft Model # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/v1/spec_decode/ - - vllm/v1/worker/gpu/spec_decode/ - - vllm/model_executor/model_loader/ - - vllm/v1/sample/ - - vllm/model_executor/layers/ - - tests/v1/e2e/spec_decode/ - - vllm/platforms/rocm.py - commands: - - pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference" - -- label: Spec Decode Eagle # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/v1/spec_decode/ - - vllm/v1/worker/gpu/spec_decode/ - - vllm/model_executor/model_loader/ - - vllm/v1/sample/ - - vllm/model_executor/layers/ - - tests/v1/e2e/spec_decode/ - - vllm/platforms/rocm.py - commands: - - pytest -v -s v1/e2e/spec_decode -k "eagle_correctness" - -- label: Spec Decode Ngram + Suffix # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/v1/spec_decode/ - - vllm/v1/worker/gpu/spec_decode/ - - vllm/model_executor/model_loader/ - - vllm/v1/sample/ - - vllm/model_executor/layers/ - - tests/v1/e2e/spec_decode/ - - vllm/platforms/rocm.py - commands: - - pytest -v -s v1/e2e/spec_decode -k "ngram or suffix" - -- label: Spec Decode Speculators + MTP # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/v1/spec_decode/ - - vllm/v1/worker/gpu/spec_decode/ - - vllm/model_executor/model_loader/ - - vllm/v1/sample/ - - vllm/model_executor/layers/ - - vllm/transformers_utils/configs/speculators/ - - tests/v1/e2e/spec_decode/ - - vllm/platforms/rocm.py - commands: - - pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness" - -- label: V1 attention (H100-MI300) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/config/attention.py - - vllm/model_executor/layers/attention - - vllm/v1/attention - - tests/v1/attention - - vllm/_aiter_ops.py - - vllm/envs.py - - vllm/platforms/rocm.py - commands: - - pytest -v -s v1/attention - -- label: V1 Core + KV + Metrics # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/v1/core - - tests/v1/executor - - tests/v1/kv_offload - - tests/v1/worker - - tests/v1/kv_connector/unit - - tests/v1/metrics - - tests/entrypoints/openai/correctness/test_lmeval.py - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - pytest -v -s -m 'not cpu_test' v1/core - - pytest -v -s v1/executor - - pytest -v -s v1/kv_offload - - pytest -v -s v1/worker - - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'not cpu_test' v1/metrics - - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - # - export HSA_NO_SCRATCH_RECLAIM=1 - - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - -- label: V1 others (CPU) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - no_gpu: true - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - - pytest -v -s -m 'cpu_test' v1/core - - pytest -v -s v1/structured_output - - pytest -v -s v1/test_serial_utils.py - - pytest -v -s -m 'cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'cpu_test' v1/metrics - -- label: V1 Sample + Logits # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/v1/sample - - tests/v1/logits_processors - - tests/v1/test_oracle.py - - tests/v1/test_request.py - - tests/v1/test_outputs.py - commands: - - pytest -v -s v1/sample - - pytest -v -s v1/logits_processors - - pytest -v -s v1/test_oracle.py - - pytest -v -s v1/test_request.py - - pytest -v -s v1/test_outputs.py - -- label: Distributed DP Tests (2 GPUs) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_2 - num_gpus: 2 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/worker/worker_base.py - - vllm/v1/engine/ - - vllm/v1/worker/ - - tests/v1/distributed - - tests/entrypoints/openai/test_multi_api_servers.py - - vllm/platforms/rocm.py - commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py - -- label: Distributed Tests (2xH100-2xMI300) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_2 - num_gpus: 2 - working_dir: "/vllm-workspace/" - source_file_dependencies: - - vllm/distributed/ - - vllm/v1/distributed/ - - vllm/model_executor/layers/fused_moe/ - - tests/v1/distributed/test_dbo.py - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py - - VLLM_LOGGING_LEVEL=DEBUG python3 examples/features/data_parallel/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - - pytest -v -s tests/v1/distributed/test_dbo.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 pytest -v -s tests/distributed/test_weight_transfer.py - - pytest -v -s tests/distributed/test_packed_tensor.py - -- label: Metrics, Tracing (2 GPUs) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_2 - num_gpus: 2 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/v1/tracing - commands: - - "pip install \ - 'opentelemetry-sdk>=1.26.0' \ - 'opentelemetry-api>=1.26.0' \ - 'opentelemetry-exporter-otlp>=1.26.0' \ - 'opentelemetry-semantic-conventions-ai>=0.4.1'" - - pytest -v -s v1/tracing - -- label: V1 e2e (2 GPUs) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_2 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/v1/e2e - commands: - - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism" - -- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_4 - num_gpus: 4 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - tests/v1/kv_connector/nixl_integration/ - - vllm/platforms/rocm.py - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - CROSS_LAYERS_BLOCKS=True ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - -- label: Distributed DP Tests (4 GPUs) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_4 - num_gpus: 4 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/ - - tests/v1/distributed - - tests/v1/engine/test_engine_core_client.py - - tests/distributed/test_utils - - vllm/platforms/rocm.py - commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - - pytest -v -s distributed/test_utils.py - -- label: Distributed NixlConnector PD accuracy (4 GPUs) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_4 - num_gpus: 4 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - tests/v1/kv_connector/nixl_integration/ - - vllm/platforms/rocm.py - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - -- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_4 - num_gpus: 4 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - tests/v1/kv_connector/nixl_integration/ - - vllm/platforms/rocm.py - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - -- label: Hyrbid SSM NixlConnector PD accuracy tests (4 GPUs) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_4 - num_gpus: 4 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - tests/v1/kv_connector/nixl_integration/ - - vllm/platforms/rocm.py - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - HYBRID_SSM=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - -- label: V1 e2e (4 GPUs) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_4 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/v1/e2e - commands: - - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy" - -- label: V1 e2e (4xH100-4xMI300) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_4 - optional: true - source_file_dependencies: - - vllm/v1/attention/backends/utils.py - - vllm/v1/worker/gpu_model_runner.py - - tests/v1/e2e/test_hybrid_chunked_prefill.py - commands: - - pytest -v -s v1/e2e/test_hybrid_chunked_prefill.py - -#------------------------------------------------------ mi300 · weight_loading -------------------------------------------------------# - -- label: Weight Loading Multiple GPU # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_2 - num_gpus: 2 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/weight_loading - commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt - -- label: Weight Loading Multiple GPU - Large Models # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_2 - num_gpus: 2 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/weight_loading - commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt - -######################################################################################################################################### -# # -# MI325 (gfx942) tests # -# # -######################################################################################################################################### - -#---------------------------------------------------------- mi325 · compile ----------------------------------------------------------# - -- label: Distributed Compile + RPC Tests (2 GPUs) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] - agent_pool: mi325_2 - num_gpus: 2 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/compilation/ - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/worker/worker_base.py - - vllm/v1/engine/ - - vllm/v1/worker/ - - tests/compile/fullgraph/test_basic_correctness.py - - tests/compile/test_wrapper.py - - tests/entrypoints/llm/test_collective_rpc.py - - vllm/platforms/rocm.py - commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - - pytest -v -s entrypoints/llm/test_collective_rpc.py - - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - - pytest -v -s ./compile/test_wrapper.py - -#-------------------------------------------------------- mi325 · distributed --------------------------------------------------------# - -- label: Distributed Torchrun + Shutdown Tests (2 GPUs) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] - agent_pool: mi325_2 - num_gpus: 2 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/worker/worker_base.py - - vllm/v1/engine/ - - vllm/v1/worker/ - - tests/distributed/ - - tests/v1/shutdown - - tests/v1/worker/test_worker_memory_snapshot.py - - vllm/platforms/rocm.py - commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - - pytest -v -s v1/worker/test_worker_memory_snapshot.py - -- label: Distributed Compile + Comm (4 GPUs) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] - agent_pool: mi325_4 - num_gpus: 4 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/ - - tests/distributed/test_pynccl - - tests/distributed/test_events - - tests/compile/fullgraph/test_basic_correctness.py - - tests/distributed/test_symm_mem_allreduce.py - - tests/distributed/test_multiproc_executor.py - - vllm/platforms/rocm.py - commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - - pytest -v -s compile/fullgraph/test_basic_correctness.py - - pytest -v -s distributed/test_pynccl.py - - pytest -v -s distributed/test_events.py - - pytest -v -s distributed/test_symm_mem_allreduce.py - - pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node - -#---------------------------------------------------------- mi325 · engine -----------------------------------------------------------# - -- label: Engine # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] - agent_pool: mi325_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/engine - - tests/test_sequence - - tests/test_config - - tests/test_logger - - tests/test_vllm_port - commands: - - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py - -#----------------------------------------------------------- mi325 · evals -----------------------------------------------------------# - -- label: LM Eval Large Models (4xH100-4xMI325) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] - agent_pool: mi325_4 - num_gpus: 4 - optional: true - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - - vllm/model_executor/models/ - - vllm/model_executor/model_loader/ - - vllm/v1/attention/backends/ - - vllm/v1/attention/selector.py - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - export VLLM_USE_DEEP_GEMM=0 - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm-fp8.txt --tp-size=4 - -- label: ROCm LM Eval Large Models (8 GPUs) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] - agent_pool: mi325_8 - optional: true - num_gpus: 8 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - vllm/model_executor/models/ - - vllm/model_executor/model_loader/ - - vllm/model_executor/layers/quantization/ - - vllm/v1/attention/backends/ - - vllm/v1/attention/selector.py - - vllm/model_executor/layers/layernorm.py - - csrc/ - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8 +#----------------------------------------------------------- mi300 · evals -----------------------------------------------------------# -#---------------------------------------------------------- mi325 · models -----------------------------------------------------------# -- label: Language Models Test (Extended Generation) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] - agent_pool: mi325_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/language/generation - commands: - - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0' - - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' +#--------------------------------------------------------- mi300 · examples ----------------------------------------------------------# -- label: Language Models Tests (Hybrid) %N # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] - agent_pool: mi325_1 - torch_nightly: true - parallelism: 2 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/language/generation - commands: - - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0' - - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB -- label: Multi-Modal Models (Extended Pooling) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] - agent_pool: mi325_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/multimodal/pooling - commands: - - pytest -v -s models/multimodal/pooling -m 'not core_model' +#---------------------------------------------------------- mi300 · kernels ----------------------------------------------------------# -- label: "Multi-Modal Models (Standard) 2: qwen3 + gemma" # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] - agent_pool: mi325_1 - torch_nightly: true - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/multimodal - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen3 or gemma" - - pytest -v -s models/multimodal/generation/test_qwen2_5_vl.py -m core_model -#------------------------------------------------------------ mi325 · v1 -------------------------------------------------------------# +#----------------------------------------------------------- mi300 · lora ------------------------------------------------------------# -- label: V1 Spec Decode # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] - agent_pool: mi325_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/v1/spec_decode - commands: - - pytest -v -s -m 'not slow_test' v1/spec_decode -######################################################################################################################################### -# # -# MI355 (gfx950) tests # -# # -######################################################################################################################################### +#---------------------------------------------------------- mi300 · models -----------------------------------------------------------# -#-------------------------------------------------------- mi355 · benchmarks ---------------------------------------------------------# -- label: Attention Benchmarks Smoke Test (B200-MI355) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_2 - num_gpus: 2 - working_dir: "/vllm-workspace/" - source_file_dependencies: - - benchmarks/attention_benchmarks/ - - vllm/v1/attention/ - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - python3 benchmarks/attention_benchmarks/benchmark.py --backends ROCM_ATTN ROCM_AITER_FA ROCM_AITER_UNIFIED_ATTN --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1 +#------------------------------------------------------- mi300 · quantization --------------------------------------------------------# -#-------------------------------------------------------- mi355 · distributed --------------------------------------------------------# -- label: Distributed Tests (2xH100-2xMI355) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_2 - num_gpus: 2 - optional: true - working_dir: "/vllm-workspace/" - source_file_dependencies: - - vllm/distributed/ - - vllm/v1/distributed/ - - vllm/model_executor/layers/fused_moe/ - - vllm/v1/attention/backends/ - - vllm/v1/attention/selector.py - - tests/distributed/test_context_parallel.py - - tests/v1/distributed/test_dbo.py - - examples/features/data_parallel/data_parallel_offline.py - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - - pytest -v -s tests/distributed/test_context_parallel.py - - pytest -v -s tests/v1/distributed/test_dbo.py +#----------------------------------------------------------- mi300 · rocm ------------------------------------------------------------# -#-------------------------------------------------------- mi355 · entrypoints --------------------------------------------------------# -- label: Entrypoints Integration (API Server 2) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - optional: true - fast_check: true - torch_nightly: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/entrypoints/rpc - - tests/entrypoints/serve/instrumentator - - tests/tool_use - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/serve/instrumentator - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - - pytest -v -s tool_use +#--------------------------------------------------------- mi300 · samplers ----------------------------------------------------------# -- label: Entrypoints Integration (API Server openai - Part 1) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - fast_check: true - torch_nightly: true - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/entrypoints/openai - - tests/entrypoints/test_chat_utils - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py -- label: Entrypoints Integration (API Server openai - Part 2) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - fast_check: true - torch_nightly: true - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/entrypoints/openai - - tests/entrypoints/test_chat_utils - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py - - pytest -v -s entrypoints/openai/speech_to_text/ - - pytest -v -s entrypoints/test_chat_utils.py +#------------------------------------------------------------ mi300 · misc ------------------------------------------------------------# -- label: Entrypoints Integration (API Server openai - Part 3) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - fast_check: true - torch_nightly: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/entrypoints/openai - - tests/entrypoints/test_chat_utils - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion --ignore=entrypoints/openai/completion --ignore=entrypoints/openai/speech_to_text/ --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py -- label: Entrypoints Integration (Pooling) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - fast_check: true - torch_nightly: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/entrypoints/pooling - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/pooling +#--------------------------------------------------------- mi300 · ray_compat ---------------------------------------------------------# -#----------------------------------------------------------- mi355 · evals -----------------------------------------------------------# -- label: GPQA Eval (GPT-OSS) (2xB200-2xMI355) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_2 - num_gpus: 2 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - - vllm/model_executor/models/ - - vllm/model_executor/model_loader/ - - vllm/v1/attention/backends/ - - vllm/v1/attention/selector.py - - vllm/model_executor/layers/fused_moe/ - - tests/evals/gpt_oss/ - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - uv pip install --system 'gpt-oss[eval]==0.0.5' - - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-gfx950.txt +#------------------------------------------------------------ mi300 · v1 -------------------------------------------------------------# -- label: LM Eval Qwen3-5 Models (B200-MI355) # TBD - timeout_in_minutes: 120 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_2 - num_gpus: 2 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/model_executor/models/qwen3_5.py - - vllm/model_executor/models/qwen3_5_mtp.py - - vllm/transformers_utils/configs/qwen3_5.py - - vllm/transformers_utils/configs/qwen3_5_moe.py - - vllm/model_executor/models/qwen.py - - vllm/model_executor/models/qwen2.py - - vllm/model_executor/models/qwen3.py - - vllm/model_executor/models/qwen3_next.py - - vllm/model_executor/models/qwen3_next_mtp.py - - vllm/model_executor/layers/fla/ops/ - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-qwen35-mi355.txt -- label: LM Eval Small Models (2xB200-2xMI355) # TBD +- label: Distributed DP Tests (2 GPUs) # TBD timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_2 - optional: true + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_2 + num_gpus: 2 working_dir: "/vllm-workspace/tests" source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - - vllm/model_executor/models/ - - vllm/model_executor/model_loader/ - - vllm/v1/attention/backends/ - - vllm/v1/attention/selector.py - - vllm/_aiter_ops.py + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/worker/worker_base.py + - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/v1/distributed + - tests/entrypoints/openai/test_multi_api_servers.py - vllm/platforms/rocm.py commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8-and-mixed.txt + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py -- label: Qwen3-30B-A3B-FP8-block Accuracy (B200-MI355) # TBD +- label: Distributed Tests (2xH100-2xMI300) # TBD timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_2 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_2 num_gpus: 2 - working_dir: "/vllm-workspace" + working_dir: "/vllm-workspace/" source_file_dependencies: - - vllm/model_executor/models/ - - vllm/model_executor/model_loader/ - - vllm/model_executor/layers/quantization/ + - vllm/distributed/ + - vllm/v1/distributed/ - vllm/model_executor/layers/fused_moe/ - - vllm/distributed/eplb - - vllm/v1/attention/backends/ - - vllm/v1/attention/selector.py - - .buildkite/scripts/scheduled_integration_test/ + - tests/v1/distributed/test_dbo.py - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py + - VLLM_LOGGING_LEVEL=DEBUG python3 examples/features/data_parallel/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput + - pytest -v -s tests/v1/distributed/test_dbo.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 pytest -v -s tests/distributed/test_weight_transfer.py + - pytest -v -s tests/distributed/test_packed_tensor.py + -- label: LM Eval Large Models (4xH100-4xMI355) # TBD +- label: Distributed DP Tests (4 GPUs) # TBD timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_4 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_4 num_gpus: 4 - optional: true - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - - vllm/model_executor/models/ - - vllm/model_executor/model_loader/ - - vllm/v1/attention/backends/ - - vllm/v1/attention/selector.py - - vllm/_aiter_ops.py + - vllm/distributed/ + - tests/v1/distributed + - tests/v1/engine/test_engine_core_client.py + - tests/distributed/test_utils - vllm/platforms/rocm.py commands: - - export VLLM_USE_DEEP_GEMM=0 - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm-fp8.txt --tp-size=4 + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp + - pytest -v -s distributed/test_utils.py -#--------------------------------------------------------- mi355 · examples ----------------------------------------------------------# -- label: Examples # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - working_dir: "/vllm-workspace/examples" - source_file_dependencies: - - vllm/entrypoints - - vllm/multimodal - - examples/ - - vllm/platforms/rocm.py - commands: - - pip install tensorizer - # Basic - - python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN - - python3 basic/offline_inference/generate.py --model facebook/opt-125m - - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 basic/offline_inference/classify.py - - python3 basic/offline_inference/embed.py - - python3 basic/offline_inference/score.py - # Multi-modal models - - python3 generate/multimodal/audio_language_offline.py --seed 0 - - python3 generate/multimodal/vision_language_offline.py --seed 0 - - python3 generate/multimodal/vision_language_multi_image_offline.py --seed 0 - - python3 generate/multimodal/encoder_decoder_multimodal_offline.py --model-type whisper --seed 0 - # Pooling models - - python3 pooling/embed/vision_embedding_offline.py --seed 0 - # Features demo - - python3 features/automatic_prefix_caching/prefix_caching_offline.py - - python3 offline_inference/llm_engine_example.py - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 +#------------------------------------------------------ mi300 · weight_loading -------------------------------------------------------# -#---------------------------------------------------------- mi355 · kernels ----------------------------------------------------------# -- label: Kernels (B200-MI355) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - working_dir: "/vllm-workspace/" - source_file_dependencies: - - csrc/quantization/fp4/ - - csrc/attention/mla/ - - csrc/quantization/cutlass_w8a8/moe/ - - vllm/model_executor/layers/fused_moe/cutlass_moe.py - - vllm/v1/attention/backends/triton_attn.py - - vllm/v1/attention/backends/rocm_attn.py - - vllm/v1/attention/backends/rocm_aiter_fa.py - - vllm/v1/attention/backends/rocm_aiter_unified_attn.py - - vllm/v1/attention/backends/mla/aiter_triton_mla.py - - vllm/v1/attention/backends/mla/rocm_aiter_mla.py - - vllm/v1/attention/selector.py - - vllm/platforms/rocm.py - - vllm/_aiter_ops.py - commands: - - rocm-smi - - python3 examples/basic/offline_inference/chat.py - - pytest -v -s tests/kernels/attention/test_attention_selector.py +######################################################################################################################################### +# # +# MI325 (gfx942) tests # +# # +######################################################################################################################################### + +#---------------------------------------------------------- mi325 · compile ----------------------------------------------------------# -- label: Kernels Attention Test %N # TBD +- label: Distributed Compile + RPC Tests (2 GPUs) # TBD timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - parallelism: 2 - optional: true + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_2 + num_gpus: 2 working_dir: "/vllm-workspace/tests" source_file_dependencies: - - csrc/attention/ - - vllm/v1/attention - - vllm/model_executor/layers/attention - - tests/kernels/attention - - vllm/_aiter_ops.py - - vllm/envs.py + - vllm/compilation/ + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/worker/worker_base.py + - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/compile/fullgraph/test_basic_correctness.py + - tests/compile/test_wrapper.py + - tests/entrypoints/llm/test_collective_rpc.py - vllm/platforms/rocm.py commands: - - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + - pytest -v -s entrypoints/llm/test_collective_rpc.py + - pytest -v -s ./compile/fullgraph/test_basic_correctness.py + - pytest -v -s ./compile/test_wrapper.py + +#-------------------------------------------------------- mi325 · distributed --------------------------------------------------------# -- label: Kernels MoE Test %N # TBD +- label: Distributed Torchrun + Shutdown Tests (2 GPUs) # TBD timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - parallelism: 4 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_2 + num_gpus: 2 working_dir: "/vllm-workspace/tests" source_file_dependencies: - - csrc/quantization/cutlass_w8a8/moe/ - - csrc/moe/ - - tests/kernels/moe - - vllm/model_executor/layers/fused_moe/ - - vllm/distributed/device_communicators/ - - vllm/envs.py - - vllm/config - - vllm/_aiter_ops.py + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/worker/worker_base.py + - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/distributed/ + - tests/v1/shutdown + - tests/v1/worker/test_worker_memory_snapshot.py - vllm/platforms/rocm.py commands: - - pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown + - pytest -v -s v1/worker/test_worker_memory_snapshot.py -- label: Kernels Quantization Test %N # TBD +- label: Distributed Compile + Comm (4 GPUs) # TBD timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - parallelism: 2 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_4 + num_gpus: 4 working_dir: "/vllm-workspace/tests" source_file_dependencies: - - csrc/quantization/ - - vllm/model_executor/layers/quantization - - tests/kernels/quantization - - tests/kernels/quantization/test_rocm_skinny_gemms.py - - vllm/_aiter_ops.py + - vllm/distributed/ + - tests/distributed/test_pynccl + - tests/distributed/test_events + - tests/compile/fullgraph/test_basic_correctness.py + - tests/distributed/test_symm_mem_allreduce.py + - tests/distributed/test_multiproc_executor.py - vllm/platforms/rocm.py - - vllm/model_executor/kernels/ commands: - - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + - pytest -v -s compile/fullgraph/test_basic_correctness.py + - pytest -v -s distributed/test_pynccl.py + - pytest -v -s distributed/test_events.py + - pytest -v -s distributed/test_symm_mem_allreduce.py + - pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node + +#---------------------------------------------------------- mi325 · engine -----------------------------------------------------------# + + +#----------------------------------------------------------- mi325 · evals -----------------------------------------------------------# + + +#---------------------------------------------------------- mi325 · models -----------------------------------------------------------# + + +#------------------------------------------------------------ mi325 · v1 -------------------------------------------------------------# + + +######################################################################################################################################### +# # +# MI355 (gfx950) tests # +# # +######################################################################################################################################### -- label: Kernels FP8 MoE Test (2xH100-2xMI355) # TBD +#-------------------------------------------------------- mi355 · benchmarks ---------------------------------------------------------# + + +#-------------------------------------------------------- mi355 · distributed --------------------------------------------------------# + +- label: Distributed Tests (2xH100-2xMI355) # TBD timeout_in_minutes: 180 mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_2 - working_dir: "/vllm-workspace/tests" + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/" source_file_dependencies: - - csrc/moe/ - - csrc/quantization/w8a8/cutlass/moe/ + - vllm/distributed/ + - vllm/v1/distributed/ - vllm/model_executor/layers/fused_moe/ - - tests/kernels/moe/test_deepep_moe.py + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - tests/distributed/test_context_parallel.py + - tests/v1/distributed/test_dbo.py + - examples/features/data_parallel/data_parallel_offline.py - vllm/_aiter_ops.py - vllm/platforms/rocm.py - - vllm/envs.py commands: - - pytest -v -s kernels/moe/test_deepep_moe.py - -#---------------------------------------------------------- mi355 · models -----------------------------------------------------------# + - pytest -v -s tests/distributed/test_context_parallel.py + - pytest -v -s tests/v1/distributed/test_dbo.py -- label: Language Models Test (Extended Generation) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/language/generation - commands: - - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0' - - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' +#-------------------------------------------------------- mi355 · entrypoints --------------------------------------------------------# -- label: Language Models Test (Extended Pooling) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/language/pooling - commands: - - pytest -v -s models/language/pooling -m 'not core_model' -- label: Language Models Test (PPL) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/model_executor/models/qwen3_5.py - - vllm/model_executor/models/qwen3_5_mtp.py - - vllm/transformers_utils/configs/qwen3_5.py - - vllm/transformers_utils/configs/qwen3_5_moe.py - - vllm/model_executor/models/qwen.py - - vllm/model_executor/models/qwen2.py - - vllm/model_executor/models/qwen3.py - - vllm/model_executor/models/qwen3_next.py - - vllm/model_executor/models/qwen3_next_mtp.py - - vllm/model_executor/layers/fla/ops/ - - vllm/_aiter_ops.py - - vllm/v1/attention/backends/triton_attn.py - - vllm/v1/attention/backends/rocm_attn.py - - vllm/v1/attention/backends/rocm_aiter_unified_attn.py - - vllm/v1/attention/backends/rocm_aiter_fa.py - - vllm/v1/attention/backends/flex_attention.py - - vllm/v1/attention/ops/ - - vllm/platforms/rocm.py - - tests/models/language/generation_ppl_test - commands: - - pytest -v -s models/language/generation_ppl_test +#----------------------------------------------------------- mi355 · evals -----------------------------------------------------------# -- label: Language Models Tests (Standard) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - torch_nightly: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/language - commands: - - pip freeze | grep -E 'torch' - - pytest -v -s models/language -m 'core_model and (not slow_test)' -- label: Multi-Modal Models (Extended Generation 1) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/multimodal/generation - - tests/models/multimodal/test_mapping.py - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py - - pytest -v -s models/multimodal/test_mapping.py +#--------------------------------------------------------- mi355 · examples ----------------------------------------------------------# -- label: Multi-Modal Models (Extended Generation 3) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/multimodal/generation - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' -- label: Multi-Modal Models (Extended Pooling) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/multimodal/pooling - commands: - - pytest -v -s models/multimodal/pooling -m 'not core_model' +#---------------------------------------------------------- mi355 · kernels ----------------------------------------------------------# -- label: "Multi-Modal Models (Standard) 1: qwen2" # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - torch_nightly: true - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/multimodal - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2" - - pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model -- label: "Multi-Modal Models (Standard) 4: other + whisper" # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - torch_nightly: true - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/models/multimodal/generation - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/generation/test_memory_leak.py --ignore models/multimodal/processing - - pytest -v -s models/multimodal/generation/test_memory_leak.py -m core_model - - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model +#---------------------------------------------------------- mi355 · models -----------------------------------------------------------# -- label: Quantized Models Test # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/model_executor/layers/quantization - - tests/models/quantization - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - - vllm/model_executor/model_loader/ - commands: - - pytest -v -s models/quantization #------------------------------------------------------- mi355 · quantization --------------------------------------------------------# -- label: Quantization # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - - tests/quantization - - vllm/_aiter_ops.py - - vllm/platforms/rocm.py - commands: - - uv pip install --system torchao==0.17.0 - - uv pip install --system conch-triton-kernels - - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py # - label: Quantized MoE Test (B200-MI355) # TBD # timeout_in_minutes: 180 @@ -3246,163 +647,9 @@ steps: #------------------------------------------------------------ mi355 · v1 -------------------------------------------------------------# -- label: V1 attention (B200-MI355) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/config/attention.py - - vllm/model_executor/layers/attention - - vllm/v1/attention - - tests/v1/attention - - vllm/_aiter_ops.py - - vllm/envs.py - - vllm/platforms/rocm.py - commands: - - pytest -v -s v1/attention - -- label: V1 Core + KV + Metrics # TBD - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/v1/core - - tests/v1/executor - - tests/v1/kv_offload - - tests/v1/worker - - tests/v1/kv_connector/unit - - tests/v1/metrics - - tests/entrypoints/openai/correctness/test_lmeval.py - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - pytest -v -s -m 'not cpu_test' v1/core - - pytest -v -s v1/executor - - pytest -v -s v1/kv_offload - - pytest -v -s v1/worker - - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'not cpu_test' v1/metrics - - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - -- label: V1 Sample + Logits # TBD - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/v1/sample - - tests/v1/logits_processors - - tests/v1/test_oracle.py - - tests/v1/test_request.py - - tests/v1/test_outputs.py - commands: - - pytest -v -s v1/sample - - pytest -v -s v1/logits_processors - - pytest -v -s v1/test_oracle.py - - pytest -v -s v1/test_request.py - - pytest -v -s v1/test_outputs.py - -- label: V1 Spec Decode # TBD - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/v1/spec_decode - commands: - - pytest -v -s -m 'not slow_test' v1/spec_decode - -- label: NixlConnector PD + Spec Decode acceptance (2 GPUs) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_2 - num_gpus: 2 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - vllm/v1/worker/kv_connector_model_runner_mixin.py - - tests/v1/kv_connector/nixl_integration/ - - vllm/platforms/rocm.py - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh - -- label: Distributed NixlConnector PD accuracy (4 GPUs) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_4 - num_gpus: 4 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - tests/v1/kv_connector/nixl_integration/ - - vllm/platforms/rocm.py - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - -- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs) # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_4 - num_gpus: 4 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - tests/v1/kv_connector/nixl_integration/ - - vllm/platforms/rocm.py - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh #------------------------------------------------------ mi355 · weight_loading -------------------------------------------------------# -- label: Weight Loading Multiple GPU # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_2 - num_gpus: 2 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/weight_loading - commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt - -- label: Weight Loading Multiple GPU - Large Models # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_2 - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - optional: true - source_file_dependencies: - - vllm/ - - tests/weight_loading - commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt #----------------------------------------------------------- mi355 · misc ------------------------------------------------------------# -- label: Regression # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] - agent_pool: mi355_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/test_regression - commands: - - pip install modelscope - - pytest -v -s test_regression.py From 57894bcd09ee48288ccc9ef6d61137af00debc5f Mon Sep 17 00:00:00 2001 From: Micah Williamson Date: Wed, 6 May 2026 16:28:21 +0000 Subject: [PATCH 2/3] Revert "check distributed test groups without TORCH_NCCL_BLOCKING_WAIT=1" This reverts commit c219c11096cb119690e9d375f33e33596b89e012. Signed-off-by: Micah Williamson --- .buildkite/test-amd.yaml | 3083 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 2918 insertions(+), 165 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index a6afcd5bac8d..5b3eb4f79c5d 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -114,12 +114,101 @@ steps: #----------------------------------------------------- mi250 · basic_correctness -----------------------------------------------------# +- label: Distributed Model Tests (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_2 + num_gpus: 2 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/model_executor/model_loader/sharded_state_loader.py + - vllm/model_executor/models/ + - vllm/model_executor/layers/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - tests/basic_correctness/ + - tests/model_executor/model_loader/test_sharded_state_loader.py + - tests/models/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py -m '(not slow_test)' + - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/language -v -s -m 'distributed(num_gpus=2)' + - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py + - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' #-------------------------------------------------------- mi250 · benchmarks ---------------------------------------------------------# +- label: Benchmarks # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + working_dir: "/vllm-workspace/.buildkite" + source_file_dependencies: + - benchmarks/ + - vllm/platforms/rocm.py + commands: + - bash scripts/run-benchmarks.sh #---------------------------------------------------------- mi250 · compile ----------------------------------------------------------# +- label: PyTorch Compilation Unit Tests # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + torch_nightly: true + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/compilation/ + - vllm/model_executor/layers/ + - vllm/v1/worker/ + - vllm/v1/attention/ + - vllm/v1/cudagraph_dispatcher.py + - vllm/config/compilation.py + - csrc/ + - tests/compile + - vllm/platforms/rocm.py + commands: + - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" + +- label: PyTorch Fullgraph # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/compilation/ + - vllm/model_executor/ + - vllm/v1/attention/ + - vllm/config/compilation.py + - csrc/ + - tests/compile + - vllm/platforms/rocm.py + commands: + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + +- label: PyTorch Fullgraph Smoke Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/compilation/ + - vllm/model_executor/ + - vllm/v1/attention/ + - vllm/config/compilation.py + - csrc/ + - tests/compile + - vllm/platforms/rocm.py + commands: + - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" - label: Distributed Compile + RPC Tests (2 GPUs) # TBD timeout_in_minutes: 180 @@ -141,12 +230,28 @@ steps: - tests/entrypoints/llm/test_collective_rpc.py - vllm/platforms/rocm.py commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py #-------------------------------------------------------- mi250 · distributed --------------------------------------------------------# +- label: Distributed Comm Ops # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_2 + num_gpus: 2 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed + - tests/distributed + - vllm/platforms/rocm.py + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py + - pytest -v -s distributed/test_shm_buffer.py + - pytest -v -s distributed/test_shm_storage.py - label: Distributed Torchrun + Shutdown Tests (2 GPUs) # TBD timeout_in_minutes: 180 @@ -167,32 +272,303 @@ steps: - tests/v1/worker/test_worker_memory_snapshot.py - vllm/platforms/rocm.py commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s v1/worker/test_worker_memory_snapshot.py +- label: Elastic EP Scaling Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_4 + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/compilation/ + - tests/distributed/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s distributed/test_elastic_ep.py + +- label: EPLB Execution # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_4 + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_execute.py + - tests/distributed/test_eplb_spec_decode.py + - vllm/platforms/rocm.py + commands: + - pytest -v -s distributed/test_eplb_execute.py + - pytest -v -s distributed/test_eplb_spec_decode.py + +- label: Pipeline + Context Parallelism (4 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_4 + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - vllm/model_executor/layers/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - tests/distributed/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - pytest -v -s distributed/test_pp_cudagraph.py + - pytest -v -s distributed/test_pipeline_parallel.py #---------------------------------------------------------- mi250 · engine -----------------------------------------------------------# +- label: Engine # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/engine + - tests/test_sequence + - tests/test_config + - tests/test_logger + - tests/test_vllm_port + commands: + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py #----------------------------------------------------------- mi250 · evals -----------------------------------------------------------# +- label: Multi-Modal Accuracy Eval (Small Models) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - vllm/multimodal/ + - vllm/inputs/ + - vllm/v1/core/ + - vllm/platforms/rocm.py + - vllm/model_executor/model_loader/ + commands: + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 #--------------------------------------------------------- mi250 · examples ----------------------------------------------------------# +- label: Examples # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/examples" + source_file_dependencies: + - vllm/entrypoints + - vllm/multimodal + - examples/ + - vllm/platforms/rocm.py + commands: + - pip install tensorizer + # Basic + - python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN + - python3 basic/offline_inference/generate.py --model facebook/opt-125m + - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 basic/offline_inference/classify.py + - python3 basic/offline_inference/embed.py + - python3 basic/offline_inference/score.py + # Multi-modal models + - python3 generate/multimodal/audio_language_offline.py --seed 0 + - python3 generate/multimodal/vision_language_offline.py --seed 0 + - python3 generate/multimodal/vision_language_multi_image_offline.py --seed 0 + - python3 generate/multimodal/encoder_decoder_multimodal_offline.py --model-type whisper --seed 0 + # Pooling models + - python3 pooling/embed/vision_embedding_offline.py --seed 0 + # Features demo + - python3 features/automatic_prefix_caching/prefix_caching_offline.py + - python3 offline_inference/llm_engine_example.py + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 #---------------------------------------------------------- mi250 · kernels ----------------------------------------------------------# +- label: Kernels Core Operation Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - csrc/ + - tests/kernels/core + - tests/kernels/test_top_k_per_row.py + - tests/kernels/test_concat_mla_q.py + - vllm/model_executor/layers/rotary_embedding/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - pytest -v -s kernels/core --ignore=kernels/core/test_minimax_reduce_rms.py kernels/test_concat_mla_q.py kernels/test_top_k_per_row.py + +- label: Kernels Helion Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/utils/import_utils.py + - tests/kernels/helion/ + - vllm/platforms/rocm.py + commands: + - pip install helion==1.0.0 + - pytest -v -s kernels/helion/ + +- label: Kernels Mamba Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - csrc/mamba/ + - tests/kernels/mamba + - vllm/model_executor/layers/mamba/ops + - vllm/platforms/rocm.py + commands: + - pytest -v -s kernels/mamba #----------------------------------------------------------- mi250 · lora ------------------------------------------------------------# +- label: LoRA %N # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + parallelism: 4 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/lora + - tests/lora + - vllm/platforms/rocm.py + commands: + - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py --ignore=lora/test_qwen35_densemodel_lora.py #------------------------------------------------------ mi250 · model_executor -------------------------------------------------------# +- label: Model Executor # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/engine/arg_utils.py + - vllm/config/model.py + - vllm/model_executor + - tests/model_executor + - tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s model_executor -m '(not slow_test)' + - pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py #---------------------------------------------------------- mi250 · models -----------------------------------------------------------# +- label: Basic Models Test (Other CPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + no_gpu: true + optional: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/test_utils.py + - tests/models/test_vision.py + commands: + - pytest -v -s models/test_utils.py models/test_vision.py + +- label: Basic Models Tests (Extra Initialization) %N # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + torch_nightly: true + parallelism: 2 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/model_executor/layers/ + - tests/models/test_initialization.py + - tests/models/registry.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB + +- label: Basic Models Tests (Initialization) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/test_initialization.py + - tests/models/registry.py + commands: + - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset + +- label: Basic Models Tests (Other) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/test_terratorch.py + - tests/models/test_transformers.py + - tests/models/test_registry.py + commands: + - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py + +- label: Language Models Test (MTEB) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/language/pooling_mteb_test + commands: + - pytest -v -s models/language/pooling_mteb_test + +- label: Language Models Test (PPL) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/language/generation_ppl_test + commands: + - pytest -v -s models/language/generation_ppl_test - label: Language Models Tests (Extra Standard) %N # TBD timeout_in_minutes: 180 @@ -214,16 +590,50 @@ steps: - vllm/platforms/rocm.py commands: - pip freeze | grep -E 'torch' + - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB +- label: Multi-Modal Models (Extended Generation 2) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal/generation + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' -#---------------------------------------------------------- mi250 · plugins ----------------------------------------------------------# - +- label: Multi-Modal Models (Extended Pooling) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal/pooling + commands: + - pytest -v -s models/multimodal/pooling -m 'not core_model' -#------------------------------------------------------------ mi250 · v1 -------------------------------------------------------------# +- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + torch_nightly: true + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma" + - pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model +#---------------------------------------------------------- mi250 · plugins ----------------------------------------------------------# -- label: Distributed DP Tests (2 GPUs) # TBD +- label: Plugin Tests (2 GPUs) # TBD timeout_in_minutes: 180 mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_2 @@ -231,24 +641,312 @@ steps: optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/worker/worker_base.py - - vllm/v1/engine/ - - vllm/v1/worker/ + - vllm/plugins/ + - tests/plugins/ + - vllm/platforms/rocm.py + commands: + # BEGIN: platform plugin and general plugin tests, all the code in-between runs on dummy platform + - pip install -e ./plugins/vllm_add_dummy_platform + - pytest -v -s plugins_tests/test_platform_plugins.py + - pip uninstall vllm_add_dummy_platform -y + # END: platform plugin tests + # BEGIN: `io_processor` plugins test, all the code in between uses the `prithvi_io_processor` plugin + - pip install -e ./plugins/prithvi_io_processor_plugin + - pytest -v -s plugins_tests/test_io_processor_plugins.py + - pytest -v -s plugins_tests/test_terratorch_io_processor_plugins.py + - pip uninstall prithvi_io_processor_plugin -y + # END: `io_processor` plugins test + # BEGIN: `bge_m3_sparse io_processor` test + - pip install -e ./plugins/bge_m3_sparse_plugin + - pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py + - pip uninstall bge_m3_sparse_plugin -y + # END: `bge_m3_sparse io_processor` test + # BEGIN: `stat_logger` plugins test + - pip install -e ./plugins/vllm_add_dummy_stat_logger + - pytest -v -s plugins_tests/test_stats_logger_plugins.py + - pip uninstall dummy_stat_logger -y + # END: `stat_logger` plugins test + # BEGIN: other tests + - pytest -v -s plugins_tests/test_scheduler_plugins.py + - pip install -e ./plugins/vllm_add_dummy_model + - pytest -v -s distributed/test_distributed_oot.py + - pytest -v -s entrypoints/openai/chat_completion/test_oot_registration.py + - pytest -v -s models/test_oot_registration.py + - pytest -v -s plugins/lora_resolvers + +#------------------------------------------------------------ mi250 · v1 -------------------------------------------------------------# + +- label: Batch Invariance (H100-MI250) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/attention + - vllm/model_executor/layers + - tests/v1/determinism/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pip install pytest-timeout pytest-forked + - pytest -v -s v1/determinism/test_batch_invariance.py + - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py + +- label: Cudagraph # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - tests/v1/cudagraph + - vllm/v1/cudagraph_dispatcher.py + - vllm/config/compilation.py + - vllm/compilation + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py + - pytest -v -s v1/cudagraph/test_cudagraph_mode.py + +- label: e2e Core (1 GPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/ + - tests/v1/e2e/general/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py + +- label: e2e Scheduling (1 GPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/ + - tests/v1/e2e/general/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/general/test_async_scheduling.py + +- label: Engine (1 GPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/ + - tests/v1/engine/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/engine/test_preprocess_error_handling.py + - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py + +- label: Spec Decode Draft Model # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/v1/worker/gpu/spec_decode/ + - vllm/model_executor/model_loader/ + - vllm/v1/sample/ + - vllm/model_executor/layers/ + - tests/v1/e2e/spec_decode/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference" + +- label: Spec Decode Speculators + MTP # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/v1/worker/gpu/spec_decode/ + - vllm/model_executor/model_loader/ + - vllm/v1/sample/ + - vllm/model_executor/layers/ + - vllm/transformers_utils/configs/speculators/ + - tests/v1/e2e/spec_decode/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness" + +- label: V1 attention (H100-MI250) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/config/attention.py + - vllm/model_executor/layers/attention + - vllm/v1/attention + - tests/v1/attention + - vllm/_aiter_ops.py + - vllm/envs.py + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/attention + +- label: V1 Core + KV + Metrics # TBD + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1/core + - tests/v1/executor + - tests/v1/kv_offload + - tests/v1/worker + - tests/v1/kv_connector/unit + - tests/v1/metrics + - tests/entrypoints/openai/correctness/test_lmeval.py + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest -v -s v1/kv_offload + - pytest -v -s v1/worker + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + +- label: V1 Sample + Logits # TBD + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1/sample + - tests/v1/logits_processors + - tests/v1/test_oracle.py + - tests/v1/test_request.py + - tests/v1/test_outputs.py + commands: + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + - pytest -v -s v1/test_outputs.py + +- label: Distributed DP Tests (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_2 + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/worker/worker_base.py + - vllm/v1/engine/ + - vllm/v1/worker/ - tests/v1/distributed - tests/entrypoints/openai/test_multi_api_servers.py - vllm/platforms/rocm.py commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py +- label: NixlConnector PD + Spec Decode acceptance (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_2 + num_gpus: 2 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/v1/worker/kv_connector_model_runner_mixin.py + - tests/v1/kv_connector/nixl_integration/ + - vllm/platforms/rocm.py + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh + +- label: V1 e2e (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_2 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1/e2e + commands: + - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism" + +- label: Distributed NixlConnector PD accuracy (4 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_4 + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + - vllm/platforms/rocm.py + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh #------------------------------------------------------------- mi250 · misc ------------------------------------------------------------# +- label: Async Engine, Inputs, Utils, Worker, Config (CPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + no_gpu: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/test_inputs.py + - tests/test_outputs.py + - tests/test_pooling_params.py + - tests/test_ray_env.py + - tests/multimodal + - tests/renderers + - tests/standalone_tests/lazy_imports.py + - tests/tokenizers_ + - tests/tool_parsers + - tests/transformers_utils + - tests/config + commands: + - python3 standalone_tests/lazy_imports.py + - pytest -v -s test_inputs.py + - pytest -v -s test_outputs.py + - pytest -v -s test_pooling_params.py + - pytest -v -s test_ray_env.py + - pytest -v -s -m 'cpu_test' multimodal + - pytest -v -s renderers + - pytest -v -s tokenizers_ + - pytest -v -s reasoning --ignore=reasoning/test_seedoss_reasoning_parser.py --ignore=reasoning/test_glm4_moe_reasoning_parser.py --ignore=reasoning/test_gemma4_reasoning_parser.py + - pytest -v -s tool_parsers + - pytest -v -s transformers_utils + - pytest -v -s config ######################################################################################################################################### # # @@ -258,21 +956,206 @@ steps: #----------------------------------------------------- mi300 · basic_correctness -----------------------------------------------------# +- label: Basic Correctness # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + fast_check: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/basic_correctness/test_basic_correctness + - tests/basic_correctness/test_cpu_offload + - tests/basic_correctness/test_cumem.py + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s basic_correctness/test_cumem.py + - pytest -v -s basic_correctness/test_basic_correctness.py + - pytest -v -s basic_correctness/test_cpu_offload.py + +- label: Distributed Model Tests (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_2 + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/model_executor/model_loader/sharded_state_loader.py + - vllm/model_executor/models/ + - vllm/model_executor/layers/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - tests/basic_correctness/ + - tests/model_executor/model_loader/test_sharded_state_loader.py + - tests/models/ + commands: + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py -m '(not slow_test)' + - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/language -v -s -m 'distributed(num_gpus=2)' + - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/generation/test_phi4siglip.py + - pytest models/multimodal/generation/test_phi4siglip.py -v -s -m 'distributed(num_gpus=2)' + - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' #-------------------------------------------------------- mi300 · benchmarks ---------------------------------------------------------# +- label: Benchmarks # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + optional: true + working_dir: "/vllm-workspace/.buildkite" + source_file_dependencies: + - benchmarks/ + - vllm/platforms/rocm.py + commands: + - bash scripts/run-benchmarks.sh + +- label: Benchmarks CLI Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/benchmarks/ + commands: + - pytest -v -s benchmarks/ #---------------------------------------------------------- mi300 · compile ----------------------------------------------------------# +- label: Fusion E2E Config Sweep (H100-MI300) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + num_gpus: 1 + working_dir: "/vllm-workspace/" + source_file_dependencies: + - csrc/quantization/ + - vllm/compilation/ + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/attention/attention.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/fusions_e2e/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - rocm-smi + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3" + +- label: Fusion E2E Quick (H100-MI300) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + num_gpus: 1 + working_dir: "/vllm-workspace/" + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/ + - vllm/v1/attention/ + - vllm/compilation/ + - tests/compile/fusions_e2e/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - rocm-smi + # Run all models and attn backends but only Inductor partition and native custom ops + - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" + # Different from CUDA, Qwen requires +rms_norm and +quant_fp8 as rms+quant fusion is only supported on AITER + - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and +rms_norm and +quant_fp8 and qwen3'" + +- label: PyTorch Compilation Passes Unit Tests # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/compile/passes + commands: + - pytest -s -v compile/passes --ignore compile/passes/distributed + +- label: Pytorch Nightly Dependency Override Check # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + optional: true + soft_fail: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - requirements/test/nightly-torch.txt + - vllm/platforms/rocm.py + commands: + - bash standalone_tests/pytorch_nightly_dependency.sh + +- label: Distributed Compile Unit Tests (2xH100-2xMI300) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_2 + num_gpus: 2 + working_dir: "/vllm-workspace/" + source_file_dependencies: + - vllm/compilation/ + - vllm/model_executor/layers + - tests/compile/passes/distributed/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py + - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py + - pytest -v -s tests/compile/passes/distributed/test_tp2_ar_rms.py::test_tp2_ar_rms_fusions #----------------------------------------------------------- mi300 · cuda ------------------------------------------------------------# +- label: Platform Tests (CUDA) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/cuda + commands: + - pytest -v -s cuda/test_cuda_context.py + - pytest -v -s cuda/test_platform_no_cuda_init.py #-------------------------------------------------------- mi300 · detokenizer --------------------------------------------------------# +- label: Async Engine, Inputs, Utils, Worker # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/detokenizer + - tests/multimodal + - tests/utils_ + commands: + - pytest -v -s detokenizer + - pytest -v -s -m 'not cpu_test' multimodal + - pytest -v -s utils_ #-------------------------------------------------------- mi300 · distributed --------------------------------------------------------# +- label: EPLB Algorithm # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_algo.py + - vllm/platforms/rocm.py + commands: + - pytest -v -s distributed/test_eplb_algo.py + - pytest -v -s distributed/test_eplb_utils.py - label: Distributed Tests (2xH100-2xMI250) # TBD timeout_in_minutes: 180 @@ -291,6 +1174,7 @@ steps: - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s tests/distributed/test_context_parallel.py - VLLM_LOGGING_LEVEL=DEBUG python3 examples/features/data_parallel/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization @@ -304,6 +1188,7 @@ steps: source_file_dependencies: - vllm/ commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s distributed/test_custom_all_reduce.py - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' @@ -323,6 +1208,7 @@ steps: - tests/examples/features/data_parallel/data_parallel_offline.py - vllm/platforms/rocm.py commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py @@ -334,6 +1220,22 @@ steps: - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 ../examples/rl/rlhf_nccl.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 ../examples/rl/rlhf_ipc.py +- label: Elastic EP Scaling Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_4 + num_gpus: 4 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/compilation/ + - tests/distributed/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s distributed/test_elastic_ep.py - label: RayExecutorV2 (4 GPUs) # TBD timeout_in_minutes: 180 @@ -352,6 +1254,7 @@ steps: - vllm/platforms/rocm.py commands: - export VLLM_USE_RAY_V2_EXECUTOR_BACKEND=1 + - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s distributed/test_ray_v2_executor.py - pytest -v -s distributed/test_ray_v2_executor_e2e.py - pytest -v -s distributed/test_pipeline_parallel.py -k "ray" @@ -373,251 +1276,1947 @@ steps: - vllm/v1/worker/gpu_worker.py - vllm/platforms/rocm.py commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 - torchrun --nproc-per-node=8 ../examples/features/torchrun/torchrun_dp_example_offline.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep #-------------------------------------------------------- mi300 · entrypoints --------------------------------------------------------# +- label: Entrypoints Integration (API Server 2) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + fast_check: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/rpc + - tests/entrypoints/serve/instrumentator + - tests/tool_use + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/serve/instrumentator + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc + - pytest -v -s tool_use + +- label: Entrypoints Integration (API Server openai - Part 1) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + fast_check: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/openai + - tests/entrypoints/test_chat_utils + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py + +- label: Entrypoints Integration (API Server openai - Part 2) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + fast_check: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/openai + - tests/entrypoints/test_chat_utils + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py + - pytest -v -s entrypoints/openai/speech_to_text/ + - pytest -v -s entrypoints/test_chat_utils.py + +- label: Entrypoints Integration (API Server openai - Part 3) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + fast_check: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/openai + - tests/entrypoints/test_chat_utils + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion --ignore=entrypoints/openai/completion --ignore=entrypoints/openai/speech_to_text/ --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py + +- label: Entrypoints Integration (LLM) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + optional: true + fast_check: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/llm + - tests/entrypoints/offline_mode + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm/test_generate.py + - pytest -v -s entrypoints/offline_mode + +- label: Entrypoints Integration (Pooling) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + fast_check: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/pooling + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/pooling + +- label: Entrypoints Integration (Responses API) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + fast_check: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/openai/responses + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/openai/responses + +- label: Entrypoints Unit Tests # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + fast_check: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/entrypoints + - tests/entrypoints/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s entrypoints/openai/tool_parsers + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + +- label: OpenAI API correctness # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - csrc/ + - vllm/entrypoints/openai/ + - vllm/model_executor/models/whisper.py + - vllm/model_executor/layers/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - vllm/model_executor/model_loader/ + commands: + - bash ../tools/install_torchcodec_rocm.sh || exit 1 + - pytest -s entrypoints/openai/correctness/ + +#----------------------------------------------------------- mi300 · evals -----------------------------------------------------------# + +- label: DeepSeek V2-Lite Prefetch Offload Accuracy (H100-MI300) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + num_gpus: 1 + working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/model_executor/layers/fused_moe/ + - vllm/model_executor/layers/quantization/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/backends/mla/ + - vllm/v1/attention/selector.py + - .buildkite/scripts/scheduled_integration_test/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh 0.25 200 8030 + +- label: LM Eval Small Models # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt + +- label: LM Eval Small Models (MI300) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small-rocm.txt + +- label: GPQA Eval (GPT-OSS) (2xH100-2xMI300) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_2 + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/model_executor/layers/fused_moe/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - tests/evals/gpt_oss/ + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-gfx942.txt + +- label: LM Eval Small Models (2xB200-2xMI300) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_2 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8-and-mixed.txt + +- label: DeepSeek V2-Lite Accuracy (4xH100-4xMI300) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_4 + num_gpus: 4 + optional: true + working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/distributed/eplb + - vllm/model_executor/layers/fused_moe/ + - vllm/model_executor/layers/quantization/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/backends/mla/ + - vllm/v1/attention/selector.py + - .buildkite/scripts/scheduled_integration_test/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 + +- label: LM Eval Large Models (4xA100-4xMI300) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_4 + num_gpus: 4 + optional: true + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 + +- label: Qwen3-30B-A3B-FP8-block Accuracy (4xH100-4xMI300) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_4 + optional: true + working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/model_executor/layers/quantization/ + - vllm/distributed/eplb + - vllm/model_executor/layers/fused_moe/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - .buildkite/scripts/scheduled_integration_test/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 + +- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_4 + num_gpus: 4 + optional: true + working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/v1/spec_decode/ + - vllm/distributed/eplb + - vllm/model_executor/layers/fused_moe/ + - vllm/model_executor/layers/quantization/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - .buildkite/scripts/scheduled_integration_test/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 + +- label: LM Eval Large Models (8xH200-8xMI300) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_8 + optional: true + num_gpus: 8 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/model_executor/layers/quantization/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/model_executor/layers/layernorm.py + - csrc/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - tests/evals/ + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx.txt + +#--------------------------------------------------------- mi300 · examples ----------------------------------------------------------# + +- label: Examples # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + optional: true + working_dir: "/vllm-workspace/examples" + source_file_dependencies: + - vllm/entrypoints + - vllm/multimodal + - examples/ + - vllm/platforms/rocm.py + commands: + - pip install tensorizer + # Basic + - python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN + - python3 basic/offline_inference/generate.py --model facebook/opt-125m + - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 basic/offline_inference/classify.py + - python3 basic/offline_inference/embed.py + - python3 basic/offline_inference/score.py + # Multi-modal models + - python3 generate/multimodal/audio_language_offline.py --seed 0 + - python3 generate/multimodal/vision_language_offline.py --seed 0 + - python3 generate/multimodal/vision_language_multi_image_offline.py --seed 0 + - python3 generate/multimodal/encoder_decoder_multimodal_offline.py --model-type whisper --seed 0 + # Pooling models + - python3 pooling/embed/vision_embedding_offline.py --seed 0 + # Features demo + - python3 features/automatic_prefix_caching/prefix_caching_offline.py + - python3 offline_inference/llm_engine_example.py + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + +#---------------------------------------------------------- mi300 · kernels ----------------------------------------------------------# + +- label: Kernels Attention Test %N # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + parallelism: 2 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - csrc/attention/ + - vllm/v1/attention + - vllm/model_executor/layers/attention + - tests/kernels/attention + - vllm/_aiter_ops.py + - vllm/envs.py + - vllm/platforms/rocm.py + commands: + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + +- label: Kernels Core Operation Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - csrc/ + - tests/kernels/core + - tests/kernels/test_top_k_per_row.py + - tests/kernels/test_concat_mla_q.py + - vllm/model_executor/layers/rotary_embedding/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - pytest -v -s kernels/core --ignore=kernels/core/test_minimax_reduce_rms.py kernels/test_concat_mla_q.py kernels/test_top_k_per_row.py + +- label: Kernels MoE Test %N # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + parallelism: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - csrc/quantization/cutlass_w8a8/moe/ + - csrc/moe/ + - tests/kernels/moe + - vllm/model_executor/layers/fused_moe/ + - vllm/distributed/device_communicators/ + - vllm/envs.py + - vllm/config + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + +- label: Kernels Quantization Test %N # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + parallelism: 2 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/layers/quantization + - tests/kernels/quantization + - tests/kernels/quantization/test_rocm_skinny_gemms.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - vllm/model_executor/kernels/ + commands: + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + +- label: Kernels FP8 MoE Test (2xH100-2xMI300) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_2 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - csrc/moe/ + - csrc/quantization/w8a8/cutlass/moe/ + - vllm/model_executor/layers/fused_moe/ + - tests/kernels/moe/test_deepep_moe.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - vllm/envs.py + commands: + - pytest -v -s kernels/moe/test_deepep_moe.py + +#----------------------------------------------------------- mi300 · lora ------------------------------------------------------------# + +- label: LoRA TP (Distributed) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_4 + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/lora + - tests/lora + - vllm/platforms/rocm.py + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True + - pytest -v -s -x lora/test_chatglm3_tp.py + - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_llm_with_multi_loras.py + - pytest -v -s -x lora/test_olmoe_tp.py + - pytest -v -s -x lora/test_gptoss_tp.py + - pytest -v -s -x lora/test_qwen35_densemodel_lora.py + +#---------------------------------------------------------- mi300 · models -----------------------------------------------------------# + +- label: Language Models Test (Extended Pooling) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/language/pooling + commands: + - pytest -v -s models/language/pooling -m 'not core_model' + +- label: Language Models Tests (Standard) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + optional: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/language + commands: + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and (not slow_test)' + +- label: Multi-Modal Models (Extended Generation 1) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal/generation + - tests/models/multimodal/test_mapping.py + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py + - pytest -v -s models/multimodal/test_mapping.py + +- label: Multi-Modal Models (Extended Generation 2) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal/generation + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' + +- label: Multi-Modal Models (Extended Generation 3) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal/generation + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + +- label: "Multi-Modal Models (Standard) 1: qwen2" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + torch_nightly: true + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2" + - pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model + +- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + torch_nightly: true + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal/generation + - tests/models/multimodal/test_mapping.py + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma" + - pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model + +- label: "Multi-Modal Models (Standard) 4: other + whisper" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal/generation + - tests/models/multimodal/test_mapping.py + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/generation/test_memory_leak.py --ignore models/multimodal/processing + - pytest -v -s models/multimodal/generation/test_memory_leak.py -m core_model + - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model + +- label: Multi-Modal Processor # 1h 42m + timeout_in_minutes: 138 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal + - tests/models/registry.py + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing/test_tensor_schema.py + +- label: Multi-Modal Processor (CPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + no_gpu: true + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal + - tests/models/registry.py + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py + +- label: Quantized Models Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/model_executor/layers/quantization + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - tests/models/quantization + - vllm/model_executor/model_loader/ + commands: + - pytest -v -s models/quantization + +- label: Transformers Nightly Models # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + optional: true + working_dir: "/vllm-workspace/" + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/multimodal/ + - vllm/model_executor/layers/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - tests/models/ + - examples/ + commands: + - pip install --upgrade git+https://github.com/huggingface/transformers + - pytest -v -s tests/models/test_initialization.py + - pytest -v -s tests/models/test_transformers.py + - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/test_mapping.py + - python3 examples/basic/offline_inference/chat.py + - python3 examples/generate/multimodal/vision_language_offline.py --model-type qwen2_5_vl + - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/generate/multimodal/audio_language_offline.py --model-type whisper + +#------------------------------------------------------- mi300 · quantization --------------------------------------------------------# + +- label: Quantization # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - tests/quantization + commands: + + # temporary install here since we need nightly, will move to requirements/test.in + # after torchao 0.12 release, and pin a working version of torchao nightly here + + # since torchao nightly is only compatible with torch nightly currently + # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now + # we can only upgrade after this is resolved + # TODO(jerryzh168): resolve the above comment + - uv pip install --system torchao==0.17.0 + - uv pip install --system conch-triton-kernels + - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py + +#----------------------------------------------------------- mi300 · rocm ------------------------------------------------------------# + +- label: ROCm AITER Ops Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/_aiter_ops.py + - vllm/envs.py + - vllm/platforms/rocm.py + - tests/rocm/aiter/ + - vllm/v1/attention/backends/mla/rocm_aiter_mla.py + - vllm/v1/attention/selector.py + commands: + - pytest -v -s rocm/aiter/ + +#--------------------------------------------------------- mi300 · samplers ----------------------------------------------------------# + +- label: Samplers Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/model_executor/layers + - vllm/sampling_metadata.py + - vllm/v1/sample/ + - vllm/beam_search.py + - tests/samplers + - tests/conftest.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - pytest -v -s samplers + +#------------------------------------------------------------ mi300 · misc ------------------------------------------------------------# + +- label: Python-only Installation # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - tests/standalone_tests/python_only_compile.sh + - setup.py + - vllm/platforms/rocm.py + commands: + - bash standalone_tests/python_only_compile.sh + +- label: Regression # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/test_regression + commands: + - pip install modelscope + - pytest -v -s test_regression.py + +#--------------------------------------------------------- mi300 · ray_compat ---------------------------------------------------------# + +- label: Ray Dependency Compatibility Check # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + working_dir: "/" + source_file_dependencies: + - requirements/ + - setup.py + - vllm/platforms/rocm.py + commands: + - bash /vllm-workspace/.buildkite/scripts/check-ray-compatibility.sh + +#------------------------------------------------------------ mi300 · v1 -------------------------------------------------------------# + +- label: Acceptance Length Test (Large Models) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/model_executor/models/mlp_speculator.py + - tests/v1/spec_decode/test_acceptance_length.py + - vllm/platforms/rocm.py + commands: + - export VLLM_ALLOW_INSECURE_SERIALIZATION=1 + - pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test + +- label: e2e Core (1 GPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/ + - tests/v1/e2e/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py + +- label: e2e Scheduling (1 GPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/ + - tests/v1/e2e/general/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/general/test_async_scheduling.py + +- label: Engine (1 GPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/engine/ + - tests/v1/engine/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/engine/test_preprocess_error_handling.py + - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py + +- label: Spec Decode Draft Model # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/v1/worker/gpu/spec_decode/ + - vllm/model_executor/model_loader/ + - vllm/v1/sample/ + - vllm/model_executor/layers/ + - tests/v1/e2e/spec_decode/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference" + +- label: Spec Decode Eagle # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/v1/worker/gpu/spec_decode/ + - vllm/model_executor/model_loader/ + - vllm/v1/sample/ + - vllm/model_executor/layers/ + - tests/v1/e2e/spec_decode/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/spec_decode -k "eagle_correctness" + +- label: Spec Decode Ngram + Suffix # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/v1/worker/gpu/spec_decode/ + - vllm/model_executor/model_loader/ + - vllm/v1/sample/ + - vllm/model_executor/layers/ + - tests/v1/e2e/spec_decode/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/spec_decode -k "ngram or suffix" + +- label: Spec Decode Speculators + MTP # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/v1/worker/gpu/spec_decode/ + - vllm/model_executor/model_loader/ + - vllm/v1/sample/ + - vllm/model_executor/layers/ + - vllm/transformers_utils/configs/speculators/ + - tests/v1/e2e/spec_decode/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness" + +- label: V1 attention (H100-MI300) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/config/attention.py + - vllm/model_executor/layers/attention + - vllm/v1/attention + - tests/v1/attention + - vllm/_aiter_ops.py + - vllm/envs.py + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/attention + +- label: V1 Core + KV + Metrics # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1/core + - tests/v1/executor + - tests/v1/kv_offload + - tests/v1/worker + - tests/v1/kv_connector/unit + - tests/v1/metrics + - tests/entrypoints/openai/correctness/test_lmeval.py + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest -v -s v1/kv_offload + - pytest -v -s v1/worker + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + # - export HSA_NO_SCRATCH_RECLAIM=1 + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + +- label: V1 others (CPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + no_gpu: true + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s -m 'cpu_test' v1/core + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_serial_utils.py + - pytest -v -s -m 'cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'cpu_test' v1/metrics + +- label: V1 Sample + Logits # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1/sample + - tests/v1/logits_processors + - tests/v1/test_oracle.py + - tests/v1/test_request.py + - tests/v1/test_outputs.py + commands: + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + - pytest -v -s v1/test_outputs.py + +- label: Distributed DP Tests (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_2 + num_gpus: 2 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/worker/worker_base.py + - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/v1/distributed + - tests/entrypoints/openai/test_multi_api_servers.py + - vllm/platforms/rocm.py + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py + +- label: Distributed Tests (2xH100-2xMI300) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_2 + num_gpus: 2 + working_dir: "/vllm-workspace/" + source_file_dependencies: + - vllm/distributed/ + - vllm/v1/distributed/ + - vllm/model_executor/layers/fused_moe/ + - tests/v1/distributed/test_dbo.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py + - VLLM_LOGGING_LEVEL=DEBUG python3 examples/features/data_parallel/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput + - pytest -v -s tests/v1/distributed/test_dbo.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 pytest -v -s tests/distributed/test_weight_transfer.py + - pytest -v -s tests/distributed/test_packed_tensor.py + +- label: Metrics, Tracing (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_2 + num_gpus: 2 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1/tracing + commands: + - "pip install \ + 'opentelemetry-sdk>=1.26.0' \ + 'opentelemetry-api>=1.26.0' \ + 'opentelemetry-exporter-otlp>=1.26.0' \ + 'opentelemetry-semantic-conventions-ai>=0.4.1'" + - pytest -v -s v1/tracing + +- label: V1 e2e (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_2 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1/e2e + commands: + - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism" + +- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_4 + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + - vllm/platforms/rocm.py + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - CROSS_LAYERS_BLOCKS=True ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + +- label: Distributed DP Tests (4 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_4 + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/ + - tests/v1/distributed + - tests/v1/engine/test_engine_core_client.py + - tests/distributed/test_utils + - vllm/platforms/rocm.py + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp + - pytest -v -s distributed/test_utils.py + +- label: Distributed NixlConnector PD accuracy (4 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_4 + num_gpus: 4 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + - vllm/platforms/rocm.py + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + +- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_4 + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + - vllm/platforms/rocm.py + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + +- label: Hyrbid SSM NixlConnector PD accuracy tests (4 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_4 + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + - vllm/platforms/rocm.py + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - HYBRID_SSM=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + +- label: V1 e2e (4 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_4 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1/e2e + commands: + - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy" -#----------------------------------------------------------- mi300 · evals -----------------------------------------------------------# - +- label: V1 e2e (4xH100-4xMI300) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_4 + optional: true + source_file_dependencies: + - vllm/v1/attention/backends/utils.py + - vllm/v1/worker/gpu_model_runner.py + - tests/v1/e2e/test_hybrid_chunked_prefill.py + commands: + - pytest -v -s v1/e2e/test_hybrid_chunked_prefill.py + +#------------------------------------------------------ mi300 · weight_loading -------------------------------------------------------# + +- label: Weight Loading Multiple GPU # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_2 + num_gpus: 2 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt + +- label: Weight Loading Multiple GPU - Large Models # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_2 + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt + +######################################################################################################################################### +# # +# MI325 (gfx942) tests # +# # +######################################################################################################################################### + +#---------------------------------------------------------- mi325 · compile ----------------------------------------------------------# + +- label: Distributed Compile + RPC Tests (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_2 + num_gpus: 2 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/compilation/ + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/worker/worker_base.py + - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/compile/fullgraph/test_basic_correctness.py + - tests/compile/test_wrapper.py + - tests/entrypoints/llm/test_collective_rpc.py + - vllm/platforms/rocm.py + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 + - pytest -v -s entrypoints/llm/test_collective_rpc.py + - pytest -v -s ./compile/fullgraph/test_basic_correctness.py + - pytest -v -s ./compile/test_wrapper.py + +#-------------------------------------------------------- mi325 · distributed --------------------------------------------------------# + +- label: Distributed Torchrun + Shutdown Tests (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_2 + num_gpus: 2 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/worker/worker_base.py + - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/distributed/ + - tests/v1/shutdown + - tests/v1/worker/test_worker_memory_snapshot.py + - vllm/platforms/rocm.py + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown + - pytest -v -s v1/worker/test_worker_memory_snapshot.py + +- label: Distributed Compile + Comm (4 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_4 + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/ + - tests/distributed/test_pynccl + - tests/distributed/test_events + - tests/compile/fullgraph/test_basic_correctness.py + - tests/distributed/test_symm_mem_allreduce.py + - tests/distributed/test_multiproc_executor.py + - vllm/platforms/rocm.py + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 + - pytest -v -s compile/fullgraph/test_basic_correctness.py + - pytest -v -s distributed/test_pynccl.py + - pytest -v -s distributed/test_events.py + - pytest -v -s distributed/test_symm_mem_allreduce.py + - pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node + +#---------------------------------------------------------- mi325 · engine -----------------------------------------------------------# + +- label: Engine # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/engine + - tests/test_sequence + - tests/test_config + - tests/test_logger + - tests/test_vllm_port + commands: + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py + +#----------------------------------------------------------- mi325 · evals -----------------------------------------------------------# + +- label: LM Eval Large Models (4xH100-4xMI325) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_4 + num_gpus: 4 + optional: true + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - export VLLM_USE_DEEP_GEMM=0 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm-fp8.txt --tp-size=4 -#--------------------------------------------------------- mi300 · examples ----------------------------------------------------------# +- label: ROCm LM Eval Large Models (8 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_8 + optional: true + num_gpus: 8 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/model_executor/layers/quantization/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/model_executor/layers/layernorm.py + - csrc/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8 +#---------------------------------------------------------- mi325 · models -----------------------------------------------------------# -#---------------------------------------------------------- mi300 · kernels ----------------------------------------------------------# +- label: Language Models Test (Extended Generation) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' +- label: Language Models Tests (Hybrid) %N # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + torch_nightly: true + parallelism: 2 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0' + - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB -#----------------------------------------------------------- mi300 · lora ------------------------------------------------------------# +- label: Multi-Modal Models (Extended Pooling) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal/pooling + commands: + - pytest -v -s models/multimodal/pooling -m 'not core_model' +- label: "Multi-Modal Models (Standard) 2: qwen3 + gemma" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + torch_nightly: true + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen3 or gemma" + - pytest -v -s models/multimodal/generation/test_qwen2_5_vl.py -m core_model -#---------------------------------------------------------- mi300 · models -----------------------------------------------------------# +#------------------------------------------------------------ mi325 · v1 -------------------------------------------------------------# +- label: V1 Spec Decode # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1/spec_decode + commands: + - pytest -v -s -m 'not slow_test' v1/spec_decode -#------------------------------------------------------- mi300 · quantization --------------------------------------------------------# +######################################################################################################################################### +# # +# MI355 (gfx950) tests # +# # +######################################################################################################################################### +#-------------------------------------------------------- mi355 · benchmarks ---------------------------------------------------------# -#----------------------------------------------------------- mi300 · rocm ------------------------------------------------------------# +- label: Attention Benchmarks Smoke Test (B200-MI355) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_2 + num_gpus: 2 + working_dir: "/vllm-workspace/" + source_file_dependencies: + - benchmarks/attention_benchmarks/ + - vllm/v1/attention/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - python3 benchmarks/attention_benchmarks/benchmark.py --backends ROCM_ATTN ROCM_AITER_FA ROCM_AITER_UNIFIED_ATTN --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1 +#-------------------------------------------------------- mi355 · distributed --------------------------------------------------------# -#--------------------------------------------------------- mi300 · samplers ----------------------------------------------------------# +- label: Distributed Tests (2xH100-2xMI355) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_2 + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/" + source_file_dependencies: + - vllm/distributed/ + - vllm/v1/distributed/ + - vllm/model_executor/layers/fused_moe/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - tests/distributed/test_context_parallel.py + - tests/v1/distributed/test_dbo.py + - examples/features/data_parallel/data_parallel_offline.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 + - pytest -v -s tests/distributed/test_context_parallel.py + - pytest -v -s tests/v1/distributed/test_dbo.py +#-------------------------------------------------------- mi355 · entrypoints --------------------------------------------------------# -#------------------------------------------------------------ mi300 · misc ------------------------------------------------------------# +- label: Entrypoints Integration (API Server 2) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + optional: true + fast_check: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/rpc + - tests/entrypoints/serve/instrumentator + - tests/tool_use + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/serve/instrumentator + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc + - pytest -v -s tool_use +- label: Entrypoints Integration (API Server openai - Part 1) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + fast_check: true + torch_nightly: true + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/openai + - tests/entrypoints/test_chat_utils + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py -#--------------------------------------------------------- mi300 · ray_compat ---------------------------------------------------------# +- label: Entrypoints Integration (API Server openai - Part 2) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + fast_check: true + torch_nightly: true + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/openai + - tests/entrypoints/test_chat_utils + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py + - pytest -v -s entrypoints/openai/speech_to_text/ + - pytest -v -s entrypoints/test_chat_utils.py +- label: Entrypoints Integration (API Server openai - Part 3) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + fast_check: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/openai + - tests/entrypoints/test_chat_utils + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion --ignore=entrypoints/openai/completion --ignore=entrypoints/openai/speech_to_text/ --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py -#------------------------------------------------------------ mi300 · v1 -------------------------------------------------------------# +- label: Entrypoints Integration (Pooling) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + fast_check: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/pooling + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/pooling +#----------------------------------------------------------- mi355 · evals -----------------------------------------------------------# -- label: Distributed DP Tests (2 GPUs) # TBD +- label: GPQA Eval (GPT-OSS) (2xB200-2xMI355) # TBD timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_2 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_2 num_gpus: 2 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/worker/worker_base.py - - vllm/v1/engine/ - - vllm/v1/worker/ - - tests/v1/distributed - - tests/entrypoints/openai/test_multi_api_servers.py + - csrc/ + - vllm/model_executor/layers/quantization + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/model_executor/layers/fused_moe/ + - tests/evals/gpt_oss/ + - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-gfx950.txt -- label: Distributed Tests (2xH100-2xMI300) # TBD +- label: LM Eval Qwen3-5 Models (B200-MI355) # TBD + timeout_in_minutes: 120 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_2 + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/model_executor/models/qwen3_5.py + - vllm/model_executor/models/qwen3_5_mtp.py + - vllm/transformers_utils/configs/qwen3_5.py + - vllm/transformers_utils/configs/qwen3_5_moe.py + - vllm/model_executor/models/qwen.py + - vllm/model_executor/models/qwen2.py + - vllm/model_executor/models/qwen3.py + - vllm/model_executor/models/qwen3_next.py + - vllm/model_executor/models/qwen3_next_mtp.py + - vllm/model_executor/layers/fla/ops/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-qwen35-mi355.txt + +- label: LM Eval Small Models (2xB200-2xMI355) # TBD timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_2 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_2 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8-and-mixed.txt + +- label: Qwen3-30B-A3B-FP8-block Accuracy (B200-MI355) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_2 num_gpus: 2 - working_dir: "/vllm-workspace/" + working_dir: "/vllm-workspace" source_file_dependencies: - - vllm/distributed/ - - vllm/v1/distributed/ + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/model_executor/layers/quantization/ - vllm/model_executor/layers/fused_moe/ - - tests/v1/distributed/test_dbo.py + - vllm/distributed/eplb + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - .buildkite/scripts/scheduled_integration_test/ - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py - - VLLM_LOGGING_LEVEL=DEBUG python3 examples/features/data_parallel/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - - pytest -v -s tests/v1/distributed/test_dbo.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 pytest -v -s tests/distributed/test_weight_transfer.py - - pytest -v -s tests/distributed/test_packed_tensor.py - + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 -- label: Distributed DP Tests (4 GPUs) # TBD +- label: LM Eval Large Models (4xH100-4xMI355) # TBD timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] - agent_pool: mi300_4 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_4 num_gpus: 4 - working_dir: "/vllm-workspace/tests" + optional: true + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" source_file_dependencies: - - vllm/distributed/ - - tests/v1/distributed - - tests/v1/engine/test_engine_core_client.py - - tests/distributed/test_utils + - csrc/ + - vllm/model_executor/layers/quantization + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - - pytest -v -s distributed/test_utils.py + - export VLLM_USE_DEEP_GEMM=0 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm-fp8.txt --tp-size=4 +#--------------------------------------------------------- mi355 · examples ----------------------------------------------------------# -#------------------------------------------------------ mi300 · weight_loading -------------------------------------------------------# - +- label: Examples # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + working_dir: "/vllm-workspace/examples" + source_file_dependencies: + - vllm/entrypoints + - vllm/multimodal + - examples/ + - vllm/platforms/rocm.py + commands: + - pip install tensorizer + # Basic + - python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN + - python3 basic/offline_inference/generate.py --model facebook/opt-125m + - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 basic/offline_inference/classify.py + - python3 basic/offline_inference/embed.py + - python3 basic/offline_inference/score.py + # Multi-modal models + - python3 generate/multimodal/audio_language_offline.py --seed 0 + - python3 generate/multimodal/vision_language_offline.py --seed 0 + - python3 generate/multimodal/vision_language_multi_image_offline.py --seed 0 + - python3 generate/multimodal/encoder_decoder_multimodal_offline.py --model-type whisper --seed 0 + # Pooling models + - python3 pooling/embed/vision_embedding_offline.py --seed 0 + # Features demo + - python3 features/automatic_prefix_caching/prefix_caching_offline.py + - python3 offline_inference/llm_engine_example.py + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 -######################################################################################################################################### -# # -# MI325 (gfx942) tests # -# # -######################################################################################################################################### +#---------------------------------------------------------- mi355 · kernels ----------------------------------------------------------# -#---------------------------------------------------------- mi325 · compile ----------------------------------------------------------# +- label: Kernels (B200-MI355) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + working_dir: "/vllm-workspace/" + source_file_dependencies: + - csrc/quantization/fp4/ + - csrc/attention/mla/ + - csrc/quantization/cutlass_w8a8/moe/ + - vllm/model_executor/layers/fused_moe/cutlass_moe.py + - vllm/v1/attention/backends/triton_attn.py + - vllm/v1/attention/backends/rocm_attn.py + - vllm/v1/attention/backends/rocm_aiter_fa.py + - vllm/v1/attention/backends/rocm_aiter_unified_attn.py + - vllm/v1/attention/backends/mla/aiter_triton_mla.py + - vllm/v1/attention/backends/mla/rocm_aiter_mla.py + - vllm/v1/attention/selector.py + - vllm/platforms/rocm.py + - vllm/_aiter_ops.py + commands: + - rocm-smi + - python3 examples/basic/offline_inference/chat.py + - pytest -v -s tests/kernels/attention/test_attention_selector.py -- label: Distributed Compile + RPC Tests (2 GPUs) # TBD +- label: Kernels Attention Test %N # TBD timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] - agent_pool: mi325_2 - num_gpus: 2 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + parallelism: 2 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/compilation/ - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/worker/worker_base.py - - vllm/v1/engine/ - - vllm/v1/worker/ - - tests/compile/fullgraph/test_basic_correctness.py - - tests/compile/test_wrapper.py - - tests/entrypoints/llm/test_collective_rpc.py + - csrc/attention/ + - vllm/v1/attention + - vllm/model_executor/layers/attention + - tests/kernels/attention + - vllm/_aiter_ops.py + - vllm/envs.py - vllm/platforms/rocm.py commands: - - pytest -v -s entrypoints/llm/test_collective_rpc.py - - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - - pytest -v -s ./compile/test_wrapper.py - -#-------------------------------------------------------- mi325 · distributed --------------------------------------------------------# + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT -- label: Distributed Torchrun + Shutdown Tests (2 GPUs) # TBD +- label: Kernels MoE Test %N # TBD timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] - agent_pool: mi325_2 - num_gpus: 2 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + parallelism: 4 working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/worker/worker_base.py - - vllm/v1/engine/ - - vllm/v1/worker/ - - tests/distributed/ - - tests/v1/shutdown - - tests/v1/worker/test_worker_memory_snapshot.py + - csrc/quantization/cutlass_w8a8/moe/ + - csrc/moe/ + - tests/kernels/moe + - vllm/model_executor/layers/fused_moe/ + - vllm/distributed/device_communicators/ + - vllm/envs.py + - vllm/config + - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - - pytest -v -s v1/worker/test_worker_memory_snapshot.py + - pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT -- label: Distributed Compile + Comm (4 GPUs) # TBD +- label: Kernels Quantization Test %N # TBD timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] - agent_pool: mi325_4 - num_gpus: 4 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + parallelism: 2 working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/distributed/ - - tests/distributed/test_pynccl - - tests/distributed/test_events - - tests/compile/fullgraph/test_basic_correctness.py - - tests/distributed/test_symm_mem_allreduce.py - - tests/distributed/test_multiproc_executor.py + - csrc/quantization/ + - vllm/model_executor/layers/quantization + - tests/kernels/quantization + - tests/kernels/quantization/test_rocm_skinny_gemms.py + - vllm/_aiter_ops.py - vllm/platforms/rocm.py + - vllm/model_executor/kernels/ commands: - - pytest -v -s compile/fullgraph/test_basic_correctness.py - - pytest -v -s distributed/test_pynccl.py - - pytest -v -s distributed/test_events.py - - pytest -v -s distributed/test_symm_mem_allreduce.py - - pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node - -#---------------------------------------------------------- mi325 · engine -----------------------------------------------------------# - - -#----------------------------------------------------------- mi325 · evals -----------------------------------------------------------# - - -#---------------------------------------------------------- mi325 · models -----------------------------------------------------------# - - -#------------------------------------------------------------ mi325 · v1 -------------------------------------------------------------# - - -######################################################################################################################################### -# # -# MI355 (gfx950) tests # -# # -######################################################################################################################################### + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT -#-------------------------------------------------------- mi355 · benchmarks ---------------------------------------------------------# - - -#-------------------------------------------------------- mi355 · distributed --------------------------------------------------------# - -- label: Distributed Tests (2xH100-2xMI355) # TBD +- label: Kernels FP8 MoE Test (2xH100-2xMI355) # TBD timeout_in_minutes: 180 mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_2 - num_gpus: 2 - optional: true - working_dir: "/vllm-workspace/" + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/distributed/ - - vllm/v1/distributed/ + - csrc/moe/ + - csrc/quantization/w8a8/cutlass/moe/ - vllm/model_executor/layers/fused_moe/ - - vllm/v1/attention/backends/ - - vllm/v1/attention/selector.py - - tests/distributed/test_context_parallel.py - - tests/v1/distributed/test_dbo.py - - examples/features/data_parallel/data_parallel_offline.py + - tests/kernels/moe/test_deepep_moe.py - vllm/_aiter_ops.py - vllm/platforms/rocm.py + - vllm/envs.py commands: - - pytest -v -s tests/distributed/test_context_parallel.py - - pytest -v -s tests/v1/distributed/test_dbo.py + - pytest -v -s kernels/moe/test_deepep_moe.py -#-------------------------------------------------------- mi355 · entrypoints --------------------------------------------------------# +#---------------------------------------------------------- mi355 · models -----------------------------------------------------------# + +- label: Language Models Test (Extended Generation) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' +- label: Language Models Test (Extended Pooling) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/language/pooling + commands: + - pytest -v -s models/language/pooling -m 'not core_model' -#----------------------------------------------------------- mi355 · evals -----------------------------------------------------------# +- label: Language Models Test (PPL) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/model_executor/models/qwen3_5.py + - vllm/model_executor/models/qwen3_5_mtp.py + - vllm/transformers_utils/configs/qwen3_5.py + - vllm/transformers_utils/configs/qwen3_5_moe.py + - vllm/model_executor/models/qwen.py + - vllm/model_executor/models/qwen2.py + - vllm/model_executor/models/qwen3.py + - vllm/model_executor/models/qwen3_next.py + - vllm/model_executor/models/qwen3_next_mtp.py + - vllm/model_executor/layers/fla/ops/ + - vllm/_aiter_ops.py + - vllm/v1/attention/backends/triton_attn.py + - vllm/v1/attention/backends/rocm_attn.py + - vllm/v1/attention/backends/rocm_aiter_unified_attn.py + - vllm/v1/attention/backends/rocm_aiter_fa.py + - vllm/v1/attention/backends/flex_attention.py + - vllm/v1/attention/ops/ + - vllm/platforms/rocm.py + - tests/models/language/generation_ppl_test + commands: + - pytest -v -s models/language/generation_ppl_test +- label: Language Models Tests (Standard) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/language + commands: + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and (not slow_test)' -#--------------------------------------------------------- mi355 · examples ----------------------------------------------------------# +- label: Multi-Modal Models (Extended Generation 1) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal/generation + - tests/models/multimodal/test_mapping.py + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py + - pytest -v -s models/multimodal/test_mapping.py +- label: Multi-Modal Models (Extended Generation 3) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal/generation + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' -#---------------------------------------------------------- mi355 · kernels ----------------------------------------------------------# +- label: Multi-Modal Models (Extended Pooling) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal/pooling + commands: + - pytest -v -s models/multimodal/pooling -m 'not core_model' +- label: "Multi-Modal Models (Standard) 1: qwen2" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + torch_nightly: true + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2" + - pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model -#---------------------------------------------------------- mi355 · models -----------------------------------------------------------# +- label: "Multi-Modal Models (Standard) 4: other + whisper" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + torch_nightly: true + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal/generation + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/generation/test_memory_leak.py --ignore models/multimodal/processing + - pytest -v -s models/multimodal/generation/test_memory_leak.py -m core_model + - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model +- label: Quantized Models Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/model_executor/layers/quantization + - tests/models/quantization + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - vllm/model_executor/model_loader/ + commands: + - pytest -v -s models/quantization #------------------------------------------------------- mi355 · quantization --------------------------------------------------------# +- label: Quantization # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - tests/quantization + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - uv pip install --system torchao==0.17.0 + - uv pip install --system conch-triton-kernels + - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py # - label: Quantized MoE Test (B200-MI355) # TBD # timeout_in_minutes: 180 @@ -647,9 +3246,163 @@ steps: #------------------------------------------------------------ mi355 · v1 -------------------------------------------------------------# +- label: V1 attention (B200-MI355) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/config/attention.py + - vllm/model_executor/layers/attention + - vllm/v1/attention + - tests/v1/attention + - vllm/_aiter_ops.py + - vllm/envs.py + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/attention + +- label: V1 Core + KV + Metrics # TBD + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1/core + - tests/v1/executor + - tests/v1/kv_offload + - tests/v1/worker + - tests/v1/kv_connector/unit + - tests/v1/metrics + - tests/entrypoints/openai/correctness/test_lmeval.py + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest -v -s v1/kv_offload + - pytest -v -s v1/worker + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + +- label: V1 Sample + Logits # TBD + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1/sample + - tests/v1/logits_processors + - tests/v1/test_oracle.py + - tests/v1/test_request.py + - tests/v1/test_outputs.py + commands: + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + - pytest -v -s v1/test_outputs.py + +- label: V1 Spec Decode # TBD + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1/spec_decode + commands: + - pytest -v -s -m 'not slow_test' v1/spec_decode + +- label: NixlConnector PD + Spec Decode acceptance (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_2 + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/v1/worker/kv_connector_model_runner_mixin.py + - tests/v1/kv_connector/nixl_integration/ + - vllm/platforms/rocm.py + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh + +- label: Distributed NixlConnector PD accuracy (4 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_4 + num_gpus: 4 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + - vllm/platforms/rocm.py + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + +- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_4 + num_gpus: 4 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + - vllm/platforms/rocm.py + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh #------------------------------------------------------ mi355 · weight_loading -------------------------------------------------------# +- label: Weight Loading Multiple GPU # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_2 + num_gpus: 2 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt + +- label: Weight Loading Multiple GPU - Large Models # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_2 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt #----------------------------------------------------------- mi355 · misc ------------------------------------------------------------# +- label: Regression # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/test_regression + commands: + - pip install modelscope + - pytest -v -s test_regression.py From 0481aca95865329b2ecd459f86159a5ca9fed239 Mon Sep 17 00:00:00 2001 From: Micah Williamson Date: Wed, 6 May 2026 16:31:50 +0000 Subject: [PATCH 3/3] remove TORCH_NCCL_BLOCKING_WAIT=1 since it is no longer needed as of ROCm 7.2 Signed-off-by: Micah Williamson --- .buildkite/test-amd.yaml | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 5b3eb4f79c5d..b0fb7705b7e8 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -230,7 +230,6 @@ steps: - tests/entrypoints/llm/test_collective_rpc.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py @@ -272,7 +271,6 @@ steps: - tests/v1/worker/test_worker_memory_snapshot.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown @@ -590,7 +588,6 @@ steps: - vllm/platforms/rocm.py commands: - pip freeze | grep -E 'torch' - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB - label: Multi-Modal Models (Extended Generation 2) # TBD @@ -865,7 +862,6 @@ steps: - tests/entrypoints/openai/test_multi_api_servers.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py @@ -1174,7 +1170,6 @@ steps: - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s tests/distributed/test_context_parallel.py - VLLM_LOGGING_LEVEL=DEBUG python3 examples/features/data_parallel/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization @@ -1188,7 +1183,6 @@ steps: source_file_dependencies: - vllm/ commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s distributed/test_custom_all_reduce.py - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' @@ -1208,7 +1202,6 @@ steps: - tests/examples/features/data_parallel/data_parallel_offline.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py @@ -1254,7 +1247,6 @@ steps: - vllm/platforms/rocm.py commands: - export VLLM_USE_RAY_V2_EXECUTOR_BACKEND=1 - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s distributed/test_ray_v2_executor.py - pytest -v -s distributed/test_ray_v2_executor_e2e.py - pytest -v -s distributed/test_pipeline_parallel.py -k "ray" @@ -1276,7 +1268,6 @@ steps: - vllm/v1/worker/gpu_worker.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - torchrun --nproc-per-node=8 ../examples/features/torchrun/torchrun_dp_example_offline.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep #-------------------------------------------------------- mi300 · entrypoints --------------------------------------------------------# @@ -2283,7 +2274,6 @@ steps: - tests/entrypoints/openai/test_multi_api_servers.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py @@ -2303,7 +2293,6 @@ steps: - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py - VLLM_LOGGING_LEVEL=DEBUG python3 examples/features/data_parallel/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py @@ -2366,7 +2355,6 @@ steps: - tests/distributed/test_utils - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py @@ -2496,7 +2484,6 @@ steps: - tests/entrypoints/llm/test_collective_rpc.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py @@ -2521,7 +2508,6 @@ steps: - tests/v1/worker/test_worker_memory_snapshot.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown @@ -2542,7 +2528,6 @@ steps: - tests/distributed/test_multiproc_executor.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s compile/fullgraph/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_events.py @@ -2722,7 +2707,6 @@ steps: - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/v1/distributed/test_dbo.py