Skip to content
5 changes: 5 additions & 0 deletions .buildkite/test_areas/basic_correctness.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,8 @@ steps:
- pytest -v -s basic_correctness/test_cumem.py
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
mirror:
amd:
device: mi250_1
depends_on:
- image-build-amd
67 changes: 66 additions & 1 deletion .buildkite/test_areas/distributed.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,27 @@ steps:
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
mirror:
amd:
device: mi250_2
depends_on:
- image-build-amd
commands:
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
# TODO: Remove when the bug is fixed in a future ROCm release
- export TORCH_NCCL_BLOCKING_WAIT=1
# NOTE: The rest is in complete parity with CUDA tests
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s v1/worker/test_worker_memory_snapshot.py

- label: Distributed Torchrun + Examples (4 GPUs)
timeout_in_minutes: 30
Expand Down Expand Up @@ -87,6 +108,40 @@ steps:
- cd new_weight_syncing
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
mirror:
amd:
device: mi250_4
depends_on:
- image-build-amd
commands:
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
# TODO: Remove when the bug is fixed in a future ROCm release
- export TORCH_NCCL_BLOCKING_WAIT=1
# NOTE: The rest is in complete parity with CUDA tests
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
- TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/fullgraph/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py
- cd ../examples/offline_inference
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- cd new_weight_syncing
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py

- label: Distributed DP Tests (4 GPUs)
timeout_in_minutes: 30
Expand Down Expand Up @@ -161,7 +216,7 @@ steps:
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py

- label: Distributed Tests (2 GPUs)(H100)
- label: Distributed Tests (2 GPUs)(H100-MI325)
timeout_in_minutes: 15
device: h100
optional: true
Expand All @@ -172,6 +227,16 @@ steps:
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py
mirror:
amd:
device: mi250_2
depends_on:
- image-build-amd
commands:
- pytest -v -s tests/distributed/test_context_parallel.py
- python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
- VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
- pytest -v -s tests/v1/distributed/test_dbo.py

- label: Distributed Tests (2 GPUs)(B200)
device: b200
Expand Down
17 changes: 16 additions & 1 deletion .buildkite/test_areas/engine.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ steps:
commands:
- pytest -v -s v1/engine/test_preprocess_error_handling.py
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd

- label: e2e Scheduling (1 GPU)
timeout_in_minutes: 30
Expand All @@ -30,6 +35,11 @@ steps:
- tests/v1/e2e/general/
commands:
- pytest -v -s v1/e2e/general/test_async_scheduling.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd

- label: e2e Core (1 GPU)
timeout_in_minutes: 30
Expand All @@ -38,6 +48,11 @@ steps:
- tests/v1/e2e/general/
commands:
- pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd

- label: V1 e2e (2 GPUs)
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
Expand All @@ -51,7 +66,7 @@ steps:
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
mirror:
amd:
device: mi325_2
device: mi250_2
depends_on:
- image-build-amd

Expand Down
7 changes: 6 additions & 1 deletion .buildkite/test_areas/entrypoints.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@ steps:
- pytest -v -s entrypoints/instrumentator
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
- pytest -v -s tool_use
mirror:
amd:
device: mi250_1
depends_on:
- image-build-amd

- label: Entrypoints Integration (Pooling)
timeout_in_minutes: 50
Expand Down Expand Up @@ -84,7 +89,7 @@ steps:
- pytest -v -s v1/entrypoints
mirror:
amd:
device: mi325_1
device: mi250_1
depends_on:
- image-build-amd

Expand Down
6 changes: 5 additions & 1 deletion .buildkite/test_areas/lora.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,11 @@ steps:
commands:
- pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
parallelism: 4

mirror:
amd:
device: mi250_1
depends_on:
- image-build-amd

- label: LoRA TP (Distributed)
timeout_in_minutes: 30
Expand Down
5 changes: 5 additions & 0 deletions .buildkite/test_areas/misc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,11 @@ steps:
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory with PyTorch 2.9+, causing this test to OOM on a 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
mirror:
amd:
device: mi250_1
depends_on:
- image-build-amd

- label: Metrics, Tracing (2 GPUs)
timeout_in_minutes: 20
Expand Down
2 changes: 1 addition & 1 deletion .buildkite/test_areas/models_basic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ steps:
- pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
mirror:
amd:
device: mi325_1
device: mi250_1
depends_on:
- image-build-amd

Expand Down
17 changes: 16 additions & 1 deletion .buildkite/test_areas/models_language.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@ steps:
# Test standard language models, excluding a subset of slow tests
- pip freeze | grep -E 'torch'
- pytest -v -s models/language -m 'core_model and (not slow_test)'
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd

- label: Language Models Tests (Extra Standard) %N
timeout_in_minutes: 45
Expand All @@ -27,6 +32,16 @@ steps:
- pip freeze | grep -E 'torch'
- pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
parallelism: 2
mirror:
amd:
device: mi250_1
depends_on:
- image-build-amd
commands:
- export TORCH_NCCL_BLOCKING_WAIT=1
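# NOTE(review): the export above is presumably the same HIP workaround applied in distributed.yaml
# (https://github.com/ROCm/hip/issues/3876) — confirm it is needed here, and remove it once fixed in ROCm
# NOTE: The rest is in complete parity with CUDA tests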
- pip freeze | grep -E 'torch'
- pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB

- label: Language Models Tests (Hybrid) %N
timeout_in_minutes: 75
Expand Down Expand Up @@ -84,7 +99,7 @@ steps:
- pytest -v -s models/language/pooling -m 'not core_model'
mirror:
amd:
device: mi325_1
device: mi250_1
depends_on:
- image-build-amd

Expand Down
10 changes: 10 additions & 0 deletions .buildkite/test_areas/models_multimodal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,11 @@ steps:
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing/test_tensor_schema.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd

- label: Multi-Modal Accuracy Eval (Small Models) # 50min
timeout_in_minutes: 70
Expand Down Expand Up @@ -117,6 +122,11 @@ steps:
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
mirror:
amd:
device: mi250_1
depends_on:
- image-build-amd

- label: Multi-Modal Models (Extended) 3
optional: true
Expand Down
5 changes: 5 additions & 0 deletions .buildkite/test_areas/plugins.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,8 @@ steps:
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
mirror:
amd:
device: mi250_2
depends_on:
- image-build-amd
5 changes: 5 additions & 0 deletions .buildkite/test_areas/quantization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,8 @@ steps:
- tests/models/quantization
commands:
- pytest -v -s models/quantization
mirror:
amd:
device: mi355_1
depends_on:
- image-build-amd
2 changes: 1 addition & 1 deletion .buildkite/test_areas/samplers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ steps:
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
mirror:
amd:
device: mi325_1
device: mi250_1
depends_on:
- image-build-amd
commands:
Expand Down
Loading