diff --git a/.buildkite/test_areas/basic_correctness.yaml b/.buildkite/test_areas/basic_correctness.yaml index 759d2b535871..4498bebaf47e 100644 --- a/.buildkite/test_areas/basic_correctness.yaml +++ b/.buildkite/test_areas/basic_correctness.yaml @@ -14,3 +14,8 @@ steps: - pytest -v -s basic_correctness/test_cumem.py - pytest -v -s basic_correctness/test_basic_correctness.py - pytest -v -s basic_correctness/test_cpu_offload.py + mirror: + amd: + device: mi250_1 + depends_on: + - image-build-amd diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index f94f831a49e2..4cdd17b798ec 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -49,6 +49,27 @@ steps: - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s v1/worker/test_worker_memory_snapshot.py + mirror: + amd: + device: mi250_2 + depends_on: + - image-build-amd + commands: + # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 + # TODO: Remove when the bug is fixed in a future ROCm release + - export TORCH_NCCL_BLOCKING_WAIT=1 + # NOTE: The rest is in complete parity with CUDA tests + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py + - pytest -v -s entrypoints/llm/test_collective_rpc.py + - pytest -v -s ./compile/fullgraph/test_basic_correctness.py + - pytest -v -s ./compile/test_wrapper.py + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown + - pytest -v -s v1/worker/test_worker_memory_snapshot.py - label: Distributed Torchrun + Examples (4 GPUs) timeout_in_minutes: 30 @@ -87,6 +108,40 @@ steps: - cd new_weight_syncing - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py + mirror: + amd: + device: mi250_4 + depends_on: + - image-build-amd + commands: + # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 + # TODO: Remove when the bug is fixed in a future ROCm release + - export TORCH_NCCL_BLOCKING_WAIT=1 + # NOTE: The rest is in complete parity with CUDA tests + - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - python3 ../examples/offline_inference/data_parallel.py --enforce-eager + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp + - pytest -v -s distributed/test_utils.py + - pytest -v -s compile/fullgraph/test_basic_correctness.py + - pytest -v -s distributed/test_pynccl.py + - pytest -v -s distributed/test_events.py + - pytest -v -s distributed/test_symm_mem_allreduce.py + - cd ../examples/offline_inference + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + - cd new_weight_syncing + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py - label: Distributed DP Tests (4 GPUs) timeout_in_minutes: 30 @@ -161,7 +216,7 @@ steps: - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - pytest -v -s -x lora/test_mixtral.py -- label: Distributed Tests (2 GPUs)(H100) +- label: Distributed Tests (2 GPUs)(H100-MI325) timeout_in_minutes: 15 device: h100 optional: true @@ -172,6 +227,16 @@ steps: - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py + mirror: + amd: + device: mi250_2 + depends_on: + - image-build-amd + commands: + - pytest -v -s tests/distributed/test_context_parallel.py + - python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py + - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization + - pytest -v -s tests/v1/distributed/test_dbo.py - label: Distributed Tests (2 GPUs)(B200) device: b200 diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml index be83bab8fa29..b9c37a383f75 100644 --- a/.buildkite/test_areas/engine.yaml +++ b/.buildkite/test_areas/engine.yaml @@ -22,6 +22,11 @@ steps: commands: - pytest -v -s v1/engine/test_preprocess_error_handling.py - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd - label: e2e Scheduling (1 GPU) timeout_in_minutes: 30 @@ -30,6 +35,11 @@ steps: - tests/v1/e2e/general/ commands: - pytest -v -s v1/e2e/general/test_async_scheduling.py + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd - label: e2e Core (1 GPU) timeout_in_minutes: 30 @@ -38,6 +48,11 @@ steps: - tests/v1/e2e/general/ commands: - pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd - label: V1 e2e (2 GPUs) timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability @@ -51,7 +66,7 @@ steps: - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism" mirror: amd: - device: mi325_2 + device: mi250_2 depends_on: - image-build-amd diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml index 9de9c3fd2dda..1cdf9d4832b0 100644 --- a/.buildkite/test_areas/entrypoints.yaml +++ b/.buildkite/test_areas/entrypoints.yaml @@ -55,6 +55,11 @@ steps: - pytest -v -s entrypoints/instrumentator - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - pytest -v -s tool_use + mirror: + amd: + device: mi250_1 + depends_on: + - image-build-amd - label: Entrypoints Integration (Pooling) timeout_in_minutes: 50 @@ -84,7 +89,7 @@ steps: - pytest -v -s v1/entrypoints mirror: amd: - device: mi325_1 + device: mi250_1 depends_on: - image-build-amd diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml index f034175cc1b8..85af12faacaf 100644 --- a/.buildkite/test_areas/lora.yaml +++ b/.buildkite/test_areas/lora.yaml @@ -10,7 +10,11 @@ steps: commands: - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py parallelism: 4 - + mirror: + amd: + device: mi250_1 + depends_on: + - image-build-amd - label: LoRA TP (Distributed) timeout_in_minutes: 30 diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml index 9280696d13b7..f12ca37adcf9 100644 --- a/.buildkite/test_areas/misc.yaml +++ b/.buildkite/test_areas/misc.yaml @@ -88,6 +88,11 @@ steps: - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + mirror: + amd: + device: mi250_1 + depends_on: + - image-build-amd - label: Metrics, Tracing (2 GPUs) timeout_in_minutes: 20 diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml index c1cc9e9a36e0..b348a687d924 100644 --- a/.buildkite/test_areas/models_basic.yaml +++ b/.buildkite/test_areas/models_basic.yaml @@ -38,7 +38,7 @@ steps: - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py mirror: amd: - device: mi325_1 + device: mi250_1 depends_on: - image-build-amd diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml index a3bd21ccff3c..488b5f7cf986 100644 --- a/.buildkite/test_areas/models_language.yaml +++ b/.buildkite/test_areas/models_language.yaml @@ -12,6 +12,11 @@ steps: # Test standard language models, excluding a subset of slow tests - pip freeze | grep -E 'torch' - pytest -v -s models/language -m 'core_model and (not slow_test)' + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd - label: Language Models Tests (Extra Standard) %N timeout_in_minutes: 45 @@ -27,6 +32,16 @@ steps: - pip freeze | grep -E 'torch' - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB parallelism: 2 + mirror: + amd: + device: mi250_1 + depends_on: + - image-build-amd + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 + # NOTE: The rest is in complete parity with CUDA tests + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB - label: Language Models Tests (Hybrid) %N timeout_in_minutes: 75 @@ -84,7 +99,7 @@ steps: - pytest -v -s models/language/pooling -m 'not core_model' mirror: amd: - device: mi325_1 + device: mi250_1 depends_on: - image-build-amd diff --git a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml index eb10bf6c71c2..13d7aeb4df7e 100644 --- a/.buildkite/test_areas/models_multimodal.yaml +++ b/.buildkite/test_areas/models_multimodal.yaml @@ -84,6 +84,11 @@ steps: commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal/processing/test_tensor_schema.py + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd - label: Multi-Modal Accuracy Eval (Small Models) # 50min timeout_in_minutes: 70 @@ -117,6 +122,11 @@ steps: commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' + mirror: + amd: + device: mi250_1 + depends_on: + - image-build-amd - label: Multi-Modal Models (Extended) 3 optional: true diff --git a/.buildkite/test_areas/plugins.yaml b/.buildkite/test_areas/plugins.yaml index 7e7727fce7df..3b703efd6c65 100644 --- a/.buildkite/test_areas/plugins.yaml +++ b/.buildkite/test_areas/plugins.yaml @@ -39,3 +39,8 @@ steps: - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process - pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins + mirror: + amd: + device: mi250_2 + depends_on: + - image-build-amd diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml index 5ee2e5186966..5465c0bc0fe5 100644 --- a/.buildkite/test_areas/quantization.yaml +++ b/.buildkite/test_areas/quantization.yaml @@ -44,3 +44,8 @@ steps: - tests/models/quantization commands: - pytest -v -s models/quantization + mirror: + amd: + device: mi355_1 + depends_on: + - image-build-amd diff --git a/.buildkite/test_areas/samplers.yaml b/.buildkite/test_areas/samplers.yaml index 2052a379827a..b782f188e220 100644 --- a/.buildkite/test_areas/samplers.yaml +++ b/.buildkite/test_areas/samplers.yaml @@ -14,7 +14,7 @@ steps: - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers mirror: amd: - device: mi325_1 + device: mi250_1 depends_on: - image-build-amd commands: