Commit b97c80d (2 parents: 594196e + 964d65d)

Upd

Signed-off-by: Shu Wang. <[email protected]>

369 files changed, +8786 -2217 lines


.buildkite/scripts/hardware_ci/run-amd-test.sh (5 additions, 5 deletions)

@@ -59,7 +59,7 @@ while true; do
   fi
 done
 
-echo "--- Pulling container"
+echo "--- Pulling container"
 image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 docker pull "${image_name}"
@@ -177,13 +177,13 @@ if [[ -z "$render_gid" ]]; then
   exit 1
 fi
 
-# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
+# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
 if [[ $commands == *"--shard-id="* ]]; then
-  # assign job count as the number of shards used
-  commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
+  # assign job count as the number of shards used
+  commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
   for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
     # assign shard-id for each shard
-    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
+    commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
     echo "Shard ${GPU} commands:$commands_gpu"
     echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
     docker run \
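The change above swaps an exact-string bash substitution, which only matched "--num-shards= " followed by a space, for a sed pattern that also handles an existing numeric value and optional blanks around the "=". A minimal sketch of the new behaviour (not part of the commit, with hypothetical sample values for commands, PARALLEL_JOB_COUNT, and GPU):

#!/usr/bin/env bash
# Illustration only: sample values stand in for what the CI step provides.
PARALLEL_JOB_COUNT=8
GPU=3
commands="pytest -v -s kernels/moe --shard-id=0 --num-shards=2"

# The sed pattern matches the flag, optional blanks around '=', and any old value,
# so "--num-shards=2" and "--num-shards = 2" are both rewritten.
commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')

echo "$commands_gpu"
# prints: pytest -v -s kernels/moe --shard-id=3  --num-shards=8
# (the replacement leaves an extra space behind, which the shell ignores when the command runs)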

.buildkite/scripts/hardware_ci/run-xpu-test.sh (1 addition, 1 deletion)

@@ -46,6 +46,6 @@ docker run \
   pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
   pytest -v -s v1/structured_output
   pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
-  pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
+  pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
   pytest -v -s v1/test_serial_utils.py
 '

.buildkite/test-amd.yaml (153 additions, 10 deletions)

@@ -226,6 +226,27 @@ steps:
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
   - popd
 
+- label: Distributed Tests (8 GPUs) # 4min
+  timeout_in_minutes: 10
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_8
+  # grade: Blocking
+  gpu: h100
+  num_gpus: 8
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - examples/offline_inference/torchrun_dp_example.py
+  - vllm/config/parallel.py
+  - vllm/distributed/
+  - vllm/v1/engine/llm_engine.py
+  - vllm/v1/executor/uniproc_executor.py
+  - vllm/v1/worker/gpu_worker.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  #- export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and dp=4 with ep
+  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
 - label: EPLB Algorithm Test # 5min
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
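One sanity check worth noting for the new 8-GPU step above: the torchrun process count must equal the product of the parallelism degrees requested by the example script. A quick illustrative calculation (not part of the pipeline):

# tp=2, pp=1, dp=4 as passed to torchrun_dp_example.py above
tp=2; pp=1; dp=4
echo $((tp * pp * dp))   # 8, matching --nproc-per-node=8 and num_gpus: 8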
@@ -238,18 +259,19 @@ steps:
   commands:
   - pytest -v -s distributed/test_eplb_algo.py
 
-- label: EPLB Execution Test # 5min
+- label: EPLB Execution Test # 10min
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_4
   # grade: Blocking
-  timeout_in_minutes: 15
+  timeout_in_minutes: 20
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
   - vllm/distributed/eplb
   - tests/distributed/test_eplb_execute.py
   commands:
   - pytest -v -s distributed/test_eplb_execute.py
+  - pytest -v -s distributed/test_eplb_spec_decode.py
 
 - label: Metrics, Tracing Test # 12min
   timeout_in_minutes: 20
@@ -273,7 +295,7 @@ steps:
 
 - label: Regression Test # 7min
   timeout_in_minutes: 20
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
   agent_pool: mi325_1
   grade: Blocking
   source_file_dependencies:
@@ -288,7 +310,7 @@ steps:
   timeout_in_minutes: 40
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
-  #grade: Blocking
+  # grade: Blocking
   source_file_dependencies:
   - vllm/
   - tests/engine
@@ -337,14 +359,15 @@ steps:
   - tests/v1
   commands:
   # split the test to avoid interference
+  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
   - pytest -v -s -m 'not cpu_test' v1/core
   - pytest -v -s v1/executor
   - pytest -v -s v1/kv_offload
   - pytest -v -s v1/sample
   - pytest -v -s v1/logits_processors
   - pytest -v -s v1/worker
   - pytest -v -s v1/spec_decode
-  - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_lmcache_integration.py
+  - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
   - pytest -v -s -m 'not cpu_test' v1/metrics
   - pytest -v -s v1/test_oracle.py
   - pytest -v -s v1/test_request.py
@@ -353,6 +376,20 @@ steps:
   - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
 
+# TODO: Add the "V1 Test attetion (MI300)" test group
+
+- label: V1 Test attention (H100) # 10min
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  timeout_in_minutes: 30
+  gpu: h100
+  source_file_dependencies:
+  - vllm/v1/attention
+  - tests/v1/attention
+  commands:
+  - pytest -v -s v1/attention
+
 - label: V1 Test others (CPU) # 5 mins
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
@@ -479,10 +516,11 @@ steps:
   - tests/compile
   commands:
   - pytest -v -s compile/test_basic_correctness.py
+  - pytest -v -s compile/test_multimodal_compile.py
   - pytest -v -s compile/piecewise/
 
-- label: PyTorch Fullgraph Test # 22min
-  timeout_in_minutes: 35
+- label: PyTorch Fullgraph Test # 27min
+  timeout_in_minutes: 40
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
@@ -491,8 +529,23 @@ steps:
   - vllm/
   - tests/compile
   commands:
-  - pytest -v -s compile/test_full_graph.py
-  - pytest -v -s compile/test_fusions_e2e.py
+  - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+  # Limit to no custom ops to reduce running time
+  # Wrap with quotes to escape yaml and avoid starting -k string with a -
+  - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
+
+- label: Cudagraph test
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  source_file_dependencies:
+  - tests/v1/cudagraph
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/config/compilation.py
+  - vllm/compilation
+  commands:
+  - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+  - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
 
 - label: Kernels Core Operation Test # 48min
   timeout_in_minutes: 75
@@ -544,6 +597,8 @@ steps:
   - tests/kernels/moe
   - vllm/model_executor/layers/fused_moe/
   - vllm/distributed/device_communicators/
+  - vllm/envs.py
+  - vllm/config
   commands:
   - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 2
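The Kernels MoE step above runs with parallelism: 2, and its pytest shard flags are written as $$BUILDKITE_PARALLEL_JOB and $$BUILDKITE_PARALLEL_JOB_COUNT; the doubled dollar sign is assumed here to be the usual Buildkite escape that defers variable expansion from pipeline upload to the job's shell. A rough sketch of what the command becomes at run time, with sample values:

# Sample values; in a real run the Buildkite agent exports these for each parallel job.
export BUILDKITE_PARALLEL_JOB=1
export BUILDKITE_PARALLEL_JOB_COUNT=2
# By the time the command runs, "$$VAR" has become "$VAR", so the shell expands it:
echo "pytest -v -s kernels/moe --shard-id=$BUILDKITE_PARALLEL_JOB --num-shards=$BUILDKITE_PARALLEL_JOB_COUNT"
# prints: pytest -v -s kernels/moe --shard-id=1 --num-shards=2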
@@ -562,10 +617,13 @@ steps:
 
 - label: Model Executor Test # 23min
   timeout_in_minutes: 35
+  torch_nightly: true
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   source_file_dependencies:
+  - vllm/engine/arg_utils.py
+  - vllm/config/model.py
   - vllm/model_executor
   - tests/model_executor
   - tests/entrypoints/openai/test_tensorizer_entrypoint.py
@@ -861,9 +919,10 @@ steps:
   - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
 
 - label: Multi-Modal Accuracy Eval (Small Models) # 10min
+  timeout_in_minutes: 70
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
-  timeout_in_minutes: 15
+  # grade: Blocking
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
   source_file_dependencies:
   - vllm/multimodal/
@@ -934,6 +993,7 @@ steps:
 - label: Transformers Nightly Models Test
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_1
+  # grade: Blocking
   working_dir: "/vllm-workspace/"
   optional: true
   commands:
@@ -961,11 +1021,16 @@ steps:
   - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
   - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
   - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/attention/backends/mla/cutlass_mla.py
+  - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/platforms/cuda.py
+  - vllm/attention/selector.py
   commands:
   - nvidia-smi
   - python3 examples/offline_inference/basic/chat.py
   # Attention
   # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+  - pytest -v -s tests/kernels/attention/test_attention_selector.py
   - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
   - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
   - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
@@ -1002,7 +1067,33 @@ steps:
   - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
   # this runner has 2 GPUs available even though num_gpus=2 is not set
   - pytest -v -s tests/compile/test_fusion_all_reduce.py
+  # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+  # Wrap with quotes to escape yaml
+  - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
+
+- label: Blackwell Fusion E2E Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusions_e2e.py
+  - tests/compile/test_full_graph.py
+  commands:
+  - nvidia-smi
+  # Run all e2e fusion tests
   - pytest -v -s tests/compile/test_fusions_e2e.py
+  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+  - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
 
 - label: Blackwell GPT-OSS Eval
   timeout_in_minutes: 60
@@ -1253,6 +1344,7 @@ steps:
 - label: NixlConnector PD accuracy tests (Distributed) # 30min
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_4
+  # grade: Blocking
   timeout_in_minutes: 30
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
@@ -1267,6 +1359,9 @@ steps:
 ##### A100 test #####
 
 - label: Distributed Tests (A100) # optional
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
   gpu: a100
   optional: true
   num_gpus: 4
@@ -1281,6 +1376,9 @@ steps:
   - pytest -v -s -x lora/test_mixtral.py
 
 - label: LM Eval Large Models # optional
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
   gpu: a100
   optional: true
   num_gpus: 4
@@ -1292,8 +1390,27 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 
+##### H100 test #####
+- label: LM Eval Large Models (H100) # optional
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
 ##### H200 test #####
 - label: Distributed Tests (H200) # optional
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_2
+  # grade: Blocking
   gpu: h200
   optional: true
   working_dir: "/vllm-workspace/"
@@ -1305,6 +1422,7 @@ steps:
   - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+  - pytest -v -s tests/v1/distributed/test_dbo.py
 
 ##### B200 test #####
 - label: Distributed Tests (B200) # optional
@@ -1315,6 +1433,7 @@ steps:
   commands:
   - pytest -v -s tests/distributed/test_context_parallel.py
   - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+  - pytest -v -s tests/v1/distributed/test_dbo.py
 
 ##### RL Integration Tests #####
 - label: Prime-RL Integration Test # 15min
@@ -1330,3 +1449,27 @@ steps:
   - .buildkite/scripts/run-prime-rl-test.sh
   commands:
   - bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Accuracy
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
