@@ -226,6 +226,27 @@ steps:
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
   - popd

+- label: Distributed Tests (8 GPUs) # 4min
+  timeout_in_minutes: 10
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_8
+  # grade: Blocking
+  gpu: h100
+  num_gpus: 8
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - examples/offline_inference/torchrun_dp_example.py
+  - vllm/config/parallel.py
+  - vllm/distributed/
+  - vllm/v1/engine/llm_engine.py
+  - vllm/v1/executor/uniproc_executor.py
+  - vllm/v1/worker/gpu_worker.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  # - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and dp=4 with ep
+  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
 - label: EPLB Algorithm Test # 5min
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
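A note on the new torchrun step: torchrun spawns one rank per --nproc-per-node, and the parallel degrees must multiply out to exactly that rank count, i.e. tp * pp * dp = 2 * 1 * 4 = 8. A minimal sketch of the invocation, assuming one GPU per rank as in the example script:

    # world size check: tp * pp * dp = 2 * 1 * 4 = 8 -> --nproc-per-node=8
    torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py \
      --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep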
@@ -238,18 +259,19 @@ steps:
   commands:
   - pytest -v -s distributed/test_eplb_algo.py

-- label: EPLB Execution Test # 5min
+- label: EPLB Execution Test # 10min
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_4
   # grade: Blocking
-  timeout_in_minutes: 15
+  timeout_in_minutes: 20
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
   - vllm/distributed/eplb
   - tests/distributed/test_eplb_execute.py
   commands:
   - pytest -v -s distributed/test_eplb_execute.py
+  - pytest -v -s distributed/test_eplb_spec_decode.py

 - label: Metrics, Tracing Test # 12min
   timeout_in_minutes: 20
@@ -273,7 +295,7 @@ steps:

 - label: Regression Test # 7min
   timeout_in_minutes: 20
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
   agent_pool: mi325_1
   grade: Blocking
   source_file_dependencies:
@@ -288,7 +310,7 @@ steps:
   timeout_in_minutes: 40
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
-  # grade: Blocking
+  # grade: Blocking
   source_file_dependencies:
   - vllm/
   - tests/engine
@@ -337,14 +359,15 @@ steps:
   - tests/v1
   commands:
   # split the test to avoid interference
+  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
   - pytest -v -s -m 'not cpu_test' v1/core
   - pytest -v -s v1/executor
   - pytest -v -s v1/kv_offload
   - pytest -v -s v1/sample
   - pytest -v -s v1/logits_processors
   - pytest -v -s v1/worker
   - pytest -v -s v1/spec_decode
-  - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_lmcache_integration.py
+  - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
   - pytest -v -s -m 'not cpu_test' v1/metrics
   - pytest -v -s v1/test_oracle.py
   - pytest -v -s v1/test_request.py
@@ -353,6 +376,20 @@ steps:
   - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

+# TODO: Add the "V1 Test attention (MI300)" test group
+
+- label: V1 Test attention (H100) # 10min
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  timeout_in_minutes: 30
+  gpu: h100
+  source_file_dependencies:
+  - vllm/v1/attention
+  - tests/v1/attention
+  commands:
+  - pytest -v -s v1/attention
+
 - label: V1 Test others (CPU) # 5 mins
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
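The V1 hunk above now installs the KV-connector extras before the kv_connector suite runs, since the lmcache integration tests are no longer ignored. A minimal local reproduction sketch, assuming the CI container layout under /vllm-workspace:

    # sketch, assuming the vLLM CI image layout
    uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
    cd /vllm-workspace/tests
    pytest -v -s -m 'not cpu_test' v1/kv_connector/unit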
@@ -479,10 +516,11 @@ steps:
   - tests/compile
   commands:
   - pytest -v -s compile/test_basic_correctness.py
+  - pytest -v -s compile/test_multimodal_compile.py
   - pytest -v -s compile/piecewise/

-- label: PyTorch Fullgraph Test # 22min
-  timeout_in_minutes: 35
+- label: PyTorch Fullgraph Test # 27min
+  timeout_in_minutes: 40
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
@@ -491,8 +529,23 @@ steps:
   - vllm/
   - tests/compile
   commands:
-  - pytest -v -s compile/test_full_graph.py
-  - pytest -v -s compile/test_fusions_e2e.py
+  - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+  # Limit to no custom ops to reduce running time
+  # Wrap with quotes to escape yaml and avoid starting -k string with a -
+  - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
+
+- label: Cudagraph test
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  source_file_dependencies:
+  - tests/v1/cudagraph
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/config/compilation.py
+  - vllm/compilation
+  commands:
+  - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+  - pytest -v -s v1/cudagraph/test_cudagraph_mode.py

 - label: Kernels Core Operation Test # 48min
   timeout_in_minutes: 75
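On the quoting comment in the fullgraph hunk above: pytest's -k expression matches substrings of test IDs, and the parametrized fusion tests encode a disabled custom op as a "-quant_fp8" fragment in the ID. An expression that begins with "-" would be rejected as a command-line flag, hence the leading "TRITON and" plus the YAML-level quotes. A sketch of the selection:

    # matches parametrized IDs containing both "TRITON" and "-quant_fp8"
    # (the -quant_fp8 fragment marks the custom quant_fp8 op as disabled)
    pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'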
@@ -544,6 +597,8 @@ steps:
   - tests/kernels/moe
   - vllm/model_executor/layers/fused_moe/
   - vllm/distributed/device_communicators/
+  - vllm/envs.py
+  - vllm/config
   commands:
   - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 2
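The doubled dollar signs in the MoE step above are Buildkite's escape for runtime variables: $$VAR survives pipeline-upload interpolation so the agent's shell sees $VAR. With parallelism: 2, Buildkite starts two jobs whose index and count land in BUILDKITE_PARALLEL_JOB and BUILDKITE_PARALLEL_JOB_COUNT, and the --shard-id/--num-shards flags (pytest-shard) split the suite between them. What the two jobs effectively run, as a sketch:

    # job 0 of 2
    pytest -v -s kernels/moe --shard-id=0 --num-shards=2
    # job 1 of 2
    pytest -v -s kernels/moe --shard-id=1 --num-shards=2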
@@ -562,10 +617,13 @@ steps:

 - label: Model Executor Test # 23min
   timeout_in_minutes: 35
+  torch_nightly: true
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   source_file_dependencies:
+  - vllm/engine/arg_utils.py
+  - vllm/config/model.py
   - vllm/model_executor
   - tests/model_executor
   - tests/entrypoints/openai/test_tensorizer_entrypoint.py
@@ -861,9 +919,10 @@ steps:
   - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work

 - label: Multi-Modal Accuracy Eval (Small Models) # 10min
+  timeout_in_minutes: 70
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
-  timeout_in_minutes: 15
+  # grade: Blocking
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
   source_file_dependencies:
   - vllm/multimodal/
@@ -934,6 +993,7 @@ steps:
 - label: Transformers Nightly Models Test
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_1
+  # grade: Blocking
   working_dir: "/vllm-workspace/"
   optional: true
   commands:
@@ -961,11 +1021,16 @@ steps:
   - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
   - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
   - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/attention/backends/mla/cutlass_mla.py
+  - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/platforms/cuda.py
+  - vllm/attention/selector.py
   commands:
   - nvidia-smi
   - python3 examples/offline_inference/basic/chat.py
   # Attention
   # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+  - pytest -v -s tests/kernels/attention/test_attention_selector.py
   - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
   - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
   - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
@@ -1002,7 +1067,33 @@ steps:
   - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
   # this runner has 2 GPUs available even though num_gpus=2 is not set
   - pytest -v -s tests/compile/test_fusion_all_reduce.py
+  # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+  # Wrap with quotes to escape yaml
+  - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
+
+- label: Blackwell Fusion E2E Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusions_e2e.py
+  - tests/compile/test_full_graph.py
+  commands:
+  - nvidia-smi
+  # Run all e2e fusion tests
   - pytest -v -s tests/compile/test_fusions_e2e.py
+  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+  - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile

 - label: Blackwell GPT-OSS Eval
   timeout_in_minutes: 60
@@ -1253,6 +1344,7 @@ steps:
 - label: NixlConnector PD accuracy tests (Distributed) # 30min
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_4
+  # grade: Blocking
   timeout_in_minutes: 30
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
@@ -1267,6 +1359,9 @@ steps:
 ##### A100 test #####

 - label: Distributed Tests (A100) # optional
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
   gpu: a100
   optional: true
   num_gpus: 4
@@ -1281,6 +1376,9 @@ steps:
   - pytest -v -s -x lora/test_mixtral.py

 - label: LM Eval Large Models # optional
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
   gpu: a100
   optional: true
   num_gpus: 4
@@ -1292,8 +1390,27 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

+##### H100 test #####
+- label: LM Eval Large Models (H100) # optional
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
 ##### H200 test #####
 - label: Distributed Tests (H200) # optional
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_2
+  # grade: Blocking
   gpu: h200
   optional: true
   working_dir: "/vllm-workspace/"
@@ -1305,6 +1422,7 @@ steps:
   - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+  - pytest -v -s tests/v1/distributed/test_dbo.py

 ##### B200 test #####
 - label: Distributed Tests (B200) # optional
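The same rank arithmetic applies to the H200 data-parallel example above: --dp-size=2 with --tp-size=1 needs 2 * 1 = 2 ranks, which is why exactly two devices are exposed through CUDA_VISIBLE_DEVICES. A condensed sketch, env values as in the step:

    # dp * tp = 2 * 1 = 2 ranks -> two visible devices
    CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 \
      python3 examples/offline_inference/data_parallel.py \
        --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048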
@@ -1315,6 +1433,7 @@ steps:
   commands:
   - pytest -v -s tests/distributed/test_context_parallel.py
   - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+  - pytest -v -s tests/v1/distributed/test_dbo.py

 ##### RL Integration Tests #####
 - label: Prime-RL Integration Test # 15min
@@ -1330,3 +1449,27 @@ steps:
   - .buildkite/scripts/run-prime-rl-test.sh
   commands:
   - bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Accuracy
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020