Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
181 changes: 71 additions & 110 deletions .buildkite/test-amd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -499,17 +499,6 @@ steps:
- pytest -v -s v1/determinism/test_batch_invariance.py
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py

- label: V1 Test attention (B200) # 10min
timeout_in_minutes: 30
gpu: b200
source_file_dependencies:
- vllm/config/attention.py
- vllm/model_executor/layers/attention
- vllm/v1/attention
- tests/v1/attention
commands:
- pytest -v -s v1/attention

- label: V1 Test others (CPU) # 5 mins
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
Expand Down Expand Up @@ -1185,47 +1174,40 @@ steps:
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

- label: Blackwell Test # 21 min
timeout_in_minutes: 30
- label: Blackwell Fusion and Compile Tests # 30 min
Comment thread
Alexei-V-Ivanov-AMD marked this conversation as resolved.
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
# optional: true
source_file_dependencies:
- csrc/quantization/fp4/
- csrc/attention/mla/
- csrc/quantization/cutlass_w8a8/moe/
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py
- vllm/v1/attention/backends/mla/flashinfer_mla.py
- vllm/v1/attention/selector.py
- vllm/platforms/cuda.py
- vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/passes/test_fusion_attn.py
- tests/compile/passes/test_silu_mul_quant_fusion.py
- tests/compile/passes/distributed/test_fusion_all_reduce.py
- tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
# Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_attention_selector.py
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
# Quantization
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
- pytest -v -s tests/compile/passes/test_fusion_attn.py
- pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py

# # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# # Wrap with quotes to escape yaml
# - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
Expand Down Expand Up @@ -1258,16 +1240,6 @@ steps:
commands:
- pytest -s -v tests/quantization/test_blackwell_moe.py

- label: Blackwell LM Eval Small Models
timeout_in_minutes: 120
gpu: b200
optional: true # run on nightlies
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt

##### 1 GPU test #####
##### multi gpus test #####

Expand Down Expand Up @@ -1681,16 +1653,6 @@ steps:
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020

- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
timeout_in_minutes: 60
gpu: b200
optional: true
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1


- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental, amdproduction]
Expand Down Expand Up @@ -2176,19 +2138,6 @@ steps:

# TODO: Add the "V1 Test attention (MI300)" test group

- label: V1 Test attention (H100) # 10min
mirror_hardwares: [amdexperimental]
agent_pool: mi355_1
timeout_in_minutes: 30
gpu: h100
source_file_dependencies:
- vllm/config/attention.py
- vllm/model_executor/layers/attention
- vllm/v1/attention
- tests/v1/attention
commands:
- pytest -v -s v1/attention

- label: Batch Invariance Tests (H100) # 10min
mirror_hardwares: [amdexperimental]
agent_pool: mi355_1
Expand All @@ -2205,6 +2154,8 @@ steps:
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py

- label: V1 Test attention (B200) # 10min
mirror_hardwares: [amdexperimental, amdmi355]
agent_pool: mi355_1
timeout_in_minutes: 30
gpu: b200
source_file_dependencies:
Expand Down Expand Up @@ -2829,7 +2780,9 @@ steps:
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

- label: Blackwell Test # 21 min
- label: Blackwell Test (MI355) # 21 min
mirror_hardwares: [amdexperimental, amdmi355]
agent_pool: mi355_1
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
gpu: b200
Expand All @@ -2848,28 +2801,28 @@ steps:
- vllm/v1/attention/selector.py
- vllm/platforms/cuda.py
commands:
- nvidia-smi
- rocm-smi
- python3 examples/offline_inference/basic/chat.py
# Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_attention_selector.py
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
# Quantization
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
- pytest -v -s tests/kernels/attention/test_attention_selector.py
#- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
#- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
#- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
#- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
## Quantization
#- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
#- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
#- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
#- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
#- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
#- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
#- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
#- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
#- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
#- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
#- pytest -v -s tests/kernels/moe/test_flashinfer.py
#- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py

- label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40
Expand Down Expand Up @@ -2939,13 +2892,15 @@ steps:

- label: Blackwell LM Eval Small Models
timeout_in_minutes: 120
mirror_hardwares: [amdexperimental, amdproduction, amdmi355]
agent_pool: mi355_2
gpu: b200
optional: true # run on nightlies
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi355.txt

##### 1 GPU test #####
##### multi gpus test #####
Expand Down Expand Up @@ -3328,18 +3283,9 @@ steps:
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010

- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi355_4
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020

- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200/MI355)
mirror_hardwares: [amdexperimental, amdproduction, amdmi355]
agent_pool: mi355_2
timeout_in_minutes: 60
gpu: b200
optional: true
Expand All @@ -3358,3 +3304,18 @@ steps:
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040

- label: Attention Benchmarks Smoke Test (B200/MI355)
gpu: b200
Comment thread
Alexei-V-Ivanov-AMD marked this conversation as resolved.
mirror_hardwares: [amdexperimental, amdmi355]
agent_pool: mi355_2
num_gpus: 2
optional: true
working_dir: "/vllm-workspace/"
timeout_in_minutes: 10
source_file_dependencies:
- benchmarks/attention_benchmarks/
- vllm/v1/attention/
commands:
- python3 benchmarks/attention_benchmarks/benchmark.py --backends ROCM_ATTN ROCM_AITER_FA ROCM_AITER_UNIFIED_ATTN --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1

9 changes: 9 additions & 0 deletions tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2_MI355.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
model_name: "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8"
accuracy_threshold: 0.85
num_questions: 1319
num_fewshot: 5
server_args: >-
--max-model-len 4096
--tensor-parallel-size 2
--enable-expert-parallel
--async-scheduling
5 changes: 5 additions & 0 deletions tests/evals/gsm8k/configs/models-mi355.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Qwen3-0.6B-FP8.yaml
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-CT.yaml
DeepSeek-V2-Lite-Instruct-FP8.yaml
Qwen3-Next-FP8-EP2_MI355.yaml
4 changes: 4 additions & 0 deletions tests/kernels/attention/test_attention_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,10 @@ def test_auto_backend_selection_behavior():
("FLEX_ATTENTION", None, False), # Flex does not support
],
)
@pytest.mark.skipif(
current_platform.is_rocm(),
reason="Attention backend FA3 is not supported on ROCm. This test can't succeed.",
)
def test_per_head_quant_scales_backend_selection(
backend_name: str, flash_attn_version: int | None, should_succeed: bool
):
Expand Down