From 756a331548299d2d2172a9380265a89376acab62 Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Tue, 24 Feb 2026 18:34:51 -0600 Subject: [PATCH 01/10] Enabling some B200-specific tests on MI355. Signed-off-by: Alexei V. Ivanov --- .buildkite/test-amd.yaml | 131 ++++-------------- .../configs/Qwen3-Next-FP8-EP2_MI355.yaml | 11 ++ tests/evals/gsm8k/configs/models-mi355.txt | 5 + 3 files changed, 44 insertions(+), 103 deletions(-) create mode 100644 tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2_MI355.yaml create mode 100644 tests/evals/gsm8k/configs/models-mi355.txt diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index ffdf4b83c0e2..6805c38f209f 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -463,17 +463,6 @@ steps: - pytest -v -s v1/determinism/test_batch_invariance.py - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py -- label: V1 Test attention (B200) # 10min - timeout_in_minutes: 30 - gpu: b200 - source_file_dependencies: - - vllm/config/attention.py - - vllm/model_executor/layers/attention - - vllm/v1/attention - - tests/v1/attention - commands: - - pytest -v -s v1/attention - - label: V1 Test others (CPU) # 5 mins mirror_hardwares: [amdexperimental, amdproduction, amdtentative] agent_pool: mi325_1 @@ -1124,48 +1113,6 @@ steps: # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper -- label: Blackwell Test # 21 min - timeout_in_minutes: 30 - working_dir: "/vllm-workspace/" - gpu: b200 - # optional: true - source_file_dependencies: - - csrc/quantization/fp4/ - - csrc/attention/mla/ - - csrc/quantization/cutlass_w8a8/moe/ - - vllm/model_executor/layers/fused_moe/cutlass_moe.py - - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py - - vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py - - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - - 
vllm/v1/attention/backends/flashinfer.py - - vllm/v1/attention/backends/mla/cutlass_mla.py - - vllm/v1/attention/backends/mla/flashinfer_mla.py - - vllm/v1/attention/selector.py - - vllm/platforms/cuda.py - commands: - - nvidia-smi - - python3 examples/offline_inference/basic/chat.py - # Attention - # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 - - pytest -v -s tests/kernels/attention/test_attention_selector.py - - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' - - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py - - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py - - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py - # Quantization - - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' - - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py - - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py - - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py - - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py - - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - - pytest -v -s tests/kernels/moe/test_flashinfer.py - - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py - - label: Blackwell Fusion and Compile Tests # 30 min timeout_in_minutes: 40 working_dir: "/vllm-workspace/" @@ -1232,16 +1179,6 @@ steps: commands: - pytest -s -v tests/quantization/test_blackwell_moe.py -- label: Blackwell LM Eval Small Models - timeout_in_minutes: 120 - gpu: b200 - optional: true # run on nightlies - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - pytest -s -v 
evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt - ##### 1 GPU test ##### ##### multi gpus test ##### @@ -1647,16 +1584,6 @@ steps: commands: - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 -- label: Qwen3-30B-A3B-FP8-block Accuracy (B200) - timeout_in_minutes: 60 - gpu: b200 - optional: true - num_gpus: 2 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 - - - label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy timeout_in_minutes: 60 mirror_hardwares: [amdexperimental] @@ -2052,19 +1979,6 @@ steps: # TODO: Add the "V1 Test attetion (MI300)" test group -- label: V1 Test attention (H100) # 10min - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - timeout_in_minutes: 30 - gpu: h100 - source_file_dependencies: - - vllm/config/attention.py - - vllm/model_executor/layers/attention - - vllm/v1/attention - - tests/v1/attention - commands: - - pytest -v -s v1/attention - - label: Batch Invariance Tests (H100) # 10min mirror_hardwares: [amdexperimental] agent_pool: mi355_1 @@ -2081,6 +1995,8 @@ steps: - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py - label: V1 Test attention (B200) # 10min + mirror_hardwares: [amdexperimental, amdmi355] + agent_pool: mi355_1 timeout_in_minutes: 30 gpu: b200 source_file_dependencies: @@ -2705,7 +2621,9 @@ steps: # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper -- label: Blackwell Test # 21 min +- label: Blackwell Test (MI355) # 21 min + mirror_hardwares: [amdexperimental, amdmi355] + agent_pool: mi355_1 timeout_in_minutes: 30 working_dir: "/vllm-workspace/" gpu: b200 @@ -2724,7 +2642,7 @@ steps: - vllm/v1/attention/selector.py - vllm/platforms/cuda.py commands: - - nvidia-smi + # rocm-smi - python3 
examples/offline_inference/basic/chat.py # Attention # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 @@ -2815,13 +2733,15 @@ steps: - label: Blackwell LM Eval Small Models timeout_in_minutes: 120 + mirror_hardwares: [amdexperimental, amdproduction, amdmi355] + agent_pool: mi355_1 gpu: b200 optional: true # run on nightlies source_file_dependencies: - csrc/ - vllm/model_executor/layers/quantization commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi355.txt ##### 1 GPU test ##### ##### multi gpus test ##### @@ -3198,18 +3118,9 @@ steps: commands: - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 -- label: Qwen3-30B-A3B-FP8-block Accuracy (H100) - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_4 - timeout_in_minutes: 60 - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 - -- label: Qwen3-30B-A3B-FP8-block Accuracy (B200) +- label: Qwen3-30B-A3B-FP8-block Accuracy (B200/MI355) + mirror_hardwares: [amdexperimental, amdproduction, amdmi355] + agent_pool: mi325_2 timeout_in_minutes: 60 gpu: b200 optional: true @@ -3218,7 +3129,6 @@ steps: commands: - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 - - label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy timeout_in_minutes: 60 mirror_hardwares: [amdexperimental] @@ -3227,4 +3137,19 @@ steps: num_gpus: 4 working_dir: "/vllm-workspace" commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 \ No newline at end of file + - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 + + - label: 
Attention Benchmarks Smoke Test (B200/MI355) + device: b200 + mirror_hardwares: [amdexperimental, amdmi355] + agent_pool: mi355_2 + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/" + timeout_in_minutes: 10 + source_file_dependencies: + - benchmarks/attention_benchmarks/ + - vllm/v1/attention/ + commands: + - python3 benchmarks/attention_benchmarks/benchmark.py --backends ROCM_ATTN ROCM_AITER_FA ROCM_AITER_UNIFIED_ATTN --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1 + diff --git a/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2_MI355.yaml b/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2_MI355.yaml new file mode 100644 index 000000000000..b935aa27ee7d --- /dev/null +++ b/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2_MI355.yaml @@ -0,0 +1,11 @@ +model_name: "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8" +accuracy_threshold: 0.85 +num_questions: 1319 +num_fewshot: 5 +server_args: >- + --max-model-len 4096 + --tensor-parallel-size 2 + --enable-expert-parallel + --async-scheduling +env: + VLLM_USE_FLASHINFER_MOE_FP8: "0" diff --git a/tests/evals/gsm8k/configs/models-mi355.txt b/tests/evals/gsm8k/configs/models-mi355.txt new file mode 100644 index 000000000000..f1122008f597 --- /dev/null +++ b/tests/evals/gsm8k/configs/models-mi355.txt @@ -0,0 +1,5 @@ +Qwen3-0.6B-FP8.yaml +Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml +Qwen1.5-MoE-W4A16-CT.yaml +DeepSeek-V2-Lite-Instruct-FP8.yaml +Qwen3-Next-FP8-EP2_MI355.yaml From af9ebc6c89a82905ae3661c63a3d9780d9060436 Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Tue, 24 Feb 2026 18:53:14 -0600 Subject: [PATCH 02/10] Error fix. Signed-off-by: Alexei V. 
Ivanov --- .buildkite/test-amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 6805c38f209f..ed3a494a271d 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -3139,7 +3139,7 @@ steps: commands: - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 - - label: Attention Benchmarks Smoke Test (B200/MI355) +- label: Attention Benchmarks Smoke Test (B200/MI355) device: b200 mirror_hardwares: [amdexperimental, amdmi355] agent_pool: mi355_2 From 147e2b8903b2434f31cb97ef70732d82d55c8822 Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Tue, 24 Feb 2026 20:04:16 -0600 Subject: [PATCH 03/10] Fixing a routing error. Signed-off-by: Alexei V. Ivanov --- .buildkite/test-amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index ed3a494a271d..fc93a1cdb844 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -3120,7 +3120,7 @@ steps: - label: Qwen3-30B-A3B-FP8-block Accuracy (B200/MI355) mirror_hardwares: [amdexperimental, amdproduction, amdmi355] - agent_pool: mi325_2 + agent_pool: mi35dd5_2 timeout_in_minutes: 60 gpu: b200 optional: true From 7c8025d7caf63be906c8362dfc5c4229956a0618 Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Tue, 24 Feb 2026 20:06:51 -0600 Subject: [PATCH 04/10] Fix agent_pool typo (mi35dd5_2 -> mi355_2). Signed-off-by: Alexei V.
Ivanov --- .buildkite/test-amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index fc93a1cdb844..159d192f6d0f 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -3120,7 +3120,7 @@ steps: - label: Qwen3-30B-A3B-FP8-block Accuracy (B200/MI355) mirror_hardwares: [amdexperimental, amdproduction, amdmi355] - agent_pool: mi35dd5_2 + agent_pool: mi355_2 timeout_in_minutes: 60 gpu: b200 optional: true From 64bf37486808f16366d1b144b251ca4f5b719a1b Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Thu, 26 Feb 2026 11:54:27 -0600 Subject: [PATCH 05/10] Redirecting "Blackwell LM Eval Small Models" to mi355_2 Signed-off-by: Alexei V. Ivanov --- .buildkite/test-amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 159d192f6d0f..85d477b6a70f 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -2734,7 +2734,7 @@ steps: - label: Blackwell LM Eval Small Models timeout_in_minutes: 120 mirror_hardwares: [amdexperimental, amdproduction, amdmi355] - agent_pool: mi355_1 + agent_pool: mi355_2 gpu: b200 optional: true # run on nightlies source_file_dependencies: From b019c0918da4ed8ae960d921c36ab6d955ea143b Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Fri, 27 Feb 2026 11:56:37 -0600 Subject: [PATCH 06/10] Skipping incompatible FA3. Signed-off-by: Alexei V. 
Ivanov --- tests/kernels/attention/test_attention_selector.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index f021df56c05b..783afec3405c 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -302,6 +302,10 @@ def test_invalid_backend(): ("FLEX_ATTENTION", None, False), # Flex does not support ], ) +@pytest.mark.skipif( + current_platform.is_rocm(), + reason="Attention backend FA3 is not supported on ROCm. This test can't succeed.", +) def test_per_head_quant_scales_backend_selection( backend_name: str, flash_attn_version: int | None, should_succeed: bool ): From beb26d5018b5cad071fc6083f2e758536e219eec Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Tue, 3 Mar 2026 15:52:11 -0600 Subject: [PATCH 07/10] Commenting void tests. Signed-off-by: Alexei V. Ivanov --- .buildkite/test-amd.yaml | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 85d477b6a70f..a318d6484db5 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -2647,23 +2647,24 @@ steps: # Attention # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 - pytest -v -s tests/kernels/attention/test_attention_selector.py - - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' - - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py - - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py - - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py - # Quantization - - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' - - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py - - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py - - pytest -v -s 
tests/kernels/quantization/test_nvfp4_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py - - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py - - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - - pytest -v -s tests/kernels/moe/test_flashinfer.py - - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py + + #- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' + #- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py + #- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py + #- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py + ## Quantization + #- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' + #- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py + #- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py + #- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py + #- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py + #- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py + #- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py + #- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py + #- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py + #- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py + #- pytest -v -s tests/kernels/moe/test_flashinfer.py + #- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py - label: Blackwell Fusion and Compile Tests # 30 min timeout_in_minutes: 40 From 3f3b696107ff429d9a432dd57b5ff75957a3b5dc Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Tue, 3 Mar 2026 19:38:07 -0600 Subject: [PATCH 08/10] removing empty line as per PR comment. Signed-off-by: Alexei V. 
Ivanov --- .buildkite/test-amd.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index a318d6484db5..25f9471a15b8 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -2646,8 +2646,7 @@ steps: - python3 examples/offline_inference/basic/chat.py # Attention # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 - - pytest -v -s tests/kernels/attention/test_attention_selector.py - + - pytest -v -s tests/kernels/attention/test_attention_selector.py #- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' #- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py #- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py From bce34859512b381697957a501d9b0507b49dd8b1 Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Tue, 3 Mar 2026 19:40:45 -0600 Subject: [PATCH 09/10] Removing non-AMD-specific env. var as per PR comment. Signed-off-by: Alexei V. Ivanov --- tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2_MI355.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2_MI355.yaml b/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2_MI355.yaml index b935aa27ee7d..302abf97b110 100644 --- a/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2_MI355.yaml +++ b/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2_MI355.yaml @@ -7,5 +7,3 @@ server_args: >- --tensor-parallel-size 2 --enable-expert-parallel --async-scheduling -env: - VLLM_USE_FLASHINFER_MOE_FP8: "0" From 27f5d0ec868b02aa2f88cd5055d432e16d7c2d6d Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Wed, 4 Mar 2026 12:47:55 -0600 Subject: [PATCH 10/10] responding to comments Signed-off-by: Alexei V. 
Ivanov --- .buildkite/test-amd.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 8ead842e8096..b0a672682560 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -2797,7 +2797,7 @@ steps: - vllm/v1/attention/selector.py - vllm/platforms/cuda.py commands: - # rocm-smi + - rocm-smi - python3 examples/offline_inference/basic/chat.py # Attention # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 @@ -3288,6 +3288,7 @@ steps: commands: - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 + - label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy timeout_in_minutes: 60 mirror_hardwares: [amdexperimental]