diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 6eda7bce9586..a0da0902efd0 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -499,17 +499,6 @@ steps: - pytest -v -s v1/determinism/test_batch_invariance.py - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py -- label: V1 Test attention (B200) # 10min - timeout_in_minutes: 30 - gpu: b200 - source_file_dependencies: - - vllm/config/attention.py - - vllm/model_executor/layers/attention - - vllm/v1/attention - - tests/v1/attention - commands: - - pytest -v -s v1/attention - - label: V1 Test others (CPU) # 5 mins mirror_hardwares: [amdexperimental, amdproduction, amdtentative] agent_pool: mi325_1 @@ -1185,47 +1174,40 @@ steps: # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper -- label: Blackwell Test # 21 min - timeout_in_minutes: 30 +- label: Blackwell Fusion and Compile Tests # 30 min + timeout_in_minutes: 40 working_dir: "/vllm-workspace/" gpu: b200 - # optional: true source_file_dependencies: - csrc/quantization/fp4/ - - csrc/attention/mla/ - - csrc/quantization/cutlass_w8a8/moe/ - - vllm/model_executor/layers/fused_moe/cutlass_moe.py - - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py - - vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/v1/attention/backends/flashinfer.py - - vllm/v1/attention/backends/mla/cutlass_mla.py - - vllm/v1/attention/backends/mla/flashinfer_mla.py - - vllm/v1/attention/selector.py - - vllm/platforms/cuda.py + - vllm/v1/worker/ + - vllm/v1/cudagraph_dispatcher.py + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/passes/test_fusion_attn.py + - 
tests/compile/passes/test_silu_mul_quant_fusion.py + - tests/compile/passes/distributed/test_fusion_all_reduce.py + - tests/compile/fullgraph/test_full_graph.py commands: - nvidia-smi - - python3 examples/offline_inference/basic/chat.py - # Attention - # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 - - pytest -v -s tests/kernels/attention/test_attention_selector.py - - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' - - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py - - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py - - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py - # Quantization - - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' - - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py - - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py - - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py - - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py - - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - - pytest -v -s tests/kernels/moe/test_flashinfer.py - - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py + - pytest -v -s tests/compile/passes/test_fusion_attn.py + - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py + # this runner has 2 GPUs available even though num_gpus=2 is not set + - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py + + # # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time + # # Wrap with quotes to escape yaml + # - "pytest -v -s 
tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 + # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. + + # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) + - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - label: Blackwell GPT-OSS Eval timeout_in_minutes: 60 @@ -1258,16 +1240,6 @@ steps: commands: - pytest -s -v tests/quantization/test_blackwell_moe.py -- label: Blackwell LM Eval Small Models - timeout_in_minutes: 120 - gpu: b200 - optional: true # run on nightlies - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt - ##### 1 GPU test ##### ##### multi gpus test ##### @@ -1681,16 +1653,6 @@ steps: commands: - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 -- label: Qwen3-30B-A3B-FP8-block Accuracy (B200) - timeout_in_minutes: 60 - gpu: b200 - optional: true - num_gpus: 2 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 - - - label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy timeout_in_minutes: 60 mirror_hardwares: [amdexperimental, amdproduction] @@ -2176,19 +2138,6 @@ steps: # TODO: Add the "V1 Test attention (MI300)" test group -- label: V1 Test attention (H100) # 10min - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - timeout_in_minutes: 30 - gpu: h100 - source_file_dependencies: - - vllm/config/attention.py - - vllm/model_executor/layers/attention - - vllm/v1/attention - - tests/v1/attention - commands: - - pytest -v -s v1/attention - - label: Batch 
Invariance Tests (H100) # 10min
   mirror_hardwares: [amdexperimental]
   agent_pool: mi355_1
@@ -2205,6 +2154,8 @@ steps:
   - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
 
 - label: V1 Test attention (B200) # 10min
+  mirror_hardwares: [amdexperimental, amdmi355]
+  agent_pool: mi355_1
   timeout_in_minutes: 30
   gpu: b200
   source_file_dependencies:
@@ -2829,7 +2780,9 @@ steps:
   # Whisper needs spawn method to avoid deadlock
   - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
 
-- label: Blackwell Test # 21 min
+- label: Blackwell Test (MI355) # 21 min
+  mirror_hardwares: [amdexperimental, amdmi355]
+  agent_pool: mi355_1
   timeout_in_minutes: 30
   working_dir: "/vllm-workspace/"
   gpu: b200
@@ -2848,28 +2801,28 @@ steps:
   - vllm/v1/attention/selector.py
   - vllm/platforms/cuda.py
   commands:
-  - nvidia-smi
+  - rocm-smi
   - python3 examples/offline_inference/basic/chat.py
   # Attention
   # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
-  - pytest -v -s tests/kernels/attention/test_attention_selector.py
-  - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
-  - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
-  - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
-  - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
-  # Quantization
-  - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
-  - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
-  - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
-  - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
-  - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
-  - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
-  - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
-  - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
-  - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
-  - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
-  - pytest -v -s tests/kernels/moe/test_flashinfer.py
-  - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
+  - pytest -v -s tests/kernels/attention/test_attention_selector.py
+  #- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
+  #- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
+  #- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
+  #- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
+  ## Quantization
+  #- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
+  #- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
+  #- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
+  #- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
+  #- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
+  #- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+  #- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+  #- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
+  #- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
+  #- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
+  #- pytest -v -s tests/kernels/moe/test_flashinfer.py
+  #- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
 
 - label: Blackwell Fusion and Compile Tests # 30 min
   timeout_in_minutes: 40
@@ -2939,13 +2892,15 @@ steps:
 
 - label: Blackwell LM Eval Small Models
   timeout_in_minutes: 120
+  mirror_hardwares: [amdexperimental, amdproduction, amdmi355]
+  agent_pool: mi355_2
   gpu: b200
   optional: true # run on nightlies
   source_file_dependencies:
   - csrc/
   - vllm/model_executor/layers/quantization
   commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi355.txt
 
 ##### 1 GPU test #####
 ##### 
multi gpus test #####
@@ -3328,18 +3283,9 @@ steps:
   commands:
   - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
 
-- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi355_4
-  timeout_in_minutes: 60
-  gpu: h100
-  optional: true
-  num_gpus: 4
-  working_dir: "/vllm-workspace"
-  commands:
-  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
-
-- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200/MI355)
+  mirror_hardwares: [amdexperimental, amdproduction, amdmi355]
+  agent_pool: mi355_2
   timeout_in_minutes: 60
   gpu: b200
   optional: true
@@ -3358,3 +3304,18 @@ steps:
   working_dir: "/vllm-workspace"
   commands:
   - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
+
+- label: Attention Benchmarks Smoke Test (B200/MI355)
+  gpu: b200
+  mirror_hardwares: [amdexperimental, amdmi355]
+  agent_pool: mi355_2
+  num_gpus: 2
+  optional: true
+  working_dir: "/vllm-workspace/"
+  timeout_in_minutes: 10
+  source_file_dependencies:
+  - benchmarks/attention_benchmarks/
+  - vllm/v1/attention/
+  commands:
+  - python3 benchmarks/attention_benchmarks/benchmark.py --backends ROCM_ATTN ROCM_AITER_FA ROCM_AITER_UNIFIED_ATTN --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1
+
diff --git a/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2_MI355.yaml b/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2_MI355.yaml
new file mode 100644
index 000000000000..302abf97b110
--- /dev/null
+++ b/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2_MI355.yaml
@@ -0,0 +1,9 @@
+model_name: "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8"
+accuracy_threshold: 0.85
+num_questions: 1319
+num_fewshot: 5
+server_args: >-
+  --max-model-len 4096
+  --tensor-parallel-size 2
+  --enable-expert-parallel
+  --async-scheduling
diff --git a/tests/evals/gsm8k/configs/models-mi355.txt 
b/tests/evals/gsm8k/configs/models-mi355.txt new file mode 100644 index 000000000000..f1122008f597 --- /dev/null +++ b/tests/evals/gsm8k/configs/models-mi355.txt @@ -0,0 +1,5 @@ +Qwen3-0.6B-FP8.yaml +Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml +Qwen1.5-MoE-W4A16-CT.yaml +DeepSeek-V2-Lite-Instruct-FP8.yaml +Qwen3-Next-FP8-EP2_MI355.yaml diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index 6b6cae34f22b..7ac1951fe6fb 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -343,6 +343,10 @@ def test_auto_backend_selection_behavior(): ("FLEX_ATTENTION", None, False), # Flex does not support ], ) +@pytest.mark.skipif( + current_platform.is_rocm(), + reason="Attention backend FA3 is not supported on ROCm. This test can't succeed.", +) def test_per_head_quant_scales_backend_selection( backend_name: str, flash_attn_version: int | None, should_succeed: bool ):