-
-
Notifications
You must be signed in to change notification settings - Fork 16.2k
[ROCm] [CI] Add new fusion test cases that are relevant to vLLM IR Ops #34307
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
230246d
1c9552a
bffe181
28ed03f
5628eb9
223fe34
119b4b0
218fcfb
befaba1
ca801a1
0b65174
be40a22
d8d0712
a03b94d
f58033a
eabee32
56ac061
b8c0bcd
9ef71e4
158ea2f
0997661
f432148
6891c60
1e8fe87
b81b0f9
0326f76
9001be5
ca222af
06b0aca
676184e
b566461
a7dd03f
24a142d
7e2cca4
089969c
5d05398
3af7195
4922c7a
84f0847
1aace95
3f0e188
64d3b63
c186a19
050544d
059205d
97102c3
071bdb7
0a42a79
3204c5c
de42cfb
a168f7b
53d253d
8374509
4b0cd59
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -610,6 +610,8 @@ steps: | |
| --ignore=lora/test_qwen3moe_tp.py | ||
| parallelism: 4 | ||
|
|
||
| ##### .buildkite/test_areas/pytorch.yaml ##### | ||
| # corresponds to .buildkite/test_areas/pytorch.yaml | ||
| - label: PyTorch Compilation Unit Tests # 15min | ||
| timeout_in_minutes: 30 | ||
| mirror_hardwares: [amdexperimental, amdproduction] | ||
|
|
@@ -627,6 +629,20 @@ steps: | |
| # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 | ||
| - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" | ||
|
|
||
| # corresponds to .buildkite/test_areas/pytorch.yaml | ||
| - label: PyTorch Compilation Passes Unit Tests | ||
| timeout_in_minutes: 20 | ||
| mirror_hardwares: [amdexperimental, amdproduction] | ||
| agent_pool: mi325_1 | ||
| source_file_dependencies: | ||
| - vllm/ | ||
| - tests/compile/passes | ||
| commands: | ||
| # TODO: clean up this comment if not needed. It is used to | ||
| # keep track of the tests changes during vLLM IR Ops refactoring. | ||
| # Use `find` to launch multiple instances of pytest. | ||
| - "find compile/passes -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" | ||
|
|
||
| - label: PyTorch Fullgraph Smoke Test # 15min | ||
| timeout_in_minutes: 30 | ||
| mirror_hardwares: [amdexperimental, amdproduction] | ||
|
|
@@ -1211,41 +1227,6 @@ steps: | |
| - pytest -v -s tests/kernels/moe/test_flashinfer.py | ||
| - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py | ||
|
|
||
| - label: Blackwell Fusion and Compile Tests # 30 min | ||
| timeout_in_minutes: 40 | ||
| working_dir: "/vllm-workspace/" | ||
| gpu: b200 | ||
| source_file_dependencies: | ||
| - csrc/quantization/fp4/ | ||
| - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py | ||
| - vllm/v1/attention/backends/flashinfer.py | ||
| - vllm/v1/worker/ | ||
| - vllm/v1/cudagraph_dispatcher.py | ||
| - vllm/compilation/ | ||
| # can affect pattern matching | ||
| - vllm/model_executor/layers/layernorm.py | ||
| - vllm/model_executor/layers/activation.py | ||
| - vllm/model_executor/layers/quantization/input_quant_fp8.py | ||
| - tests/compile/passes/test_fusion_attn.py | ||
| - tests/compile/passes/test_silu_mul_quant_fusion.py | ||
| - tests/compile/passes/distributed/test_fusion_all_reduce.py | ||
| - tests/compile/fullgraph/test_full_graph.py | ||
| commands: | ||
| - nvidia-smi | ||
| - pytest -v -s tests/compile/passes/test_fusion_attn.py | ||
| - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py | ||
| # this runner has 2 GPUs available even though num_gpus=2 is not set | ||
| - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py | ||
|
|
||
| # # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time | ||
| # # Wrap with quotes to escape yaml | ||
| # - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" | ||
| # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 | ||
| # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. | ||
|
|
||
| # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) | ||
| - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile | ||
|
|
||
| - label: Blackwell GPT-OSS Eval | ||
| timeout_in_minutes: 60 | ||
| working_dir: "/vllm-workspace/" | ||
|
|
@@ -1371,7 +1352,6 @@ steps: | |
| - pytest -v -s ./compile/test_wrapper.py | ||
| - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' | ||
| - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' | ||
| - pytest -v -s compile/correctness_e2e/test_sequence_parallel.py | ||
| - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown | ||
| - pytest -v -s v1/worker/test_worker_memory_snapshot.py | ||
|
|
||
|
|
@@ -1601,16 +1581,16 @@ steps: | |
| commands: | ||
| - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py | ||
| - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py | ||
| - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py | ||
| # TODO: this test is not supported on ROCm, there are aiter kernels for this. | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you quote this issue (and let's make a sub-issue?): #25179
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Created |
||
| # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py | ||
| #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm | ||
| # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" | ||
| # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 | ||
| # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. | ||
|
|
||
| - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py | ||
| - pytest -v -s tests/distributed/test_context_parallel.py | ||
| - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization | ||
| - pytest -v -s tests/v1/distributed/test_dbo.py | ||
| # this test is not supported on ROCm | ||
| # - pytest -v -s tests/v1/distributed/test_dbo.py | ||
|
|
||
| ##### B200 test ##### | ||
| - label: Distributed Tests (B200) # optional | ||
|
|
@@ -1721,6 +1701,93 @@ steps: | |
| commands: | ||
| - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 | ||
|
|
||
| ##### .buildkite/test_areas/compile.yaml ##### | ||
| # Slowly setting up the tests so that it is also easier for the | ||
| # CI team to review and upstream to the pipelinev2. | ||
| # The following tests are important for vLLM IR Ops refactoring, | ||
| # which affects fusion passes on ROCm. So we have to | ||
| # enable them as soon as possible. | ||
|
|
||
| ## TODO: Enable the test in this group | ||
| # # corresponds to .buildkite/test_areas/compile.yaml | ||
| # - label: Fusion and Compile Unit Tests (2xMI325 GPUs) | ||
| # timeout_in_minutes: 20 | ||
| # working_dir: "/vllm-workspace/" | ||
| # mirror_hardwares: [amdexperimental, amdproduction, tj] | ||
| # agent_pool: mi325_1 # changed to 1 GPU until the fusion all reduce is enabled then only revert back to 2 GPUs | ||
| # source_file_dependencies: | ||
| # - csrc/quantization/fp4/ | ||
| # - vllm/model_executor/layers/quantization/ | ||
| # - vllm/model_executor/layers/layernorm.py | ||
| # - vllm/model_executor/layers/activation.py | ||
| # - vllm/model_executor/layers/attention/attention.py | ||
| # - vllm/v1/attention/backends/flashinfer.py | ||
| # - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes | ||
| # - tests/compile/test_fusion_attn.py | ||
| # - tests/compile/test_silu_mul_quant_fusion.py | ||
| # - tests/compile/distributed/test_fusion_all_reduce.py | ||
| # - tests/compile/fullgraph/test_full_graph.py | ||
| # commands: | ||
| # - rocm-smi | ||
| # # we run all backend tests on ROCm | ||
| # # These two tests are covered in "PyTorch Compilation Passes Unit Tests" | ||
| # # - "pytest -v -s tests/compile/passes/test_fusion_attn.py" | ||
| # # - "pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py" | ||
| # # TODO: this test is not supported on ROCm, there are aiter kernels for this. | ||
| # # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py | ||
| # # TODO: find out more details | ||
| # # - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile | ||
|
|
||
| # corresponds to .buildkite/test_areas/compile.yaml | ||
| - label: Fusion E2E Quick (MI325) | ||
| timeout_in_minutes: 15 | ||
| working_dir: "/vllm-workspace/" | ||
| mirror_hardwares: [amdexperimental, amdproduction] | ||
| agent_pool: mi325_1 | ||
| num_devices: 1 | ||
| source_file_dependencies: | ||
| - csrc/quantization/ | ||
| - vllm/model_executor/ | ||
| - vllm/v1/attention/ | ||
| - vllm/compilation/ | ||
| - tests/compile/fusions_e2e/ | ||
| commands: | ||
| - rocm-smi | ||
| # Run all models and attn backends but only Inductor partition and native custom ops | ||
| - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" | ||
| # Different from CUDA, Qwen requires +rms_norm and +quant_fp8 as rms+quant fusion is only supported on AITER | ||
| - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and +rms_norm and +quant_fp8 and qwen3'" | ||
|
|
||
| # corresponds to .buildkite/test_areas/compile.yaml | ||
| - label: Fusion E2E Config Sweep (MI325) | ||
| timeout_in_minutes: 30 | ||
| working_dir: "/vllm-workspace/" | ||
| mirror_hardwares: [amdexperimental, amdproduction] | ||
| agent_pool: mi325_1 | ||
| num_devices: 1 | ||
| source_file_dependencies: | ||
| - csrc/quantization/ | ||
| - vllm/compilation/ | ||
| # can affect pattern matching | ||
| - vllm/model_executor/layers/layernorm.py | ||
| - vllm/model_executor/layers/activation.py | ||
| - vllm/model_executor/layers/attention/attention.py | ||
| - vllm/model_executor/layers/quantization/input_quant_fp8.py | ||
| - tests/compile/fusions_e2e/ | ||
| commands: | ||
| - rocm-smi | ||
| # Run just llama3 (fp8) for all config combinations | ||
| - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3" | ||
|
tjtanaa marked this conversation as resolved.
|
||
|
|
||
| ## There are no ops on ROCm for these tests. | ||
| ## The test still passes but the logs are not useful. | ||
| ## fused ops just call torch.ops.symm_mem which | ||
| ## exists in ROCm even though they don't work | ||
| # - label: AsyncTP Correctness Tests (2xMI325 GPUs) | ||
| # - label: Fusion E2E TP2 Quick (MI325) | ||
| # - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) | ||
| # - label: Fusion E2E TP2 (MI325) | ||
| # - label: Sequence Parallel Correctness Tests (2xMI325 GPUs) | ||
|
|
||
|
|
||
| ##################################################################################################################################### | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,6 +13,7 @@ | |
|
|
||
| class Matches(NamedTuple): | ||
| # simple pointwise | ||
| aiter_rms_quant_fusion: int = 0 | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this ever a different number from
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This will never be a different number from
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sounds good, thanks! |
||
| rms_quant_fusion: int = 0 | ||
| act_quant_fusion: int = 0 | ||
| norm_rope_fusion: int = 0 | ||
|
|
@@ -82,6 +83,9 @@ def has_cuda_graph_wrapper_metadata() -> bool: | |
| ] | ||
|
|
||
| FUSION_LOG_PATTERNS: dict[str, re.Pattern] = { | ||
| "aiter_rms_quant_fusion": re.compile( | ||
| r"RocmAiterRMSNormQuantFusionPass Replaced (\d+) patterns" | ||
| ), | ||
| "rms_quant_fusion": re.compile(r"rms_quant_fusion.py:\d+] Replaced (\d+) patterns"), | ||
| "act_quant_fusion": re.compile(r"act_quant_fusion.py:\d+] Replaced (\d+) patterns"), | ||
| "norm_rope_fusion": re.compile( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These are actually passing? I'm surprised
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yea. The test still passes but the logs are not useful. fused ops just call torch.ops.symm_mem which exists in ROCm even though they don't work.
The
tests/compile/fusions_e2e/test_tp2_async_tp.py also passes. But it doesn't mean this feature works on ROCm