174 changes: 129 additions & 45 deletions .buildkite/test-amd.yaml
@@ -570,9 +570,11 @@ steps:
--ignore=lora/test_qwen3moe_tp.py
parallelism: 4

##### .buildkite/test_areas/pytorch.yaml #####
# corresponds to .buildkite/test_areas/pytorch.yaml
- label: PyTorch Compilation Unit Tests # 15min
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental, amdproduction]
mirror_hardwares: [amdexperimental, amdproduction, tj]
agent_pool: mi325_1
# grade: Blocking
torch_nightly: true
@@ -586,10 +588,16 @@ steps:
# Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
# TODO: clean up this comment if not needed. It is used to
# keep track of the test changes during the vLLM IR Ops refactoring.
# Use `find` to launch multiple instances of pytest.
- "find compile/passes -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"


# corresponds to .buildkite/test_areas/pytorch.yaml
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental, amdproduction]
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental, amdproduction, tj]
agent_pool: mi325_1
# grade: Blocking
torch_nightly: true
@@ -603,17 +611,18 @@ steps:
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"

- label: PyTorch Fullgraph Test # 27min
timeout_in_minutes: 40
mirror_hardwares: [amdexperimental, amdproduction]
# corresponds to .buildkite/test_areas/pytorch.yaml
- label: PyTorch Fullgraph # 27min
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental, amdproduction, tj]
agent_pool: mi325_1
# grade: Blocking
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
- "pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'"
# # Limit to no custom ops to reduce running time
# # Wrap with quotes to escape yaml and avoid starting -k string with a -
# - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
@@ -1176,41 +1185,6 @@ steps:
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py

- label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/passes/test_fusion_attn.py
- tests/compile/passes/test_silu_mul_quant_fusion.py
- tests/compile/passes/distributed/test_fusion_all_reduce.py
- tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
- pytest -v -s tests/compile/passes/test_fusion_attn.py
- pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py

# # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# # Wrap with quotes to escape yaml
# - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
@@ -1334,7 +1308,6 @@ steps:
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- pytest -v -s distributed/test_sequence_parallel.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s v1/worker/test_worker_memory_snapshot.py

@@ -1558,17 +1531,20 @@ steps:
num_gpus: 2
commands:
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py
# ================= 24 passed, 11 warnings in 192.85s (0:03:12) ==================
- pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py
# ================== 48 passed, 8 warnings in 386.41s (0:06:26) ==================
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
# ======================== 8 skipped, 9 warnings in 2.08s ========================
#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
# - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
# ======================== 4 passed, 3 warnings in 30.45s ========================
- HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
- pytest -v -s tests/v1/distributed/test_dbo.py
# ======================== 2 skipped, 3 warnings in 1.97s ========================

##### B200 test #####
- label: Distributed Tests (B200) # optional
@@ -1692,3 +1668,111 @@ steps:
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040

##### .buildkite/test_areas/compile.yaml #####
# Setting up the tests incrementally so that it is easier for the
# CI team to review and upstream to pipeline v2.
# The following tests are important for the vLLM IR Ops refactoring,
# which affects fusion passes on ROCm, so we have to
# enable them as soon as possible.

# corresponds to .buildkite/test_areas/compile.yaml
- label: Sequence Parallel Correctness Tests (2xMI325 GPUs)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/"
mirror_hardwares: [amdexperimental, amdproduction, tj]
agent_pool: mi325_2
num_devices: 2
source_file_dependencies:
- vllm/model_executor/layers/
- vllm/compilation/
- vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py
- tests/compile/correctness_e2e/test_sequence_parallel.py
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- "pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py"

# corresponds to .buildkite/test_areas/compile.yaml
- label: AsyncTP Correctness Tests (2xMI325 GPUs)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/"
mirror_hardwares: [amdexperimental, amdproduction, tj]
agent_pool: mi325_2
optional: true
num_devices: 2
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- "pytest -v -s tests/compile/correctness_e2e/test_async_tp.py"

# corresponds to .buildkite/test_areas/compile.yaml
- label: Fusion and Compile Unit Tests (2xMI325 GPUs)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/"
mirror_hardwares: [amdexperimental, amdproduction, tj]
agent_pool: mi325_2
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/attention/attention.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
- tests/compile/fullgraph/test_full_graph.py
commands:
- rocm-smi
# we run all backend tests on ROCm
- "pytest -v -s tests/compile/passes/test_fusion_attn.py"
- "pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py"
- "pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile"
# TODO: this test is not supported on ROCm; there are aiter kernels for this.
# - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py


# corresponds to .buildkite/test_areas/compile.yaml
- label: Fusion E2E Quick (MI325)
timeout_in_minutes: 15
working_dir: "/vllm-workspace/"
mirror_hardwares: [amdexperimental, amdproduction, tj]
agent_pool: mi325_1
num_devices: 1
source_file_dependencies:
- csrc/quantization/
- vllm/model_executor/
- vllm/v1/attention/
- vllm/compilation/
- tests/compile/fusions_e2e/
commands:
- rocm-smi
# Run all models and attn backends but only Inductor partition and native custom ops
- "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'"
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# TODO: Qwen uses group quantization, which the pattern matcher on ROCm does not support yet.
# - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"

# corresponds to .buildkite/test_areas/compile.yaml
- label: Fusion E2E Config Sweep (MI325)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
mirror_hardwares: [amdexperimental, amdproduction, tj]
agent_pool: mi325_1
num_devices: 1
source_file_dependencies:
- csrc/quantization/
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/attention/attention.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/fusions_e2e/
commands:
- rocm-smi
# Run just llama3 (fp8) for all config combinations
- "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'llama-3'"
- "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8' -k 'inductor_partition and not +rms_norm and +quant_fp8 and qwen3' -k 'llama-3'"

59 changes: 42 additions & 17 deletions tests/compile/fullgraph/test_full_graph.py
@@ -10,6 +10,7 @@

from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm._aiter_ops import rocm_aiter_ops
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
from vllm.platforms import current_platform
from vllm.utils.torch_utils import is_torch_equal_or_newer
@@ -194,29 +195,53 @@ def test_custom_compile_config(
)
@pytest.mark.parametrize(
"model, backend",
[
("Qwen/Qwen2-0.5B", None), # Standard attention model
(
"deepseek-ai/DeepSeek-V2-Lite",
AttentionBackendEnum.FLASHINFER_MLA,
), # MLA (Multi-head Latent Attention) model
],
(
[
("Qwen/Qwen2-0.5B", None), # Standard attention model
(
"deepseek-ai/DeepSeek-V2-Lite",
AttentionBackendEnum.FLASHINFER_MLA,
), # MLA (Multi-head Latent Attention) model
]
if current_platform.is_cuda()
else [
# TRITON_MLA does not support FP8 KV cache,
# so we skip the standard attention model test.
(
"deepseek-ai/DeepSeek-V2-Lite",
AttentionBackendEnum.ROCM_AITER_MLA,
), # MLA (Multi-head Latent Attention) model
(
"deepseek-ai/DeepSeek-V2-Lite",
AttentionBackendEnum.ROCM_AITER_TRITON_MLA,
), # MLA (Multi-head Latent Attention) model
]
),
)
def test_fp8_kv_scale_compile(
compilation_mode: int,
model: str,
backend: AttentionBackendEnum | None,
monkeypatch: pytest.MonkeyPatch,
):
model_kwargs = {
"quantization": "fp8",
"kv_cache_dtype": "fp8_e4m3",
"calculate_kv_scales": True,
"max_model_len": 512,
}
if backend:
model_kwargs["attention_config"] = {"backend": backend.name}

run_model(compilation_mode, model, **model_kwargs)
with monkeypatch.context() as m:
model_kwargs = {
"quantization": "fp8",
"kv_cache_dtype": "fp8_e4m3" if current_platform.is_cuda() else "fp8",
"calculate_kv_scales": True,
"max_model_len": 512,
}
if backend:
model_kwargs["attention_config"] = {"backend": backend.name}
if current_platform.is_rocm():
m.setenv("VLLM_ROCM_USE_AITER", "1")
# Disable Aiter MOE as some shapes are not supported
m.setenv("VLLM_ROCM_USE_AITER_MOE", "0")

rocm_aiter_ops.refresh_env_variables()

run_model(compilation_mode, model, **model_kwargs)


def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs):
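To close, a minimal sketch of the cache-then-refresh pattern the new ROCm branch of test_fp8_kv_scale_compile relies on. Everything here except monkeypatch and the VLLM_ROCM_USE_AITER* variable names is a hypothetical stand-in: FeatureFlags plays the role of vLLM's rocm_aiter_ops, whose refresh_env_variables() the test calls so that flags cached before the env vars were patched get re-read.

import os

import pytest


class FeatureFlags:
    """Caches env-derived switches once, as rocm_aiter_ops is assumed to do."""

    def __init__(self) -> None:
        self.refresh_env_variables()

    def refresh_env_variables(self) -> None:
        # Re-read the environment; without this, values set after the cache
        # was built (e.g. via monkeypatch) are never observed.
        self.use_aiter = os.environ.get("VLLM_ROCM_USE_AITER", "0") == "1"
        self.use_aiter_moe = os.environ.get("VLLM_ROCM_USE_AITER_MOE", "1") == "1"


flags = FeatureFlags()  # built at import time, before any test runs


def test_env_refresh_pattern(monkeypatch: pytest.MonkeyPatch) -> None:
    with monkeypatch.context() as m:
        m.setenv("VLLM_ROCM_USE_AITER", "1")
        m.setenv("VLLM_ROCM_USE_AITER_MOE", "0")
        flags.refresh_env_variables()  # pick up the patched values
        assert flags.use_aiter and not flags.use_aiter_moe
    # monkeypatch.context() restores the environment on exit; refresh again
    # afterwards if later code depends on the original flag values.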