-
-
Notifications
You must be signed in to change notification settings - Fork 16.6k
[ROCm][CI] Upgrade ROCm quantized MoE coverage #40943
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Draft
AndreasKaratzas
wants to merge
3
commits into
vllm-project:main
Choose a base branch
from
ROCm:akaratza_rocm_quantized_moe
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Draft
Changes from all commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,234 @@ | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||
| """ROCm gfx950 quantized-MoE initialization coverage. | ||
|
|
||
| This file mirrors the intent of ``test_blackwell_moe.py`` using ROCm-native | ||
| features instead of CUDA-only backends: | ||
|
|
||
| def test_mi355_moe(): | ||
| print("TODO: add tests for Mi355 MoE quantization") | ||
| - public Neural Magic compressed-tensors MoE models | ||
| - public Quark INT8 MoE smoke coverage | ||
| - ROCm Quark MXFP4/BF16 MoE with explicit ``aiter`` and ``triton`` backends | ||
| - ROCm GPT-OSS MXFP4/FP8 MoE in the same shape the repo already advertises | ||
| - ROCm DeepSeek Quark MXFP4/UINT8 MoE with explicit backend coverage | ||
| """ | ||
|
|
||
| import importlib.metadata | ||
| import importlib.util | ||
| from typing import Any | ||
|
|
||
| import huggingface_hub | ||
| import pytest | ||
| from packaging import version | ||
|
|
||
| from tests.utils import RemoteOpenAIServer | ||
| from vllm.platforms import current_platform | ||
| from vllm.platforms.rocm import on_mi3xx | ||
|
|
||
# Gate the whole module: these tests are only meaningful on ROCm MI300/MI350.
pytestmark = pytest.mark.skipif(
    not (current_platform.is_rocm() and on_mi3xx()),
    reason="MI300/MI350 ROCm only",
)
|
|
||
|
|
||
| def _has_quark_mxfp4_support() -> bool: | ||
| if importlib.util.find_spec("quark") is None: | ||
| return False | ||
| try: | ||
| return version.parse(importlib.metadata.version("amd-quark")) >= version.parse( | ||
| "0.9.0" | ||
| ) | ||
| except importlib.metadata.PackageNotFoundError: | ||
| return False | ||
|
|
||
|
|
||
# Feature gates, resolved once at import time.
QUARK_MXFP4_AVAILABLE = _has_quark_mxfp4_support()
QUARK_AVAILABLE = importlib.util.find_spec("quark") is not None

# Shrink every model to four layers so initialization-only tests stay cheap.
HF_OVERRIDE_TEXT = {
    "num_layers": 4,
    "num_hidden_layers": 4,
}

# The two ROCm attention backends that are meaningful on MI3xx hardware.
ROCM_ATTENTION_BACKENDS = [
    pytest.param("ROCM_ATTN", id="rocm_attn"),
    pytest.param("ROCM_AITER_UNIFIED_ATTN", id="rocm_aiter_unified_attn"),
]
|
|
||
|
|
||
def _has_huggingface_access(repo_id: str) -> bool:
    """Probe whether *repo_id* is readable on the Hugging Face Hub.

    Returns False on a missing/gated repo or an HTTP-level Hub error;
    any other exception (e.g. network outage) propagates to the caller.
    """
    try:
        huggingface_hub.list_repo_refs(repo_id)
    except huggingface_hub.errors.RepositoryNotFoundError:
        return False
    except huggingface_hub.errors.HfHubHTTPError:
        return False
    else:
        return True
|
|
||
|
|
||
def _require_repo_access(repo_id: str) -> None:
    """Skip the current test unless *repo_id* is readable on the Hub."""
    if _has_huggingface_access(repo_id):
        return
    pytest.skip(f"Read access to huggingface.co/{repo_id} is required.")
|
|
||
|
|
||
def _can_initialize(
    model: str,
    *,
    hf_overrides: dict[str, Any] | None = None,
    extra_args: list[str] | None = None,
    env: dict[str, str] | None = None,
    max_wait_seconds: float = 1500,
) -> None:
    """Start a vLLM OpenAI server for *model* and issue one tiny completion.

    The server is booted with dummy weights, eager mode and a tiny context so
    the test only exercises model/backend initialization, not real inference.

    Args:
        model: HF repo id (or local path) of the model to initialize.
        hf_overrides: HF config overrides (e.g. layer-count shrinking).
        extra_args: additional CLI flags appended after the baseline args.
        env: extra environment variables for the server process.
        max_wait_seconds: server startup timeout. Defaults to 1500 to mirror
            the Blackwell test; callers with fast dummy-load models may pass
            a lower value (see review note about CI blockage on hangs).
    """
    # Baseline: tiny context, dummy weights, eager mode — init coverage only.
    base_args = [
        "--max-model-len",
        "2048",
        "--max-num-batched-tokens",
        "256",
        "--max-num-seqs",
        "1",
        "--load-format",
        "dummy",
        "--trust-remote-code",
        "--enforce-eager",
        "--disable-uvicorn-access-log",
    ]
    server_args = base_args + list(extra_args or [])

    with RemoteOpenAIServer(
        model,
        server_args,
        env_dict=env,
        max_wait_seconds=max_wait_seconds,
        override_hf_configs=hf_overrides,
    ) as server:
        client = server.get_client()
        completion = client.completions.create(
            model=model,
            prompt=["Hello, World!"],
            temperature=0,
            max_tokens=2,
        )
        print(completion)
        # A non-None text proves the full request path works end to end.
        assert completion.choices[0].text is not None
|
|
||
|
|
||
@pytest.mark.parametrize("attention_backend", ROCM_ATTENTION_BACKENDS)
def test_nm_qwen15_w4a16_moe_initializes_across_rocm_attention_backends(
    attention_backend: str,
):
    """A public Neural Magic W4A16 MoE model must come up under each ROCm
    attention backend that is meaningful on MI3xx."""
    model_id = "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
    _require_repo_access(model_id)
    backend_flags = ["--attention-backend", attention_backend]
    _can_initialize(
        model_id,
        hf_overrides=HF_OVERRIDE_TEXT,
        extra_args=backend_flags,
    )
|
|
||
|
|
||
def test_nm_mixtral_w4a16_moe_initializes():
    """Initialization smoke test for a second public Neural Magic MoE
    family on ROCm."""
    model_id = "nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized"
    _require_repo_access(model_id)
    _can_initialize(model_id, hf_overrides=HF_OVERRIDE_TEXT)
|
|
||
|
|
||
@pytest.mark.skipif(
    not QUARK_AVAILABLE,
    reason="quark package is required for ROCm Quark MoE tests",
)
def test_tiny_quark_int8_moe_initializes():
    """Initialization smoke test for a small public Quark INT8 MoE model
    on MI3xx."""
    _can_initialize(
        "nameistoken/tiny-qwen3-moe-w8a8-int8-quark",
        hf_overrides=HF_OVERRIDE_TEXT,
    )
|
|
||
|
|
||
@pytest.mark.skipif(
    not QUARK_MXFP4_AVAILABLE,
    reason="amd-quark>=0.9.0 is required for ROCm MXFP4 MoE tests",
)
@pytest.mark.parametrize("moe_backend", ["aiter", "triton"])
def test_gptoss_rocm_quark_mxfp4_bf16_moe_backends_initialize(
    moe_backend: str,
):
    """GPT-OSS MXFP4/BF16 Quark MoE must come up with each of the two real
    ROCm MoE backends exposed at the CLI."""
    model_id = "amd/gpt-oss-20b-w-mxfp4-a-bf16"
    _require_repo_access(model_id)
    # The aiter backend additionally requires the opt-in env toggle.
    extra_env = {"VLLM_ROCM_USE_AITER": "1"} if moe_backend == "aiter" else None
    _can_initialize(
        model_id,
        hf_overrides=HF_OVERRIDE_TEXT,
        extra_args=[
            "--attention-backend",
            "ROCM_AITER_UNIFIED_ATTN",
            "--moe-backend",
            moe_backend,
            "--tokenizer",
            "openai/gpt-oss-20b",
            "--tensor-parallel-size",
            "1",
        ],
        env=extra_env,
    )
|
|
||
|
|
||
@pytest.mark.skipif(
    not current_platform.supports_fp8(),
    reason="FP8 not supported on this hardware",
)
@pytest.mark.skipif(
    not QUARK_MXFP4_AVAILABLE,
    reason="amd-quark>=0.9.0 is required for ROCm MXFP4 MoE tests",
)
def test_gptoss_rocm_quark_mxfp4_fp8_moe_initializes():
    """GPT-OSS MXFP4/FP8 Quark MoE must come up in the same configuration
    the repo already advertises for ROCm evals."""
    model_id = "amd/gpt-oss-20b-MoE-Quant-W-MXFP4-A-FP8-KV-FP8"
    _require_repo_access(model_id)
    _can_initialize(
        model_id,
        hf_overrides=HF_OVERRIDE_TEXT,
        extra_args=[
            "--attention-backend",
            "ROCM_AITER_UNIFIED_ATTN",
            "--tokenizer",
            "openai/gpt-oss-20b",
            "--tensor-parallel-size",
            "1",
        ],
        # This path always runs through aiter, so the toggle is unconditional.
        env={"VLLM_ROCM_USE_AITER": "1"},
    )
|
|
||
|
|
||
@pytest.mark.skipif(
    not QUARK_MXFP4_AVAILABLE,
    reason="amd-quark>=0.9.0 is required for ROCm MXFP4 MoE tests",
)
@pytest.mark.parametrize(
    "moe_backend",
    [
        pytest.param(None, id="auto"),
        pytest.param("aiter", id="aiter"),
        pytest.param("triton", id="triton"),
    ],
)
def test_deepseek_rocm_quark_mxfp4_uint8_moe_backends_initialize(
    moe_backend: str | None,
):
    """DeepSeek MXFP4/UINT8 Quark MoE must come up across the real ROCm
    backend choices for the MXFP4 MoE oracle (None = auto-select)."""
    model_id = "amd/DeepSeek-R1-WMXFP4-AMXFP4-Scale-UINT8-MoE-Quant"
    _require_repo_access(model_id)
    backend_flags = [] if moe_backend is None else ["--moe-backend", moe_backend]
    extra_env = {"VLLM_ROCM_USE_AITER": "1"} if moe_backend == "aiter" else None
    _can_initialize(
        model_id,
        hf_overrides=HF_OVERRIDE_TEXT,
        extra_args=["--tensor-parallel-size", "1", *backend_flags],
        env=extra_env,
    )
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `max_wait_seconds` is set to 1500 (25 minutes), which is exceptionally high for a test using the `dummy` load format. While large models can take time to initialize, dummy loading (which skips disk I/O for weights) should typically complete within a few minutes. Such a long timeout can lead to significant CI blockage if a regression causes the server to hang or fail silently. Consider reducing this to a more reasonable value (e.g., 300-600 seconds).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This directly mirrors the Blackwell test. However, we might indeed not need such a high value here. At the same time, it is only a startup timeout, so there is no real correctness check tied to it, I think.