# [DSv4][Nvidia] SM12x DeepSeek V4 support #40991
Base: `main`
Changes from all commits: `40b9a04`, `a874655`, `04af72f`, `e21b9cc`, `a5ce0d7`, `1d6f5c4`, `e734ace`, `bd9fde7`, `0e186bc`, `f8950c3`, `0789bc9`.
The first file touched is an existing DeepSeek V4 MegaMoE test module; the import hunk pulls in the new backend-selection helper:

```diff
@@ -9,6 +9,7 @@
 from vllm.model_executor.models.deepseek_v4 import (
     DeepseekV4MegaMoEExperts,
     _stage_deepseek_v4_mega_moe_inputs,
+    _use_deepseek_v4_mega_moe,
     make_deepseek_v4_expert_params_mapping,
 )
 from vllm.platforms import current_platform
```
```diff
@@ -19,6 +20,52 @@
 )


+def _make_mega_moe_config(
+    *,
+    enable_expert_parallel: bool = True,
+    moe_backend: str = "auto",
+):
+    return SimpleNamespace(
+        parallel_config=SimpleNamespace(
+            enable_expert_parallel=enable_expert_parallel
+        ),
+        kernel_config=SimpleNamespace(moe_backend=moe_backend),
+    )
+
+
+def test_deepseek_v4_mega_moe_selection_preserves_kernel_config(monkeypatch):
+    from vllm import envs
+
+    monkeypatch.delenv("VLLM_DEEPSEEK_V4_USE_MEGA_MOE", raising=False)
+    envs.disable_envs_cache()
+
+    assert _use_deepseek_v4_mega_moe(
+        _make_mega_moe_config(moe_backend="deep_gemm_mega_moe")
+    )
+    assert not _use_deepseek_v4_mega_moe(_make_mega_moe_config())
+    with pytest.raises(NotImplementedError, match="requires expert parallel"):
+        _use_deepseek_v4_mega_moe(
+            _make_mega_moe_config(
+                enable_expert_parallel=False,
+                moe_backend="deep_gemm_mega_moe",
+            )
+        )
+
+
+def test_deepseek_v4_mega_moe_selection_env_override(monkeypatch):
+    from vllm import envs
+
+    monkeypatch.setenv("VLLM_DEEPSEEK_V4_USE_MEGA_MOE", "1")
+    envs.disable_envs_cache()
+    assert _use_deepseek_v4_mega_moe(_make_mega_moe_config())
+
+    monkeypatch.setenv("VLLM_DEEPSEEK_V4_USE_MEGA_MOE", "0")
+    envs.disable_envs_cache()
+    assert not _use_deepseek_v4_mega_moe(
+        _make_mega_moe_config(moe_backend="deep_gemm_mega_moe")
+    )
+
+
 def test_deepseek_v4_mega_moe_expert_mapping():
     mapping = make_deepseek_v4_expert_params_mapping(2)
```

> **Member** (on `_make_mega_moe_config`): If we're solely testing backend selection, then we don't need to include this in the tests.
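Taken together, the selection tests pin down a small contract: an explicit `VLLM_DEEPSEEK_V4_USE_MEGA_MOE` value wins outright; otherwise the `deep_gemm_mega_moe` backend must be requested and expert parallelism must be enabled. Below is a minimal sketch of a helper satisfying that contract, inferred only from the assertions above (the PR's real implementation reads the flag through `vllm.envs`; this sketch reads the environment directly, and the config attribute names are the ones the `SimpleNamespace` stub mimics):

```python
import os


def _use_deepseek_v4_mega_moe_sketch(vllm_config) -> bool:
    # An explicit env override wins: "1" forces the MegaMoE path on and
    # "0" forces it off, regardless of the configured MoE backend.
    override = os.environ.get("VLLM_DEEPSEEK_V4_USE_MEGA_MOE")
    if override is not None:
        return override == "1"

    # Without an override, the kernel config must request the backend.
    if vllm_config.kernel_config.moe_backend != "deep_gemm_mega_moe":
        return False

    # MegaMoE shards experts across ranks, so EP has to be enabled.
    if not vllm_config.parallel_config.enable_expert_parallel:
        raise NotImplementedError(
            "DeepSeek V4 MegaMoE requires expert parallel"
        )
    return True
```

Note the ordering: the override is consulted before the expert-parallel check, which is why the env-override tests above can pass a default (EP-enabled) config.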
```diff
@@ -46,7 +93,8 @@ def test_deepseek_v4_mega_moe_ue8m0_uint8_to_float():

 def test_deepseek_v4_mega_moe_weight_loader_uses_ep_expert_ownership():
     vllm_config = SimpleNamespace(
-        scheduler_config=SimpleNamespace(max_num_batched_tokens=4)
+        scheduler_config=SimpleNamespace(max_num_batched_tokens=4),
+        compilation_config=SimpleNamespace(static_forward_context={}),
     )
     experts = DeepseekV4MegaMoEExperts(
         vllm_config,
```
```diff
@@ -111,7 +159,10 @@ def test_deepseek_v4_mega_moe_weight_loader_uses_ep_expert_ownership():
     reason="DeepSeek V4 MegaMoE fused input staging requires CUDA.",
 )
 def test_deepseek_v4_mega_moe_fused_input_staging_is_bitwise_exact():
-    from vllm.third_party.deep_gemm.utils import per_token_cast_to_fp8
+    per_token_cast_to_fp8 = pytest.importorskip(
+        "deep_gemm.utils",
+        reason="DeepGEMM helper package is required for FP8 staging parity.",
+    ).per_token_cast_to_fp8

     device = torch.device("cuda")
     num_tokens = 7
```
> **Member:** Don't think this is needed as well.
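For readers unfamiliar with the DeepGEMM helper the staging-parity test now imports lazily: `per_token_cast_to_fp8` quantizes activations with one scale per token row. A rough, self-contained sketch of the idea, not DeepGEMM's actual implementation (which also handles padding and layout):

```python
import torch


def per_token_cast_to_fp8_sketch(x: torch.Tensor):
    """Quantize each row of ``x`` to float8_e4m3fn with a per-row scale."""
    # Pick the scale so the row's max |value| maps to the largest finite
    # e4m3 value (448.0); the clamp avoids a zero scale for all-zero rows.
    amax = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-4)
    scale = amax / 448.0
    return (x / scale).to(torch.float8_e4m3fn), scale.squeeze(-1)
```

The "bitwise exact" assertion then presumably checks that staging the fused MegaMoE inputs produces the same FP8 payload and scales as calling the reference cast directly.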
Next, a new test file asserting that DeepSeek V4 declares pipeline-parallel support:

```diff
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from vllm.model_executor.models.deepseek_v4 import DeepseekV4ForCausalLM
+from vllm.model_executor.models.interfaces import supports_pp
+
+
+def test_deepseek_v4_declares_pipeline_parallel_support():
+    assert supports_pp(DeepseekV4ForCausalLM)
```
> **Member:** Also no need to include this.
Another new test file covers e8m0 initialization of FP8 block-scale parameters:

```diff
@@ -0,0 +1,33 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+import vllm.model_executor.parameter as parameter
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    create_fp8_scale_parameter,
+)
+from vllm.model_executor.parameter import BlockQuantScaleParameter
+
+
+@pytest.mark.skipif(
+    not hasattr(torch, "float8_e8m0fnu"),
+    reason="torch does not expose float8_e8m0fnu",
+)
+def test_create_fp8_scale_parameter_initializes_e8m0(monkeypatch):
+    monkeypatch.setattr(parameter, "get_tensor_model_parallel_rank", lambda: 0)
+    monkeypatch.setattr(parameter, "get_tensor_model_parallel_world_size", lambda: 1)
+
+    scale = create_fp8_scale_parameter(
+        BlockQuantScaleParameter,
+        output_partition_sizes=[128],
+        input_size_per_partition=128,
+        block_size=[128, 128],
+        weight_loader=None,
+        scale_dtype=torch.float8_e8m0fnu,
+    )
+
+    assert scale.dtype == torch.float8_e8m0fnu
+    raw_scale = scale.data.view(torch.uint8)
+    assert torch.equal(raw_scale, torch.zeros_like(raw_scale))
```
> **Member:** Also no need to include.
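Context for the zero-initialization assertion in this test: `float8_e8m0fnu` is a pure-exponent format with no sign or mantissa bits, where each byte `b` decodes to 2^(b - 127). A raw byte of zero is therefore not 0.0 but 2^-127, the smallest representable scale, which is why the test checks initialization through a `uint8` view rather than comparing float values. A quick illustration (assumes a PyTorch build that exposes the dtype and its float conversion):

```python
import torch

# Zero raw bytes, reinterpreted as e8m0: each byte b decodes to 2**(b - 127).
raw = torch.zeros(4, dtype=torch.uint8)
scale = raw.view(torch.float8_e8m0fnu)
print(scale.view(torch.uint8))  # tensor([0, 0, 0, 0], dtype=torch.uint8)
print(scale.float())            # four copies of 2**-127, not 0.0

# A raw byte of 127 decodes to 2**0 == 1.0.
one = torch.tensor([127], dtype=torch.uint8).view(torch.float8_e8m0fnu)
print(one.float())              # tensor([1.])
```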
Finally, a new test file checks that loading e8m0 MXFP4 scales through `FusedMoE._load_w13` preserves their raw bytes:

```diff
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+def test_mxfp4_e8m0_scale_loading_preserves_raw_bytes():
+    from types import SimpleNamespace
+
+    import pytest
+    import torch
+
+    from vllm.model_executor.layers.fused_moe.layer import FusedMoE
+
+    e8m0_dtype = getattr(torch, "float8_e8m0fnu", None)
+    if e8m0_dtype is None:
+        pytest.skip("torch does not expose float8_e8m0fnu")
+
+    layer = object.__new__(FusedMoE)
+    layer.moe_config = SimpleNamespace(is_act_and_mul=True)
+
+    expert_data = torch.zeros((4, 2), dtype=torch.uint8)
+    loaded_scale = torch.tensor(
+        [[0.0078125, 0.015625], [0.5, 1.0]],
+        dtype=e8m0_dtype,
+    )
+
+    layer._load_w13(
+        expert_data=expert_data,
+        shard_dim=0,
+        shard_id="w1",
+        loaded_weight=loaded_scale,
+        tp_rank=0,
+    )
+
+    torch.testing.assert_close(
+        expert_data[:2],
+        loaded_scale.view(torch.uint8),
+        rtol=0,
+        atol=0,
+    )
```
> Why do we want to change this? I'd prefer to keep the original name.
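A closing note on the raw-byte framing used in the MXFP4 test above: e8m0 scales such as 0.0078125 (2^-7) and 1.0 (2^0) must land in the `uint8` expert buffer bit-exactly, and viewing through `uint8` is what makes that checkable. A small demonstration of the invariant the test relies on (assumes a torch build with `float8_e8m0fnu`; the buffer name here is illustrative):

```python
import torch

e8m0 = torch.float8_e8m0fnu

# The scales used in the test are exact powers of two: 2**-7, 2**-6, 2**-1, 2**0.
scales = torch.tensor([[0.0078125, 0.015625], [0.5, 1.0]], dtype=e8m0)

# A uint8 view exposes the biased exponents directly: byte = log2(x) + 127.
print(scales.view(torch.uint8))  # tensor([[120, 121], [126, 127]], dtype=torch.uint8)

# Copying through a uint8 buffer and viewing back is a bit-exact round trip,
# which is the property the weight loader is expected to preserve.
buf = torch.zeros((2, 2), dtype=torch.uint8)
buf.copy_(scales.view(torch.uint8))
assert torch.equal(buf.view(e8m0).view(torch.uint8), scales.view(torch.uint8))
```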