Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,6 @@ mkdocs.yaml @hmellor

# DeepseekV4-specific files
/vllm/models/deepseek_v4 @zyongye
/vllm/v1/attention/ops/deepseek_v4_ops @zyongye
/vllm/model_executor/layers/sparse_attn_indexer.py @zyongye

# Mistral-specific files
Expand Down
2 changes: 1 addition & 1 deletion tests/kernels/core/test_fused_q_kv_rmsnorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
import pytest
import torch

from vllm.models.deepseek_v4.common.ops import fused_q_kv_rmsnorm
from vllm.platforms import current_platform
from vllm.v1.attention.ops.deepseek_v4_ops import fused_q_kv_rmsnorm

pytestmark = pytest.mark.skipif(
not current_platform.is_cuda_alike(),
Expand Down
4 changes: 2 additions & 2 deletions tests/kernels/test_compressor_kv_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@
import torch

from vllm import _custom_ops as ops
from vllm.v1.attention.ops.deepseek_v4_ops import (
from vllm.models.deepseek_v4.common.ops import (
dequantize_and_gather_k_cache,
quantize_and_insert_k_cache,
)
from vllm.v1.attention.ops.deepseek_v4_ops.fused_compress_quant_cache import (
from vllm.models.deepseek_v4.common.ops.fused_compress_quant_cache import (
_fused_kv_compress_norm_rope_insert_indexer_attn,
_fused_kv_compress_norm_rope_insert_indexer_mxfp4_attn,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import pytest
import torch

from vllm.v1.attention.ops.deepseek_v4_ops import (
from vllm.models.deepseek_v4.common.ops import (
dequantize_and_gather_k_cache,
quantize_and_insert_k_cache,
)
Expand Down
6 changes: 2 additions & 4 deletions tests/kernels/test_fused_indexer_q_rope_quant.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,8 @@
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8,
)
from vllm.models.deepseek_v4.common.ops import fused_indexer_q_rope_quant
from vllm.utils.import_utils import has_cutedsl
from vllm.v1.attention.ops.deepseek_v4_ops.fused_indexer_q import (
fused_indexer_q_rope_quant,
)

HEAD_DIM = 128
ROPE_DIM = 64
Expand Down Expand Up @@ -156,7 +154,7 @@ def test_fused_indexer_q_rope_quant_matches_unfused(
# by patching the dispatcher's has_cutedsl() binding to return False.
cutedsl_patch = (
mock.patch(
"vllm.v1.attention.ops.deepseek_v4_ops.fused_indexer_q.has_cutedsl",
"vllm.models.deepseek_v4.common.ops.fused_indexer_q.has_cutedsl",
return_value=False,
)
if not use_cutedsl
Expand Down
2 changes: 1 addition & 1 deletion tests/kernels/test_fused_inv_rope_fp8_quant.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
import pytest
import torch

from vllm.v1.attention.ops.deepseek_v4_ops import fused_inv_rope_fp8_quant
from vllm.models.deepseek_v4.common.ops import fused_inv_rope_fp8_quant

# -- Default dimensions matching DeepSeek V3/V4 --------------------------
HEAD_DIM = 512
Expand Down
6 changes: 3 additions & 3 deletions vllm/models/deepseek_v4/attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,16 @@
ReplicatedLinear,
)
from vllm.model_executor.layers.sparse_attn_indexer import SparseAttnIndexer
from vllm.utils.deep_gemm import fp8_einsum
from vllm.utils.torch_utils import direct_register_custom_op
from vllm.v1.attention.ops.deepseek_v4_ops import (
from vllm.models.deepseek_v4.common.ops import (
combine_topk_swa_indices,
compute_global_topk_indices_and_lens,
dequantize_and_gather_k_cache,
fused_indexer_q_rope_quant,
fused_inv_rope_fp8_quant,
fused_q_kv_rmsnorm,
)
from vllm.utils.deep_gemm import fp8_einsum
from vllm.utils.torch_utils import direct_register_custom_op
from vllm.v1.attention.ops.rocm_aiter_mla_sparse import rocm_inv_rope_einsum

if TYPE_CHECKING:
Expand Down
2 changes: 2 additions & 0 deletions vllm/models/deepseek_v4/common/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,9 @@ def dequantize_and_gather_k_cache(
) -> None:
if has_cutedsl():
# Import lazily; otherwise some tests fail due to a CUDA driver init failure.
from .dequant_gather_k_cutedsl import dequantize_and_gather_k_cache_cutedsl
from vllm.models.deepseek_v4.nvidia.ops.dequant_gather_k_cutedsl import (
dequantize_and_gather_k_cache_cutedsl,
)

dequantize_and_gather_k_cache_cutedsl(
out, k_cache, seq_lens, gather_lens, block_table, block_size, offset
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ def fused_indexer_q_rope_quant(
)
if has_cutedsl():
# Import lazily; otherwise some tests fail due to a CUDA driver init failure.
from .fused_indexer_q_cutedsl import (
from vllm.models.deepseek_v4.nvidia.ops.fused_indexer_q_cutedsl import (
fused_indexer_q_rope_quant_mxfp4_cutedsl,
)

Expand Down Expand Up @@ -400,7 +400,7 @@ def fused_indexer_q_rope_quant(
index_q_fp8 = torch.empty_like(index_q, dtype=torch.float8_e4m3fn)
if has_cutedsl():
# Import lazily; otherwise some tests fail due to a CUDA driver init failure.
from .fused_indexer_q_cutedsl import (
from vllm.models.deepseek_v4.nvidia.ops.fused_indexer_q_cutedsl import (
fused_indexer_q_rope_quant_fp8_cutedsl,
)

Expand Down
16 changes: 6 additions & 10 deletions vllm/models/deepseek_v4/compressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,13 @@
from vllm.forward_context import get_forward_context
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (
MergedColumnParallelLinear,
from vllm.model_executor.layers.linear import MergedColumnParallelLinear
from vllm.models.deepseek_v4.common.ops.fused_compress_quant_cache import (
_fused_kv_compress_norm_rope_insert_indexer_attn,
_fused_kv_compress_norm_rope_insert_indexer_mxfp4_attn,
_fused_kv_compress_norm_rope_insert_sparse_attn,
)
from vllm.models.deepseek_v4.common.ops.fused_indexer_q import MXFP4_BLOCK_SIZE
from vllm.platforms import current_platform
from vllm.triton_utils import tl, triton
from vllm.v1.attention.backend import (
Expand All @@ -23,14 +27,6 @@
CommonAttentionMetadata,
MultipleOf,
)
from vllm.v1.attention.ops.deepseek_v4_ops.fused_compress_quant_cache import (
_fused_kv_compress_norm_rope_insert_indexer_attn,
_fused_kv_compress_norm_rope_insert_indexer_mxfp4_attn,
_fused_kv_compress_norm_rope_insert_sparse_attn,
)
from vllm.v1.attention.ops.deepseek_v4_ops.fused_indexer_q import (
MXFP4_BLOCK_SIZE,
)
from vllm.v1.kv_cache_interface import (
KVCacheSpec,
MLAAttentionSpec,
Expand Down
8 changes: 8 additions & 0 deletions vllm/models/deepseek_v4/nvidia/ops/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""NVIDIA-only (cutedsl/cutlass) kernels for DeepSeek V4.

These modules import ``cutlass``/``cutedsl`` at module top level, so they must
not be imported where those packages are unavailable (e.g. on non-CUDA
platforms). Callers should gate on
``vllm.utils.import_utils.has_cutedsl()`` before importing from here.
"""
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from cutlass.cute.nvgpu import cpasync
from quack.compile_utils import make_fake_tensor

from vllm.v1.attention.ops.deepseek_v4_ops.cutedsl_utils import (
from vllm.models.deepseek_v4.nvidia.ops.cutedsl_utils import (
_bf16x2_mul,
_fp8x4_to_bf16x4,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from cutlass import BFloat16, Float32, Int64, Uint8, Uint32, const_expr
from quack.compile_utils import make_fake_tensor

from vllm.v1.attention.ops.deepseek_v4_ops.cutedsl_utils import (
from vllm.models.deepseek_v4.nvidia.ops.cutedsl_utils import (
_bf16x2_abs,
_bf16x2_max,
_bf16x2_to_fp32,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import torch

from vllm.forward_context import get_forward_context
from vllm.models.deepseek_v4.common.ops import dequantize_and_gather_k_cache
from vllm.triton_utils import tl, triton
from vllm.v1.attention.backend import (
AttentionLayer,
Expand All @@ -22,7 +23,6 @@
DeepseekSparseSWAMetadata,
DeepseekSparseSWAMetadataBuilder,
)
from vllm.v1.attention.ops.deepseek_v4_ops import dequantize_and_gather_k_cache
from vllm.v1.attention.ops.rocm_aiter_mla_sparse import (
build_ragged_indices_from_dense,
rocm_sparse_attn_decode,
Expand Down
Loading