vllm-project · WoosukKwon · May 19, 2026
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -154,7 +154,6 @@ mkdocs.yaml @hmellor
 
 # DeepseekV4-specific files
 /vllm/models/deepseek_v4 @zyongye
-/vllm/v1/attention/ops/deepseek_v4_ops @zyongye
 /vllm/model_executor/layers/sparse_attn_indexer.py @zyongye
 
 # Mistral-specific files

@@ -13,8 +13,8 @@
 import pytest
 import torch
 
+from vllm.models.deepseek_v4.common.ops import fused_q_kv_rmsnorm
 from vllm.platforms import current_platform
-from vllm.v1.attention.ops.deepseek_v4_ops import fused_q_kv_rmsnorm
 
 pytestmark = pytest.mark.skipif(
     not current_platform.is_cuda_alike(),

@@ -17,11 +17,11 @@
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.v1.attention.ops.deepseek_v4_ops import (
+from vllm.models.deepseek_v4.common.ops import (
     dequantize_and_gather_k_cache,
     quantize_and_insert_k_cache,
 )
-from vllm.v1.attention.ops.deepseek_v4_ops.fused_compress_quant_cache import (
+from vllm.models.deepseek_v4.common.ops.fused_compress_quant_cache import (
     _fused_kv_compress_norm_rope_insert_indexer_attn,
     _fused_kv_compress_norm_rope_insert_indexer_mxfp4_attn,
 )

@@ -19,7 +19,7 @@
 import pytest
 import torch
 
-from vllm.v1.attention.ops.deepseek_v4_ops import (
+from vllm.models.deepseek_v4.common.ops import (
     dequantize_and_gather_k_cache,
     quantize_and_insert_k_cache,
 )

@@ -23,10 +23,8 @@
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     per_token_group_quant_fp8,
 )
+from vllm.models.deepseek_v4.common.ops import fused_indexer_q_rope_quant
 from vllm.utils.import_utils import has_cutedsl
-from vllm.v1.attention.ops.deepseek_v4_ops.fused_indexer_q import (
-    fused_indexer_q_rope_quant,
-)
 
 HEAD_DIM = 128
 ROPE_DIM = 64
@@ -156,7 +154,7 @@ def test_fused_indexer_q_rope_quant_matches_unfused(
     # by patching the dispatcher's has_cutedsl() binding to return False.
     cutedsl_patch = (
         mock.patch(
-            "vllm.v1.attention.ops.deepseek_v4_ops.fused_indexer_q.has_cutedsl",
+            "vllm.models.deepseek_v4.common.ops.fused_indexer_q.has_cutedsl",
             return_value=False,
         )
         if not use_cutedsl

@@ -21,7 +21,7 @@
 import pytest
 import torch
 
-from vllm.v1.attention.ops.deepseek_v4_ops import fused_inv_rope_fp8_quant
+from vllm.models.deepseek_v4.common.ops import fused_inv_rope_fp8_quant
 
 # -- Default dimensions matching DeepSeek V3/V4 --------------------------
 HEAD_DIM = 512

@@ -19,16 +19,16 @@
     ReplicatedLinear,
 )
 from vllm.model_executor.layers.sparse_attn_indexer import SparseAttnIndexer
-from vllm.utils.deep_gemm import fp8_einsum
-from vllm.utils.torch_utils import direct_register_custom_op
-from vllm.v1.attention.ops.deepseek_v4_ops import (
+from vllm.models.deepseek_v4.common.ops import (
     combine_topk_swa_indices,
     compute_global_topk_indices_and_lens,
     dequantize_and_gather_k_cache,
     fused_indexer_q_rope_quant,
     fused_inv_rope_fp8_quant,
     fused_q_kv_rmsnorm,
 )
+from vllm.utils.deep_gemm import fp8_einsum
+from vllm.utils.torch_utils import direct_register_custom_op
 from vllm.v1.attention.ops.rocm_aiter_mla_sparse import rocm_inv_rope_einsum
 
 if TYPE_CHECKING:

@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
@@ -366,7 +366,9 @@ def dequantize_and_gather_k_cache(
 ) -> None:
     if has_cutedsl():
         # lazily import, otherwise some tests fail due to CUDA driver init failure.
-        from .dequant_gather_k_cutedsl import dequantize_and_gather_k_cache_cutedsl
+        from vllm.models.deepseek_v4.nvidia.ops.dequant_gather_k_cutedsl import (
+            dequantize_and_gather_k_cache_cutedsl,
+        )
 
         dequantize_and_gather_k_cache_cutedsl(
             out, k_cache, seq_lens, gather_lens, block_table, block_size, offset

@@ -346,7 +346,7 @@ def fused_indexer_q_rope_quant(
         )
         if has_cutedsl():
             # lazily import, otherwise some tests fail due to CUDA driver init failure.
-            from .fused_indexer_q_cutedsl import (
+            from vllm.models.deepseek_v4.nvidia.ops.fused_indexer_q_cutedsl import (
                 fused_indexer_q_rope_quant_mxfp4_cutedsl,
             )
 
@@ -400,7 +400,7 @@ def fused_indexer_q_rope_quant(
     index_q_fp8 = torch.empty_like(index_q, dtype=torch.float8_e4m3fn)
     if has_cutedsl():
         # lazily import, otherwise some tests fail due to CUDA driver init failure.
-        from .fused_indexer_q_cutedsl import (
+        from vllm.models.deepseek_v4.nvidia.ops.fused_indexer_q_cutedsl import (
             fused_indexer_q_rope_quant_fp8_cutedsl,
         )
 

@@ -11,9 +11,13 @@
 from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (
-    MergedColumnParallelLinear,
+from vllm.model_executor.layers.linear import MergedColumnParallelLinear
+from vllm.models.deepseek_v4.common.ops.fused_compress_quant_cache import (
+    _fused_kv_compress_norm_rope_insert_indexer_attn,
+    _fused_kv_compress_norm_rope_insert_indexer_mxfp4_attn,
+    _fused_kv_compress_norm_rope_insert_sparse_attn,
 )
+from vllm.models.deepseek_v4.common.ops.fused_indexer_q import MXFP4_BLOCK_SIZE
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.v1.attention.backend import (
@@ -23,14 +27,6 @@
     CommonAttentionMetadata,
     MultipleOf,
 )
-from vllm.v1.attention.ops.deepseek_v4_ops.fused_compress_quant_cache import (
-    _fused_kv_compress_norm_rope_insert_indexer_attn,
-    _fused_kv_compress_norm_rope_insert_indexer_mxfp4_attn,
-    _fused_kv_compress_norm_rope_insert_sparse_attn,
-)
-from vllm.v1.attention.ops.deepseek_v4_ops.fused_indexer_q import (
-    MXFP4_BLOCK_SIZE,
-)
 from vllm.v1.kv_cache_interface import (
     KVCacheSpec,
     MLAAttentionSpec,

@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""NVIDIA-only (cutedsl/cutlass) kernels for DeepSeek V4.
+
+These modules import ``cutlass``/``cutedsl`` at module top level, so they must
+not be imported on non-CUDA platforms. Callers should gate on
+``vllm.utils.import_utils.has_cutedsl()`` before importing from here.
+"""
@@ -11,7 +11,7 @@
 from cutlass.cute.nvgpu import cpasync
 from quack.compile_utils import make_fake_tensor
 
-from vllm.v1.attention.ops.deepseek_v4_ops.cutedsl_utils import (
+from vllm.models.deepseek_v4.nvidia.ops.cutedsl_utils import (
     _bf16x2_mul,
     _fp8x4_to_bf16x4,
 )

@@ -9,7 +9,7 @@
 from cutlass import BFloat16, Float32, Int64, Uint8, Uint32, const_expr
 from quack.compile_utils import make_fake_tensor
 
-from vllm.v1.attention.ops.deepseek_v4_ops.cutedsl_utils import (
+from vllm.models.deepseek_v4.nvidia.ops.cutedsl_utils import (
     _bf16x2_abs,
     _bf16x2_max,
     _bf16x2_to_fp32,

@@ -7,6 +7,7 @@
 import torch
 
 from vllm.forward_context import get_forward_context
+from vllm.models.deepseek_v4.common.ops import dequantize_and_gather_k_cache
 from vllm.triton_utils import tl, triton
 from vllm.v1.attention.backend import (
     AttentionLayer,
@@ -22,7 +23,6 @@
     DeepseekSparseSWAMetadata,
     DeepseekSparseSWAMetadataBuilder,
 )
-from vllm.v1.attention.ops.deepseek_v4_ops import dequantize_and_gather_k_cache
 from vllm.v1.attention.ops.rocm_aiter_mla_sparse import (
     build_ragged_indices_from_dense,
     rocm_sparse_attn_decode,
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project