diff --git a/vllm/models/deepseek_v4/common/ops/cache_utils.py b/vllm/models/deepseek_v4/common/ops/cache_utils.py index efb936be6c07..ac66751e3111 100644 --- a/vllm/models/deepseek_v4/common/ops/cache_utils.py +++ b/vllm/models/deepseek_v4/common/ops/cache_utils.py @@ -366,7 +366,7 @@ def dequantize_and_gather_k_cache( ) -> None: if has_cutedsl(): # lazily import, otherwise some tests fail due to CUDA driver init failure. - from vllm.models.deepseek_v4.nvidia.ops import ( + from vllm.models.deepseek_v4.nvidia.ops.dequant_gather_k_cutedsl import ( dequantize_and_gather_k_cache_cutedsl, ) diff --git a/vllm/models/deepseek_v4/common/ops/fused_indexer_q.py b/vllm/models/deepseek_v4/common/ops/fused_indexer_q.py index e88fe1529cd6..d5aaf10feba4 100644 --- a/vllm/models/deepseek_v4/common/ops/fused_indexer_q.py +++ b/vllm/models/deepseek_v4/common/ops/fused_indexer_q.py @@ -346,7 +346,7 @@ def fused_indexer_q_rope_quant( ) if has_cutedsl(): # lazily import, otherwise some tests fail due to CUDA driver init failure. - from vllm.models.deepseek_v4.nvidia.ops import ( + from vllm.models.deepseek_v4.nvidia.ops.fused_indexer_q_cutedsl import ( fused_indexer_q_rope_quant_mxfp4_cutedsl, ) @@ -400,7 +400,7 @@ def fused_indexer_q_rope_quant( index_q_fp8 = torch.empty_like(index_q, dtype=torch.float8_e4m3fn) if has_cutedsl(): # lazily import, otherwise some tests fail due to CUDA driver init failure. - from vllm.models.deepseek_v4.nvidia.ops import ( + from vllm.models.deepseek_v4.nvidia.ops.fused_indexer_q_cutedsl import ( fused_indexer_q_rope_quant_fp8_cutedsl, ) diff --git a/vllm/models/deepseek_v4/compressor.py b/vllm/models/deepseek_v4/compressor.py index 3234faa5eb05..f36dc8f17629 100644 --- a/vllm/models/deepseek_v4/compressor.py +++ b/vllm/models/deepseek_v4/compressor.py @@ -338,7 +338,9 @@ def forward( if current_platform.is_cuda(): # NVIDIA GPUs. if self.head_dim == 512: - from .nvidia.ops import compress_norm_rope_store_cutedsl + from .nvidia.ops.sparse_attn_compress_cutedsl import ( + compress_norm_rope_store_cutedsl, + ) # Main compressor path. # Use a cutedsl kernel for better performance. diff --git a/vllm/models/deepseek_v4/nvidia/model.py b/vllm/models/deepseek_v4/nvidia/model.py index 974593a8d390..25f0a730fdbd 100644 --- a/vllm/models/deepseek_v4/nvidia/model.py +++ b/vllm/models/deepseek_v4/nvidia/model.py @@ -59,7 +59,7 @@ DeepseekV4MLAModules, DeepseekV4MultiHeadLatentAttentionWrapper, ) -from vllm.models.deepseek_v4.nvidia.ops import prepare_megamoe_inputs +from vllm.models.deepseek_v4.nvidia.ops.prepare_megamoe import prepare_megamoe_inputs from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.utils.torch_utils import direct_register_custom_op diff --git a/vllm/models/deepseek_v4/nvidia/ops/__init__.py b/vllm/models/deepseek_v4/nvidia/ops/__init__.py index dca25345ea6f..20752bc15489 100644 --- a/vllm/models/deepseek_v4/nvidia/ops/__init__.py +++ b/vllm/models/deepseek_v4/nvidia/ops/__init__.py @@ -5,20 +5,9 @@ These modules import ``cutlass``/``cutedsl`` at module top level, so they must not be imported on non-CUDA platforms. Callers should gate on ``vllm.utils.import_utils.has_cutedsl()`` before importing from here. -""" - -from .dequant_gather_k_cutedsl import dequantize_and_gather_k_cache_cutedsl -from .fused_indexer_q_cutedsl import ( - fused_indexer_q_rope_quant_fp8_cutedsl, - fused_indexer_q_rope_quant_mxfp4_cutedsl, -) -from .prepare_megamoe import prepare_megamoe_inputs -from .sparse_attn_compress_cutedsl import compress_norm_rope_store_cutedsl -__all__ = [ - "compress_norm_rope_store_cutedsl", - "dequantize_and_gather_k_cache_cutedsl", - "fused_indexer_q_rope_quant_fp8_cutedsl", - "fused_indexer_q_rope_quant_mxfp4_cutedsl", - "prepare_megamoe_inputs", -] +This ``__init__`` deliberately imports nothing: re-exporting the cutedsl +modules here would eagerly ``import cutlass`` (initializing the CUDA driver) for +anyone who imports ``vllm.models.deepseek_v4``, breaking forked subprocesses. +Import the leaf modules directly under a ``has_cutedsl()``/``is_cuda()`` gate. +"""