Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion vllm/models/deepseek_v4/common/ops/cache_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,7 @@ def dequantize_and_gather_k_cache(
) -> None:
if has_cutedsl():
# lazily import, otherwise some tests fail due to CUDA driver init failure.
from vllm.models.deepseek_v4.nvidia.ops import (
from vllm.models.deepseek_v4.nvidia.ops.dequant_gather_k_cutedsl import (
dequantize_and_gather_k_cache_cutedsl,
)

Expand Down
4 changes: 2 additions & 2 deletions vllm/models/deepseek_v4/common/ops/fused_indexer_q.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ def fused_indexer_q_rope_quant(
)
if has_cutedsl():
# lazily import, otherwise some tests fail due to CUDA driver init failure.
from vllm.models.deepseek_v4.nvidia.ops import (
from vllm.models.deepseek_v4.nvidia.ops.fused_indexer_q_cutedsl import (
fused_indexer_q_rope_quant_mxfp4_cutedsl,
)

Expand Down Expand Up @@ -400,7 +400,7 @@ def fused_indexer_q_rope_quant(
index_q_fp8 = torch.empty_like(index_q, dtype=torch.float8_e4m3fn)
if has_cutedsl():
# lazily import, otherwise some tests fail due to CUDA driver init failure.
from vllm.models.deepseek_v4.nvidia.ops import (
from vllm.models.deepseek_v4.nvidia.ops.fused_indexer_q_cutedsl import (
fused_indexer_q_rope_quant_fp8_cutedsl,
)

Expand Down
4 changes: 3 additions & 1 deletion vllm/models/deepseek_v4/compressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,9 @@ def forward(
if current_platform.is_cuda():
# NVIDIA GPUs.
if self.head_dim == 512:
from .nvidia.ops import compress_norm_rope_store_cutedsl
from .nvidia.ops.sparse_attn_compress_cutedsl import (
compress_norm_rope_store_cutedsl,
)

# Main compressor path.
# Use a cutedsl kernel for better performance.
Expand Down
2 changes: 1 addition & 1 deletion vllm/models/deepseek_v4/nvidia/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
DeepseekV4MLAModules,
DeepseekV4MultiHeadLatentAttentionWrapper,
)
from vllm.models.deepseek_v4.nvidia.ops import prepare_megamoe_inputs
from vllm.models.deepseek_v4.nvidia.ops.prepare_megamoe import prepare_megamoe_inputs
from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors
from vllm.utils.torch_utils import direct_register_custom_op
Expand Down
21 changes: 5 additions & 16 deletions vllm/models/deepseek_v4/nvidia/ops/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,9 @@
These modules import ``cutlass``/``cutedsl`` at module top level, so they must
not be imported on non-CUDA platforms. Callers should gate on
``vllm.utils.import_utils.has_cutedsl()`` before importing from here.
"""

from .dequant_gather_k_cutedsl import dequantize_and_gather_k_cache_cutedsl
from .fused_indexer_q_cutedsl import (
fused_indexer_q_rope_quant_fp8_cutedsl,
fused_indexer_q_rope_quant_mxfp4_cutedsl,
)
from .prepare_megamoe import prepare_megamoe_inputs
from .sparse_attn_compress_cutedsl import compress_norm_rope_store_cutedsl

__all__ = [
"compress_norm_rope_store_cutedsl",
"dequantize_and_gather_k_cache_cutedsl",
"fused_indexer_q_rope_quant_fp8_cutedsl",
"fused_indexer_q_rope_quant_mxfp4_cutedsl",
"prepare_megamoe_inputs",
]
This ``__init__`` deliberately imports nothing: re-exporting the cutedsl
modules here would eagerly ``import cutlass`` (initializing the CUDA driver) for
anyone who imports ``vllm.models.deepseek_v4``, breaking forked subprocesses.
Import the leaf modules directly under a ``has_cutedsl()``/``is_cuda()`` gate.
"""
Loading