diff --git a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
index 6cb9101a78b1..9a06eedd0f7d 100644
--- a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
@@ -45,6 +45,7 @@ def __init__(
         beta_slow: int = 1,
         mscale: float = 1,
         mscale_all_dim: float = 0,
+        init_cache: bool = True,
     ) -> None:
         self.scaling_factor = scaling_factor
         self.extrapolation_factor = extrapolation_factor
@@ -65,7 +66,13 @@ def __init__(
             and head_size in [64, 128, 256, 512]
         )
         super().__init__(
-            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+            head_size,
+            rotary_dim,
+            max_position_embeddings,
+            base,
+            is_neox_style,
+            dtype,
+            init_cache=init_cache,
         )

     def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
@@ -211,7 +218,9 @@ class DeepseekV4ScalingRotaryEmbedding(DeepseekScalingRotaryEmbedding):
     """

     def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
+        # Avoid computing the cos/sin cache twice; it is rebuilt in fp32 below.
+        kwargs.pop("init_cache", None)
+        super().__init__(*args, **kwargs, init_cache=False)
         cache_fp32 = self._compute_cos_sin_cache()
         self.register_buffer("cos_sin_cache", cache_fp32, persistent=False)

diff --git a/vllm/model_executor/models/deepseek_v4.py b/vllm/model_executor/models/deepseek_v4.py
index d41a8b666d33..baf28d04581a 100644
--- a/vllm/model_executor/models/deepseek_v4.py
+++ b/vllm/model_executor/models/deepseek_v4.py
@@ -1027,7 +1027,6 @@ def __init__(
            max_position=self.max_position_embeddings,
            rope_parameters=rope_parameters,
            is_neox_style=False,
-            dtype=config.torch_dtype,
        )
        self.indexer = None
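
The pattern the diff introduces is easiest to see in isolation: the base class gains an `init_cache` flag so a subclass that wants the cache in a different dtype can suppress the parent's cache build and compute the cache exactly once itself. Below is a minimal sketch of that pattern, assuming simplified class names (`ScalingRope`, `Fp32CacheRope`) and a toy frequency computation in place of the real YaRN-scaled math; it is not the actual vLLM implementation.

```python
# Minimal sketch of the init_cache deferral pattern. The base class builds
# a cos/sin cache in __init__ unless init_cache=False; a subclass that
# needs a different cache dtype opts out, then builds the cache itself.
import torch


class ScalingRope(torch.nn.Module):
    def __init__(self, rotary_dim: int, max_positions: int,
                 dtype: torch.dtype, init_cache: bool = True) -> None:
        super().__init__()
        self.rotary_dim = rotary_dim
        self.max_positions = max_positions
        if init_cache:
            # Default path: build the cache once, in the requested dtype.
            cache = self._compute_cos_sin_cache().to(dtype)
            self.register_buffer("cos_sin_cache", cache, persistent=False)

    def _compute_cos_sin_cache(self) -> torch.Tensor:
        # Toy stand-in for the real YaRN-scaled frequency computation.
        inv_freq = 1.0 / (10000 ** (
            torch.arange(0, self.rotary_dim, 2, dtype=torch.float32)
            / self.rotary_dim))
        t = torch.arange(self.max_positions, dtype=torch.float32)
        freqs = torch.outer(t, inv_freq)
        return torch.cat((freqs.cos(), freqs.sin()), dim=-1)


class Fp32CacheRope(ScalingRope):
    def __init__(self, *args, **kwargs):
        # Suppress the parent's cache build so it is not computed twice;
        # this subclass keeps the cache in fp32 regardless of model dtype.
        kwargs.pop("init_cache", None)
        super().__init__(*args, **kwargs, init_cache=False)
        cache_fp32 = self._compute_cos_sin_cache()
        self.register_buffer("cos_sin_cache", cache_fp32, persistent=False)
```

The `kwargs.pop("init_cache", None)` line mirrors the diff's own guard: callers may pass `init_cache` explicitly, and popping it first prevents a duplicate-keyword `TypeError` when the subclass forces `init_cache=False`.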