From f74c86126b457b1498f676e67260f4b128b8b3b7 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Sat, 2 May 2026 20:27:21 -0400 Subject: [PATCH 1/2] swap Signed-off-by: Kyle Sayers --- vllm/model_executor/layers/deepseek_v4_attention.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/deepseek_v4_attention.py b/vllm/model_executor/layers/deepseek_v4_attention.py index 847c3eee55a8..2c050b1cefef 100644 --- a/vllm/model_executor/layers/deepseek_v4_attention.py +++ b/vllm/model_executor/layers/deepseek_v4_attention.py @@ -352,7 +352,7 @@ def compressor_kv_score() -> torch.Tensor: return torch.mm( hidden_states, compressor.fused_wkv_wgate.weight.T, - out_dtype=torch.float32, + out_dtype=torch.bfloat16, ) aux_fns[0] = compressor_kv_score @@ -369,7 +369,7 @@ def indexer_compressor_kv_score() -> torch.Tensor: return torch.mm( hidden_states, indexer.compressor.fused_wkv_wgate.weight.T, - out_dtype=torch.float32, + out_dtype=torch.bfloat16, ) aux_fns[1] = indexer_weights_proj From a65611946ea549bf94195672aa849b711f2d76db Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Sun, 3 May 2026 00:14:07 -0400 Subject: [PATCH 2/2] remove dtype Signed-off-by: Kyle Sayers --- vllm/model_executor/layers/deepseek_v4_attention.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/model_executor/layers/deepseek_v4_attention.py b/vllm/model_executor/layers/deepseek_v4_attention.py index 2c050b1cefef..759578f6eb84 100644 --- a/vllm/model_executor/layers/deepseek_v4_attention.py +++ b/vllm/model_executor/layers/deepseek_v4_attention.py @@ -352,7 +352,6 @@ def compressor_kv_score() -> torch.Tensor: return torch.mm( hidden_states, compressor.fused_wkv_wgate.weight.T, - out_dtype=torch.bfloat16, ) aux_fns[0] = compressor_kv_score @@ -369,7 +368,6 @@ def indexer_compressor_kv_score() -> torch.Tensor: return torch.mm( hidden_states, indexer.compressor.fused_wkv_wgate.weight.T, - out_dtype=torch.bfloat16, ) aux_fns[1] = indexer_weights_proj