vllm-project · LucasWilkinson · Jan 2, 2026 · Jan 1, 2026 · gemini-code-assist · Jan 1, 2026
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
@@ -357,8 +357,11 @@ def forward(
 
         if self.use_output:
             if output_shape is None:
+                # Handle both 2D [num_tokens, hidden] and
+                # 3D [num_tokens, heads, head_dim] query
+                num_tokens = query.shape[0]
                 output_shape = torch.Size(
-                    (*query.shape[:-1], self.num_heads * self.head_size_v)
+                    (num_tokens, self.num_heads * self.head_size_v)
                 )
             output_shape = output_shape if output_shape is not None else query.shape
             output = torch.empty(output_shape, dtype=output_dtype, device=query.device)

@@ -180,7 +180,19 @@ def get_fp8_moe_backend(
             scope="local",
         )
 
-    if envs.VLLM_USE_DEEP_GEMM and moe_use_deep_gemm and block_quant:
+    # Determine if we should use DeepGEMM (top-level enable switch)
+    # - If explicitly set by user, respect their choice
+    # - If the platform does not support DeepGEMM, disable it
+    # This helps avoid warning messages on unsupported platforms.
+    use_deep_gemm = envs.VLLM_USE_DEEP_GEMM
+    if not is_deep_gemm_supported():
+        use_deep_gemm = False
+        logger.info_once(
+            "DeepGEMM is disabled because the platform does not support it.",
+            scope="local",
+        )
-    if not is_deep_gemm_supported():
-        use_deep_gemm = False
-        logger.info_once(
-            "DeepGEMM is disabled because the platform does not support it.",
-            scope="local",
-        )
+    if use_deep_gemm and not is_deep_gemm_supported():
+        use_deep_gemm = False
+        logger.info_once(
+            "DeepGEMM was requested but is disabled because the platform does not support it.",
+            scope="local",
+        )
-    if not is_deep_gemm_supported():
-        use_deep_gemm = False
-        logger.info_once(
-            "DeepGEMM is disabled because the platform does not support it.",
-            scope="local",
-        )
+    if use_deep_gemm and not is_deep_gemm_supported():
+        use_deep_gemm = False
+        logger.info_once(
+            "DeepGEMM was requested but is disabled because the platform does not support it.",
+            scope="local",
+        )
+
+    if use_deep_gemm and moe_use_deep_gemm and block_quant:
         if not has_deep_gemm():
             logger.warning_once(
                 "DeepGEMM backend requested but not available.", scope="local"