vllm-project · youkaichao · Oct 8, 2025 · Oct 3, 2025 · Oct 3, 2025 · Oct 3, 2025
diff --git a/vllm/envs.py b/vllm/envs.py
@@ -143,7 +143,6 @@
     VLLM_TPU_USING_PATHWAYS: bool = False
     VLLM_USE_DEEP_GEMM: bool = True
     VLLM_USE_DEEP_GEMM_E8M0: bool = True
-    VLLM_USE_DEEP_GEMM_E8M0_HOPPER: bool = False
     VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False
     VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True
     VLLM_USE_FLASHINFER_MOE_FP16: bool = False
@@ -1061,11 +1060,6 @@ def get_vllm_port() -> Optional[int]:
     "VLLM_USE_DEEP_GEMM_E8M0": lambda: bool(
         int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0", "1"))
     ),
-    # TODO(wentao): unify the two E8M0 flags after verifying the correctness.
-    # Whether to use E8M0 scaling when DeepGEMM is used on Hopper GPUs.
-    "VLLM_USE_DEEP_GEMM_E8M0_HOPPER": lambda: bool(
-        int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0_HOPPER", "0"))
-    ),
     # DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm
     # JIT all the required kernels before model execution so there is no
     # JIT'ing in the hot-path. However, this warmup increases the engine
@@ -1440,7 +1434,6 @@ def compute_hash() -> str:
         "VLLM_DISABLED_KERNELS",
         "VLLM_USE_DEEP_GEMM",
         "VLLM_USE_DEEP_GEMM_E8M0",
-        "VLLM_USE_DEEP_GEMM_E8M0_HOPPER",
         "VLLM_USE_TRTLLM_FP4_GEMM",
         "VLLM_USE_FUSED_MOE_GROUPED_TOPK",
         "VLLM_USE_FLASHINFER_MOE_FP16",

diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
@@ -628,25 +628,25 @@ def get_config(
 
     if quantization_config is not None:
         config.quantization_config = quantization_config
-        # auto-enable DeepGEMM UE8M0 on Hopper if model config requests it
+        # auto-enable DeepGEMM UE8M0 if model config requests it
         scale_fmt = quantization_config.get("scale_fmt", None)
         if scale_fmt in ("ue8m0",):
-            if not envs.is_set("VLLM_USE_DEEP_GEMM_E8M0_HOPPER"):
-                os.environ["VLLM_USE_DEEP_GEMM_E8M0_HOPPER"] = "1"
+            if not envs.is_set("VLLM_USE_DEEP_GEMM_E8M0"):
+                os.environ["VLLM_USE_DEEP_GEMM_E8M0"] = "1"
                 logger.info_once(
                     (
                         "Detected quantization_config.scale_fmt=%s; "
-                        "enabling Hopper UE8M0."
+                        "enabling UE8M0 for DeepGEMM."
                     ),
                     scale_fmt,
                 )
-            elif not envs.VLLM_USE_DEEP_GEMM_E8M0_HOPPER:
+            elif not envs.VLLM_USE_DEEP_GEMM_E8M0:
                 logger.warning_once(
                     (
                         "Model config requests UE8M0 "
                         "(quantization_config.scale_fmt=%s), but "
-                        "VLLM_USE_DEEP_GEMM_E8M0_HOPPER=0 is set; "
-                        "Hopper UE8M0 disabled."
+                        "VLLM_USE_DEEP_GEMM_E8M0=0 is set; "
+                        "UE8M0 for DeepGEMM disabled."
                     ),
                     scale_fmt,
                 )

diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py
@@ -29,12 +29,7 @@ def is_deep_gemm_supported() -> bool:
         current_platform.is_device_capability(90)
         or current_platform.is_device_capability(100)
     )
-    return (
-        envs.VLLM_USE_DEEP_GEMM
-        and has_deep_gemm()
-        and is_supported_arch
-        and not envs.VLLM_USE_FLASHINFER_MOE_FP8
-    )
+    return envs.VLLM_USE_DEEP_GEMM and has_deep_gemm() and is_supported_arch
 
 
 @functools.cache
@@ -58,15 +53,8 @@ def is_deep_gemm_e8m0_used() -> bool:
         logger.info_once("DeepGEMM E8M0 disabled: FlashInfer MOE is enabled.")
         return False
 
-    if current_platform.is_device_capability(100) and envs.VLLM_USE_DEEP_GEMM_E8M0:
-        logger.info_once("DeepGEMM E8M0 enabled on Blackwell GPU.")
-        return True
-
-    if (
-        current_platform.is_device_capability(90)
-        and envs.VLLM_USE_DEEP_GEMM_E8M0_HOPPER
-    ):
-        logger.info_once("DeepGEMM E8M0 enabled on Hopper GPU.")
+    if envs.VLLM_USE_DEEP_GEMM_E8M0:
+        logger.info_once("DeepGEMM E8M0 enabled on current platform.")
         return True
 
     logger.info_once("DeepGEMM E8M0 disabled on current configuration.")