Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions vllm/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,6 @@
VLLM_TPU_USING_PATHWAYS: bool = False
VLLM_USE_DEEP_GEMM: bool = True
VLLM_USE_DEEP_GEMM_E8M0: bool = True
VLLM_USE_DEEP_GEMM_E8M0_HOPPER: bool = False
VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False
VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True
VLLM_USE_FLASHINFER_MOE_FP16: bool = False
Expand Down Expand Up @@ -1061,11 +1060,6 @@ def get_vllm_port() -> Optional[int]:
"VLLM_USE_DEEP_GEMM_E8M0": lambda: bool(
int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0", "1"))
),
# TODO(wentao): unify the two E8M0 flags after verifying the correctness.
# Whether to use E8M0 scaling when DeepGEMM is used on Hopper GPUs.
"VLLM_USE_DEEP_GEMM_E8M0_HOPPER": lambda: bool(
int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0_HOPPER", "0"))
),
# DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm
# JIT all the required kernels before model execution so there is no
# JIT'ing in the hot-path. However, this warmup increases the engine
Expand Down Expand Up @@ -1440,7 +1434,6 @@ def compute_hash() -> str:
"VLLM_DISABLED_KERNELS",
"VLLM_USE_DEEP_GEMM",
"VLLM_USE_DEEP_GEMM_E8M0",
"VLLM_USE_DEEP_GEMM_E8M0_HOPPER",
"VLLM_USE_TRTLLM_FP4_GEMM",
"VLLM_USE_FUSED_MOE_GROUPED_TOPK",
"VLLM_USE_FLASHINFER_MOE_FP16",
Expand Down
14 changes: 7 additions & 7 deletions vllm/transformers_utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -628,25 +628,25 @@ def get_config(

if quantization_config is not None:
config.quantization_config = quantization_config
# auto-enable DeepGEMM UE8M0 on Hopper if model config requests it
# auto-enable DeepGEMM UE8M0 if model config requests it
scale_fmt = quantization_config.get("scale_fmt", None)
if scale_fmt in ("ue8m0",):
if not envs.is_set("VLLM_USE_DEEP_GEMM_E8M0_HOPPER"):
os.environ["VLLM_USE_DEEP_GEMM_E8M0_HOPPER"] = "1"
if not envs.is_set("VLLM_USE_DEEP_GEMM_E8M0"):
os.environ["VLLM_USE_DEEP_GEMM_E8M0"] = "1"
logger.info_once(
(
"Detected quantization_config.scale_fmt=%s; "
"enabling Hopper UE8M0."
"enabling UE8M0 for DeepGEMM."
),
scale_fmt,
)
elif not envs.VLLM_USE_DEEP_GEMM_E8M0_HOPPER:
elif not envs.VLLM_USE_DEEP_GEMM_E8M0:
logger.warning_once(
(
"Model config requests UE8M0 "
"(quantization_config.scale_fmt=%s), but "
"VLLM_USE_DEEP_GEMM_E8M0_HOPPER=0 is set; "
"Hopper UE8M0 disabled."
"VLLM_USE_DEEP_GEMM_E8M0=0 is set; "
"UE8M0 for DeepGEMM disabled."
),
scale_fmt,
)
Expand Down
18 changes: 3 additions & 15 deletions vllm/utils/deep_gemm.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,7 @@ def is_deep_gemm_supported() -> bool:
current_platform.is_device_capability(90)
or current_platform.is_device_capability(100)
)
return (
envs.VLLM_USE_DEEP_GEMM
and has_deep_gemm()
and is_supported_arch
and not envs.VLLM_USE_FLASHINFER_MOE_FP8
)
return envs.VLLM_USE_DEEP_GEMM and has_deep_gemm() and is_supported_arch


@functools.cache
Expand All @@ -58,15 +53,8 @@ def is_deep_gemm_e8m0_used() -> bool:
logger.info_once("DeepGEMM E8M0 disabled: FlashInfer MOE is enabled.")
return False

if current_platform.is_device_capability(100) and envs.VLLM_USE_DEEP_GEMM_E8M0:
logger.info_once("DeepGEMM E8M0 enabled on Blackwell GPU.")
return True

if (
current_platform.is_device_capability(90)
and envs.VLLM_USE_DEEP_GEMM_E8M0_HOPPER
):
logger.info_once("DeepGEMM E8M0 enabled on Hopper GPU.")
if envs.VLLM_USE_DEEP_GEMM_E8M0:
logger.info_once("DeepGEMM E8M0 enabled on current platform.")
return True

logger.info_once("DeepGEMM E8M0 disabled on current configuration.")
Expand Down