Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 112 additions & 33 deletions vllm/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import sys
import tempfile
import uuid
import warnings
from collections.abc import Callable
from typing import TYPE_CHECKING, Any, Literal

Expand Down Expand Up @@ -342,6 +343,27 @@ def use_mega_aot_artifact():
return os.environ.get("VLLM_USE_MEGA_AOT_ARTIFACT", default_value) == "1"


def deprecated_env(
    env_name: str,
    removal_version: str,
    replacement: str,
    getter: Callable[[], Any],
) -> Callable[[], Any]:
    """Build a getter that flags ``env_name`` as deprecated whenever it is set.

    Args:
        env_name: Name of the environment variable being deprecated.
        removal_version: Version string quoted in the warning as the
            release in which the variable will be removed.
        replacement: Migration hint appended verbatim to the warning text.
        getter: Zero-argument callable that produces the variable's value.

    Returns:
        A zero-argument callable that delegates to ``getter``. Each time it
        is called while ``env_name`` is present in ``os.environ`` (presence
        is what counts, so an empty value still qualifies), it first issues
        a ``FutureWarning``.
    """
    # All pieces of the message are fixed once the wrapper is built, so
    # render the text a single time instead of on every read.
    notice = (
        f"{env_name} is deprecated and will be removed in "
        f"{removal_version}. {replacement}"
    )

    def _warn_then_get() -> Any:
        # The environment is consulted on each call, so a variable that is
        # set or unset after wrapping is still handled correctly.
        if env_name in os.environ:
            # stacklevel=2 attributes the warning to whoever invoked this
            # closure rather than to the closure itself.
            warnings.warn(notice, FutureWarning, stacklevel=2)
        return getter()

    return _warn_then_get


def env_with_choices(
env_name: str,
default: str | None,
Expand Down Expand Up @@ -1294,8 +1316,13 @@ def _resolve_rust_frontend_path() -> str | None:
os.environ.get("VLLM_MARLIN_USE_ATOMIC_ADD", "0") == "1"
),
# Whether to use marlin kernel in mxfp4 quantization method
"VLLM_MXFP4_USE_MARLIN": lambda: maybe_convert_bool(
os.environ.get("VLLM_MXFP4_USE_MARLIN", None)
# Deprecated: use --moe-backend marlin (MoE) or --linear-backend marlin
# (linear) instead.
"VLLM_MXFP4_USE_MARLIN": deprecated_env(
"VLLM_MXFP4_USE_MARLIN",
"v0.23",
"Use --moe-backend marlin or --linear-backend marlin.",
lambda: maybe_convert_bool(os.environ.get("VLLM_MXFP4_USE_MARLIN", None)),
),
# The activation dtype for marlin kernel
"VLLM_MARLIN_INPUT_DTYPE": env_with_choices(
Expand Down Expand Up @@ -1390,37 +1417,66 @@ def _resolve_rust_frontend_path() -> str | None:
int(os.getenv("VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER", "1"))
),
# Allow use of FlashInfer BF16 MoE kernels for fused moe ops.
"VLLM_USE_FLASHINFER_MOE_FP16": lambda: bool(
int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP16", "0"))
# Deprecated: use --moe-backend to select a kernel explicitly.
"VLLM_USE_FLASHINFER_MOE_FP16": deprecated_env(
"VLLM_USE_FLASHINFER_MOE_FP16",
"v0.23",
"Use --moe-backend (e.g. flashinfer_trtllm, flashinfer_cutlass).",
lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP16", "0"))),
),
# Allow use of FlashInfer FP8 MoE kernels for fused moe ops.
"VLLM_USE_FLASHINFER_MOE_FP8": lambda: bool(
int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))
# Deprecated: use --moe-backend to select a kernel explicitly.
"VLLM_USE_FLASHINFER_MOE_FP8": deprecated_env(
"VLLM_USE_FLASHINFER_MOE_FP8",
"v0.23",
"Use --moe-backend (e.g. flashinfer_trtllm, flashinfer_cutlass).",
lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))),
),
# Allow use of FlashInfer NVFP4 MoE kernels for fused moe ops.
"VLLM_USE_FLASHINFER_MOE_FP4": lambda: bool(
int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP4", "0"))
# Deprecated: use --moe-backend to select a kernel explicitly.
"VLLM_USE_FLASHINFER_MOE_FP4": deprecated_env(
"VLLM_USE_FLASHINFER_MOE_FP4",
"v0.23",
"Use --moe-backend (e.g. flashinfer_trtllm, flashinfer_cutlass, "
"flashinfer_cutedsl).",
lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP4", "0"))),
),
# Allow use of FlashInfer MxInt4 MoE kernels for fused moe ops.
"VLLM_USE_FLASHINFER_MOE_INT4": lambda: bool(
int(os.getenv("VLLM_USE_FLASHINFER_MOE_INT4", "0"))
),
# If set to 1, use the FlashInfer
# MXFP8 (activation) x MXFP4 (weight) MoE backend.
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8": lambda: bool(
int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0"))
# Deprecated: use --moe-backend flashinfer_trtllm combined with
# --quantization_config.moe.activation mxfp8.
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8": deprecated_env(
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8",
"v0.23",
"Use --moe-backend flashinfer_trtllm with "
"--quantization_config.moe.activation mxfp8.",
lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0"))),
),
# If set to 1, use the FlashInfer CUTLASS backend for
# MXFP8 (activation) x MXFP4 (weight) MoE.
# This is separate from the TRTLLMGEN path controlled by
# VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8.
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS": lambda: bool(
int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "0"))
# Deprecated: use --moe-backend flashinfer_cutlass combined with
# --quantization_config.moe.activation mxfp8.
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS": deprecated_env(
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS",
"v0.23",
"Use --moe-backend flashinfer_cutlass with "
"--quantization_config.moe.activation mxfp8.",
lambda: bool(
int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "0"))
),
),
# If set to 1, use the FlashInfer
# BF16 (activation) x MXFP4 (weight) MoE backend.
"VLLM_USE_FLASHINFER_MOE_MXFP4_BF16": lambda: bool(
int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0"))
# Deprecated: use --moe-backend to select a kernel explicitly.
"VLLM_USE_FLASHINFER_MOE_MXFP4_BF16": deprecated_env(
"VLLM_USE_FLASHINFER_MOE_MXFP4_BF16",
"v0.23",
"Use --moe-backend (e.g. flashinfer_trtllm, flashinfer_cutlass).",
lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0"))),
),
# Control the cache size used by the xgrammar compiler. The default
# of 512 MB should be enough for roughly 1000 JSON schemas.
Expand Down Expand Up @@ -1480,10 +1536,17 @@ def _resolve_rust_frontend_path() -> str | None:
# Uses CUTLASS kernels optimized for high-throughput batch inference.
# - "latency":
# Uses TensorRT-LLM kernels optimized for low-latency inference.
"VLLM_FLASHINFER_MOE_BACKEND": env_with_choices(
# Deprecated: pass --moe-backend flashinfer_{trtllm,cutlass,cutedsl} directly.
"VLLM_FLASHINFER_MOE_BACKEND": deprecated_env(
"VLLM_FLASHINFER_MOE_BACKEND",
"latency",
["throughput", "latency", "masked_gemm"],
"v0.23",
"Use --moe-backend flashinfer_trtllm, flashinfer_cutlass, or "
"flashinfer_cutedsl.",
env_with_choices(
"VLLM_FLASHINFER_MOE_BACKEND",
"latency",
["throughput", "latency", "masked_gemm"],
),
),
# Override the directory for the FlashInfer autotune config cache.
"VLLM_FLASHINFER_AUTOTUNE_CACHE_DIR": lambda: os.getenv(
Expand Down Expand Up @@ -1565,8 +1628,12 @@ def _resolve_rust_frontend_path() -> str | None:
# Controls whether or not emulations are used for NVFP4
# generations on machines < 100 for compressed-tensors
# models
"VLLM_USE_NVFP4_CT_EMULATIONS": lambda: bool(
int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0"))
# Deprecated: use --linear-backend emulation instead.
"VLLM_USE_NVFP4_CT_EMULATIONS": deprecated_env(
"VLLM_USE_NVFP4_CT_EMULATIONS",
"v0.23",
"Use --linear-backend emulation.",
lambda: bool(int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0"))),
),
# Controls the read mode for the Mori-IO connector
"VLLM_MORIIO_CONNECTOR_READ_MODE": lambda: (
Expand Down Expand Up @@ -1601,18 +1668,24 @@ def _resolve_rust_frontend_path() -> str | None:
# This is only meant for research purposes to run on devices where NVFP4
# GEMM kernels are not available.
# - <none>: automatically pick an available backend
"VLLM_NVFP4_GEMM_BACKEND": env_with_choices(
# Deprecated: use --linear-backend instead.
"VLLM_NVFP4_GEMM_BACKEND": deprecated_env(
"VLLM_NVFP4_GEMM_BACKEND",
None,
[
"flashinfer-b12x",
"flashinfer-cudnn",
"flashinfer-trtllm",
"flashinfer-cutlass",
"cutlass",
"marlin",
"emulation",
],
"v0.23",
"Use --linear-backend.",
env_with_choices(
"VLLM_NVFP4_GEMM_BACKEND",
None,
[
"flashinfer-b12x",
"flashinfer-cudnn",
"flashinfer-trtllm",
"flashinfer-cutlass",
"cutlass",
"marlin",
"emulation",
],
),
),
# Controls garbage collection during CUDA graph capture.
# If set to 0 (default), enables GC freezing to speed up capture time.
Expand Down Expand Up @@ -1763,7 +1836,13 @@ def _resolve_rust_frontend_path() -> str | None:
# NCCL header path
"VLLM_NCCL_INCLUDE_PATH": lambda: os.environ.get("VLLM_NCCL_INCLUDE_PATH", None),
# Flag to enable FBGemm kernels on model execution
"VLLM_USE_FBGEMM": lambda: bool(int(os.getenv("VLLM_USE_FBGEMM", "0"))),
# Deprecated: use --linear-backend fbgemm instead.
"VLLM_USE_FBGEMM": deprecated_env(
"VLLM_USE_FBGEMM",
"v0.23",
"Use --linear-backend fbgemm.",
lambda: bool(int(os.getenv("VLLM_USE_FBGEMM", "0"))),
),
# GC debug config
# - VLLM_GC_DEBUG=0: disable GC debugger
# - VLLM_GC_DEBUG=1: enable GC debugger with gc.collect elapsed times
Expand Down
22 changes: 1 addition & 21 deletions vllm/model_executor/kernels/linear/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import stability.
"""

import warnings
from typing import TypeVar

import torch
Expand Down Expand Up @@ -851,31 +850,12 @@ def init_nvfp4_linear_kernel() -> NvFp4LinearKernel:
force_kernel = EmulationNvFp4LinearKernel
elif linear_backend == "auto":
# Deprecated env-var overrides — only honoured when --linear-backend
# is "auto". Will be removed in v0.21; users should migrate to
# --linear-backend.
# is "auto". Deprecation warnings are emitted from vllm/envs.py.
if envs.VLLM_USE_FBGEMM:
warnings.warn(
"VLLM_USE_FBGEMM is deprecated and will be removed in "
"v0.21. Use --linear-backend fbgemm instead.",
DeprecationWarning,
stacklevel=2,
)
force_kernel = FbgemmNvFp4LinearKernel
elif envs.VLLM_USE_NVFP4_CT_EMULATIONS:
warnings.warn(
"VLLM_USE_NVFP4_CT_EMULATIONS is deprecated and will be "
"removed in v0.21. Use --linear-backend emulation instead.",
DeprecationWarning,
stacklevel=2,
)
force_kernel = EmulationNvFp4LinearKernel
elif envs.VLLM_NVFP4_GEMM_BACKEND is not None:
warnings.warn(
"VLLM_NVFP4_GEMM_BACKEND is deprecated and will be "
"removed in v0.21. Use --linear-backend instead.",
DeprecationWarning,
stacklevel=2,
)
backend_name = envs.VLLM_NVFP4_GEMM_BACKEND
force_kernel = _NVFP4_BACKEND_TO_KERNEL.get(backend_name)
if force_kernel is None:
Expand Down
Loading