From 213d49f4fbece3aa4219f8f34347dd4054cca613 Mon Sep 17 00:00:00 2001
From: Baizhou Zhang <sobereddiezhang@gmail.com>
Date: Fri, 27 Mar 2026 00:59:31 -0700
Subject: [PATCH 1/4] remove fp4 gemm env

---
 docs/advanced_features/server_arguments.md    |  2 +-
 docs/references/environment_variables.md      |  1 -
 python/sglang/srt/environ.py                  |  5 -----
 .../srt/layers/quantization/fp4_utils.py      | 21 -------------------
 python/sglang/srt/server_args.py              |  4 +---
 5 files changed, 2 insertions(+), 31 deletions(-)

diff --git a/docs/advanced_features/server_arguments.md b/docs/advanced_features/server_arguments.md
index 61cfe91e07c6..f3add9ea6997 100644
--- a/docs/advanced_features/server_arguments.md
+++ b/docs/advanced_features/server_arguments.md
@@ -269,7 +269,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s
 | `--nsa-prefill-backend` | Choose the NSA backend for the prefill stage (overrides `--attention-backend` when running DeepSeek NSA-style attention). | `flashmla_sparse` | `flashmla_sparse`, `flashmla_kv`, `flashmla_auto`, `fa3`, `tilelang`, `aiter`, `trtllm` |
 | `--nsa-decode-backend` | Choose the NSA backend for the decode stage when running DeepSeek NSA-style attention. Overrides `--attention-backend` for decoding. | `fa3` | `flashmla_sparse`, `flashmla_kv`, `fa3`, `tilelang`, `aiter`, `trtllm` |
 | `--fp8-gemm-backend` | Choose the runner backend for Blockwise FP8 GEMM operations. Options: 'auto' (default, auto-selects based on hardware), 'deep_gemm' (JIT-compiled; enabled by default on NVIDIA Hopper (SM90) and Blackwell (SM100) when DeepGEMM is installed), 'flashinfer_trtllm' (FlashInfer TRTLLM backend; SM100/SM103 only), 'flashinfer_cutlass' (FlashInfer CUTLASS backend, SM120 only), 'flashinfer_deepgemm' (Hopper SM90 only, uses swapAB optimization for small M dimensions in decoding), 'cutlass' (optimal for Hopper/Blackwell GPUs and high-throughput), 'triton' (fallback, widely compatible), 'aiter' (ROCm only). **NOTE**: This replaces the deprecated environment variables SGLANG_ENABLE_FLASHINFER_FP8_GEMM and SGLANG_SUPPORT_CUTLASS_BLOCK_FP8. | `auto` | `auto`, `deep_gemm`, `flashinfer_trtllm`, `flashinfer_cutlass`, `flashinfer_deepgemm`, `cutlass`, `triton`, `aiter` |
-| `--fp4-gemm-backend` | Choose the runner backend for NVFP4 GEMM operations. Options: 'flashinfer_cutlass' (default), 'auto' (auto-selects between flashinfer_cudnn/flashinfer_cutlass based on CUDA/cuDNN version), 'flashinfer_cudnn' (FlashInfer cuDNN backend, optimal on CUDA 13+ with cuDNN 9.15+), 'flashinfer_trtllm' (FlashInfer TensorRT-LLM backend, requires different weight preparation with shuffling). All backends are from FlashInfer; when FlashInfer is unavailable, sgl-kernel CUTLASS is used as an automatic fallback. **NOTE**: This replaces the deprecated environment variable SGLANG_FLASHINFER_FP4_GEMM_BACKEND. | `flashinfer_cutlass` | `auto`, `flashinfer_cudnn`, `flashinfer_cutlass`, `flashinfer_trtllm` |
+| `--fp4-gemm-backend` | Choose the runner backend for NVFP4 GEMM operations. Options: 'flashinfer_cutlass' (default), 'auto' (auto-selects between flashinfer_cudnn/flashinfer_cutlass based on CUDA/cuDNN version), 'flashinfer_cudnn' (FlashInfer cuDNN backend, optimal on CUDA 13+ with cuDNN 9.15+), 'flashinfer_trtllm' (FlashInfer TensorRT-LLM backend, requires different weight preparation with shuffling). All backends are from FlashInfer; when FlashInfer is unavailable, sgl-kernel CUTLASS is used as an automatic fallback.| `flashinfer_cutlass` | `auto`, `flashinfer_cudnn`, `flashinfer_cutlass`, `flashinfer_trtllm` |
 | `--disable-flashinfer-autotune` | Flashinfer autotune is enabled by default. Set this flag to disable the autotune. | `False` | bool flag (set to enable) |
 
 ## Speculative decoding
diff --git a/docs/references/environment_variables.md b/docs/references/environment_variables.md
index 9d95c4f237aa..ae3051337b49 100644
--- a/docs/references/environment_variables.md
+++ b/docs/references/environment_variables.md
@@ -119,7 +119,6 @@ SGLang supports various environment variables that can be used to configure its
 | `SGLANG_INT4_WEIGHT` | Enable INT4 weight quantization | `false` |
 | `SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2` | Apply per token group quantization kernel with fused silu and mul and masked m | `false` |
 | `SGLANG_FORCE_FP8_MARLIN` | Force using FP8 MARLIN kernels even if other FP8 kernels are available | `false` |
-| `SGLANG_FLASHINFER_FP4_GEMM_BACKEND` (deprecated) | Select backend for `mm_fp4` on Blackwell GPUs. **DEPRECATED**: Please use `--fp4-gemm-backend` instead. | `` |
 | `SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN` | Quantize q_b_proj from BF16 to FP8 when launching DeepSeek NVFP4 checkpoint | `false` |
 | `SGLANG_MOE_NVFP4_DISPATCH` | Use nvfp4 for moe dispatch (on flashinfer_cutlass or flashinfer_cutedsl moe runner backend) | `"false"` |
 | `SGLANG_NVFP4_CKPT_FP8_NEXTN_MOE` | Quantize moe of nextn layer from BF16 to FP8 when launching DeepSeek NVFP4 checkpoint | `false` |
diff --git a/python/sglang/srt/environ.py b/python/sglang/srt/environ.py
index 4dcf0613bd91..52400234882d 100644
--- a/python/sglang/srt/environ.py
+++ b/python/sglang/srt/environ.py
@@ -339,7 +339,6 @@ class Envs:
     SGLANG_IS_FLASHINFER_AVAILABLE = EnvBool(True)
     SGLANG_ENABLE_FLASHINFER_FP8_GEMM = EnvBool(False)
     # Default to the pick from flashinfer
-    SGLANG_FLASHINFER_FP4_GEMM_BACKEND = EnvStr("")
     SGLANG_FLASHINFER_WORKSPACE_SIZE = EnvInt(384 * 1024 * 1024)
     # TODO(mmangkad): Remove this once the FlashInfer unified allreduce-fusion
     # transport issue on GB200/GB300 platforms is fixed and verified resolved.
@@ -594,10 +593,6 @@ def _convert_SGL_to_SGLANG():
     "SGLANG_SUPPORT_CUTLASS_BLOCK_FP8",
     "It will be completely removed in 0.5.7. Please use '--fp8-gemm-backend=cutlass' instead.",
 )
-_warn_deprecated_env_to_cli_flag(
-    "SGLANG_FLASHINFER_FP4_GEMM_BACKEND",
-    "It will be completely removed in 0.5.9. Please use '--fp4-gemm-backend' instead.",
-)
 _warn_deprecated_env_to_cli_flag(
     "SGLANG_SCHEDULER_DECREASE_PREFILL_IDLE",
     "Please use '--enable-prefill-delayer' instead.",
diff --git a/python/sglang/srt/layers/quantization/fp4_utils.py b/python/sglang/srt/layers/quantization/fp4_utils.py
index 3e913e137f02..0a8b39146efb 100644
--- a/python/sglang/srt/layers/quantization/fp4_utils.py
+++ b/python/sglang/srt/layers/quantization/fp4_utils.py
@@ -4,7 +4,6 @@
 from enum import Enum
 from typing import TYPE_CHECKING
 
-from sglang.srt.environ import envs
 from sglang.srt.utils.common import is_sm120_supported
 
 if TYPE_CHECKING:
@@ -56,26 +55,6 @@ def initialize_fp4_gemm_config(server_args: ServerArgs) -> None:
     global FP4_GEMM_RUNNER_BACKEND
 
     backend = server_args.fp4_gemm_runner_backend
-
-    # Handle deprecated env var for backward compatibility
-    # TODO: Remove this in a future version
-    if envs.SGLANG_FLASHINFER_FP4_GEMM_BACKEND.is_set():
-        env_backend = envs.SGLANG_FLASHINFER_FP4_GEMM_BACKEND.get()
-        if backend == "auto":
-            logger.warning(
-                "SGLANG_FLASHINFER_FP4_GEMM_BACKEND is deprecated. "
-                f"Please use '--fp4-gemm-backend={env_backend}' instead."
-            )
-            if not env_backend.startswith("flashinfer_"):
-                env_backend = "flashinfer_" + env_backend
-            backend = env_backend
-        else:
-            logger.warning(
-                f"FP4 GEMM backend set to '{backend}' via --fp4-gemm-backend overrides "
-                "environment variable SGLANG_FLASHINFER_FP4_GEMM_BACKEND. "
-                "Using server argument value."
-            )
-
     if backend == "auto":
         if is_sm120_supported():
             # flashinfer_cutlass produces NaN in dense MLP layers with
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index c770f3d161f4..5395b47874b2 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -4646,9 +4646,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
             "Options: 'auto' (default; selects flashinfer_cudnn on SM120, flashinfer_cutlass otherwise), "
             "'flashinfer_cutlass' (CUTLASS backend), "
             "'flashinfer_cudnn' (FlashInfer cuDNN backend, optimal on CUDA 13+ with cuDNN 9.15+), "
-            "'flashinfer_trtllm' (FlashInfer TensorRT-LLM backend, requires different weight preparation with shuffling). "
-            "NOTE: This replaces the deprecated environment variable "
-            "SGLANG_FLASHINFER_FP4_GEMM_BACKEND.",
+            "'flashinfer_trtllm' (FlashInfer TensorRT-LLM backend, requires different weight preparation with shuffling). ",
         )
         parser.add_argument(
             "--disable-flashinfer-autotune",

From b7b7edd2a5cae2d8aceeb7fef451089256f674bc Mon Sep 17 00:00:00 2001
From: Baizhou Zhang <sobereddiezhang@gmail.com>
Date: Fri, 27 Mar 2026 01:03:05 -0700
Subject: [PATCH 2/4] remove fp8 gemm environ

---
 docs/references/environment_variables.md           |  1 -
 python/sglang/srt/environ.py                       |  6 ------
 python/sglang/srt/layers/quantization/fp8_utils.py | 10 ++--------
 python/sglang/srt/server_args.py                   |  4 +---
 4 files changed, 3 insertions(+), 18 deletions(-)

diff --git a/docs/references/environment_variables.md b/docs/references/environment_variables.md
index ae3051337b49..22d3481b3ffd 100644
--- a/docs/references/environment_variables.md
+++ b/docs/references/environment_variables.md
@@ -122,7 +122,6 @@ SGLang supports various environment variables that can be used to configure its
 | `SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN` | Quantize q_b_proj from BF16 to FP8 when launching DeepSeek NVFP4 checkpoint | `false` |
 | `SGLANG_MOE_NVFP4_DISPATCH` | Use nvfp4 for moe dispatch (on flashinfer_cutlass or flashinfer_cutedsl moe runner backend) | `"false"` |
 | `SGLANG_NVFP4_CKPT_FP8_NEXTN_MOE` | Quantize moe of nextn layer from BF16 to FP8 when launching DeepSeek NVFP4 checkpoint | `false` |
-| `SGLANG_ENABLE_FLASHINFER_FP8_GEMM` (deprecated) | Use flashinfer kernels when running blockwise fp8 GEMM on Blackwell GPUs. **DEPRECATED**: Please use `--fp8-gemm-backend=flashinfer_trtllm` (SM100/SM103) or `--fp8-gemm-backend=flashinfer_cutlass` (SM120/SM121 and newer) instead. | `false` |
 | `SGLANG_SUPPORT_CUTLASS_BLOCK_FP8` (deprecated) | Use Cutlass kernels when running blockwise fp8 GEMM on Hopper or Blackwell GPUs. **DEPRECATED**: Please use `--fp8-gemm-backend=cutlass` instead. | `false` |
 | `SGLANG_QUANT_ALLOW_DOWNCASTING` | Allow weight dtype downcasting during loading (e.g., fp32 → fp16). By default, SGLang rejects this kind of downcasting when using quantization. | `false` |
 | `SGLANG_FP8_IGNORED_LAYERS` | A comma-separated list of layer names to ignore during FP8 quantization. For example: `model.layers.0,model.layers.1.,qkv_proj`. | `""` |
diff --git a/python/sglang/srt/environ.py b/python/sglang/srt/environ.py
index 52400234882d..efba9204145d 100644
--- a/python/sglang/srt/environ.py
+++ b/python/sglang/srt/environ.py
@@ -337,7 +337,6 @@ class Envs:
 
     # Flashinfer
     SGLANG_IS_FLASHINFER_AVAILABLE = EnvBool(True)
-    SGLANG_ENABLE_FLASHINFER_FP8_GEMM = EnvBool(False)
     # Default to the pick from flashinfer
     SGLANG_FLASHINFER_WORKSPACE_SIZE = EnvInt(384 * 1024 * 1024)
     # TODO(mmangkad): Remove this once the FlashInfer unified allreduce-fusion
@@ -580,11 +579,6 @@ def _convert_SGL_to_SGLANG():
 
 
 _convert_SGL_to_SGLANG()
-
-_warn_deprecated_env_to_cli_flag(
-    "SGLANG_ENABLE_FLASHINFER_FP8_GEMM",
-    "It will be completely removed in 0.5.7. Please use '--fp8-gemm-backend=flashinfer_trtllm' instead.",
-)
 _warn_deprecated_env_to_cli_flag(
     "SGLANG_ENABLE_FLASHINFER_GEMM",
     "It will be completely removed in 0.5.7. Please use '--fp8-gemm-backend=flashinfer_trtllm' instead.",
diff --git a/python/sglang/srt/layers/quantization/fp8_utils.py b/python/sglang/srt/layers/quantization/fp8_utils.py
index 3f26b736a6a9..72137cfec7c3 100755
--- a/python/sglang/srt/layers/quantization/fp8_utils.py
+++ b/python/sglang/srt/layers/quantization/fp8_utils.py
@@ -457,18 +457,12 @@ def initialize_fp8_gemm_config(server_args: ServerArgs) -> None:
     # TODO(brayden): Remove env-based overrides in v0.5.7, they will be fully removed in v0.5.7.
     # Only check environment variables when the server args is not set, server args should take priority.
     if backend == "auto":
-        if envs.SGLANG_ENABLE_FLASHINFER_FP8_GEMM.get():
-            backend = "flashinfer_trtllm"
-        elif envs.SGLANG_SUPPORT_CUTLASS_BLOCK_FP8.get():
+        if envs.SGLANG_SUPPORT_CUTLASS_BLOCK_FP8.get():
             backend = "cutlass"
     else:
-        if (
-            envs.SGLANG_ENABLE_FLASHINFER_FP8_GEMM.get()
-            or envs.SGLANG_SUPPORT_CUTLASS_BLOCK_FP8.get()
-        ):
+        if envs.SGLANG_SUPPORT_CUTLASS_BLOCK_FP8.get():
             logger.warning(
                 f"FP8 GEMM backend set to '{backend}' via --fp8-gemm-backend overrides "
-                "environment variables SGLANG_ENABLE_FLASHINFER_FP8_GEMM and "
                 "SGLANG_SUPPORT_CUTLASS_BLOCK_FP8. Using server argument value."
             )
 
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 5395b47874b2..9958369454c3 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -4632,9 +4632,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
             "'flashinfer_deepgemm' (Hopper SM90 only; uses swapAB optimization for small M dimensions in decoding), "
             "'cutlass' (optimal for Hopper/Blackwell GPUs and high-throughput), "
             "'triton' (fallback, widely compatible), "
-            "'aiter' (ROCm only). "
-            "NOTE: This replaces the deprecated environment variables "
-            "SGLANG_ENABLE_FLASHINFER_FP8_GEMM and SGLANG_SUPPORT_CUTLASS_BLOCK_FP8.",
+            "'aiter' (ROCm only). ",
         )
         parser.add_argument(
             "--fp4-gemm-backend",

From 5c63919416175358f0c8c1b9646d50a18939f0f6 Mon Sep 17 00:00:00 2001
From: Baizhou Zhang <sobereddiezhang@gmail.com>
Date: Fri, 27 Mar 2026 01:04:14 -0700
Subject: [PATCH 3/4] remove more

---
 docs/advanced_features/server_arguments.md         |  2 +-
 docs/references/environment_variables.md           |  1 -
 python/sglang/srt/environ.py                       |  5 -----
 python/sglang/srt/layers/quantization/fp8_utils.py | 14 --------------
 4 files changed, 1 insertion(+), 21 deletions(-)

diff --git a/docs/advanced_features/server_arguments.md b/docs/advanced_features/server_arguments.md
index f3add9ea6997..7de47a535a89 100644
--- a/docs/advanced_features/server_arguments.md
+++ b/docs/advanced_features/server_arguments.md
@@ -268,7 +268,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s
 | `--mm-attention-backend` | Set multimodal attention backend. | `None` | `sdpa`, `fa3`, `fa4`, `triton_attn`, `ascend_attn`, `aiter_attn` |
 | `--nsa-prefill-backend` | Choose the NSA backend for the prefill stage (overrides `--attention-backend` when running DeepSeek NSA-style attention). | `flashmla_sparse` | `flashmla_sparse`, `flashmla_kv`, `flashmla_auto`, `fa3`, `tilelang`, `aiter`, `trtllm` |
 | `--nsa-decode-backend` | Choose the NSA backend for the decode stage when running DeepSeek NSA-style attention. Overrides `--attention-backend` for decoding. | `fa3` | `flashmla_sparse`, `flashmla_kv`, `fa3`, `tilelang`, `aiter`, `trtllm` |
-| `--fp8-gemm-backend` | Choose the runner backend for Blockwise FP8 GEMM operations. Options: 'auto' (default, auto-selects based on hardware), 'deep_gemm' (JIT-compiled; enabled by default on NVIDIA Hopper (SM90) and Blackwell (SM100) when DeepGEMM is installed), 'flashinfer_trtllm' (FlashInfer TRTLLM backend; SM100/SM103 only), 'flashinfer_cutlass' (FlashInfer CUTLASS backend, SM120 only), 'flashinfer_deepgemm' (Hopper SM90 only, uses swapAB optimization for small M dimensions in decoding), 'cutlass' (optimal for Hopper/Blackwell GPUs and high-throughput), 'triton' (fallback, widely compatible), 'aiter' (ROCm only). **NOTE**: This replaces the deprecated environment variables SGLANG_ENABLE_FLASHINFER_FP8_GEMM and SGLANG_SUPPORT_CUTLASS_BLOCK_FP8. | `auto` | `auto`, `deep_gemm`, `flashinfer_trtllm`, `flashinfer_cutlass`, `flashinfer_deepgemm`, `cutlass`, `triton`, `aiter` |
+| `--fp8-gemm-backend` | Choose the runner backend for Blockwise FP8 GEMM operations. Options: 'auto' (default, auto-selects based on hardware), 'deep_gemm' (JIT-compiled; enabled by default on NVIDIA Hopper (SM90) and Blackwell (SM100) when DeepGEMM is installed), 'flashinfer_trtllm' (FlashInfer TRTLLM backend; SM100/SM103 only), 'flashinfer_cutlass' (FlashInfer CUTLASS backend, SM120 only), 'flashinfer_deepgemm' (Hopper SM90 only, uses swapAB optimization for small M dimensions in decoding), 'cutlass' (optimal for Hopper/Blackwell GPUs and high-throughput), 'triton' (fallback, widely compatible), 'aiter' (ROCm only).| `auto` | `auto`, `deep_gemm`, `flashinfer_trtllm`, `flashinfer_cutlass`, `flashinfer_deepgemm`, `cutlass`, `triton`, `aiter` |
 | `--fp4-gemm-backend` | Choose the runner backend for NVFP4 GEMM operations. Options: 'flashinfer_cutlass' (default), 'auto' (auto-selects between flashinfer_cudnn/flashinfer_cutlass based on CUDA/cuDNN version), 'flashinfer_cudnn' (FlashInfer cuDNN backend, optimal on CUDA 13+ with cuDNN 9.15+), 'flashinfer_trtllm' (FlashInfer TensorRT-LLM backend, requires different weight preparation with shuffling). All backends are from FlashInfer; when FlashInfer is unavailable, sgl-kernel CUTLASS is used as an automatic fallback.| `flashinfer_cutlass` | `auto`, `flashinfer_cudnn`, `flashinfer_cutlass`, `flashinfer_trtllm` |
 | `--disable-flashinfer-autotune` | Flashinfer autotune is enabled by default. Set this flag to disable the autotune. | `False` | bool flag (set to enable) |
 
diff --git a/docs/references/environment_variables.md b/docs/references/environment_variables.md
index 22d3481b3ffd..29d6b6962b0d 100644
--- a/docs/references/environment_variables.md
+++ b/docs/references/environment_variables.md
@@ -122,7 +122,6 @@ SGLang supports various environment variables that can be used to configure its
 | `SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN` | Quantize q_b_proj from BF16 to FP8 when launching DeepSeek NVFP4 checkpoint | `false` |
 | `SGLANG_MOE_NVFP4_DISPATCH` | Use nvfp4 for moe dispatch (on flashinfer_cutlass or flashinfer_cutedsl moe runner backend) | `"false"` |
 | `SGLANG_NVFP4_CKPT_FP8_NEXTN_MOE` | Quantize moe of nextn layer from BF16 to FP8 when launching DeepSeek NVFP4 checkpoint | `false` |
-| `SGLANG_SUPPORT_CUTLASS_BLOCK_FP8` (deprecated) | Use Cutlass kernels when running blockwise fp8 GEMM on Hopper or Blackwell GPUs. **DEPRECATED**: Please use `--fp8-gemm-backend=cutlass` instead. | `false` |
 | `SGLANG_QUANT_ALLOW_DOWNCASTING` | Allow weight dtype downcasting during loading (e.g., fp32 → fp16). By default, SGLang rejects this kind of downcasting when using quantization. | `false` |
 | `SGLANG_FP8_IGNORED_LAYERS` | A comma-separated list of layer names to ignore during FP8 quantization. For example: `model.layers.0,model.layers.1.,qkv_proj`. | `""` |
 
diff --git a/python/sglang/srt/environ.py b/python/sglang/srt/environ.py
index efba9204145d..7551b0c8402e 100644
--- a/python/sglang/srt/environ.py
+++ b/python/sglang/srt/environ.py
@@ -406,7 +406,6 @@ class Envs:
     DISABLE_OPENAPI_DOC = EnvBool(False)
     SGLANG_ENABLE_TORCH_INFERENCE_MODE = EnvBool(False)
     SGLANG_IS_FIRST_RANK_ON_NODE = EnvBool(True)
-    SGLANG_SUPPORT_CUTLASS_BLOCK_FP8 = EnvBool(False)
     SGLANG_SYNC_TOKEN_IDS_ACROSS_TP = EnvBool(False)
     SGLANG_ENABLE_COLOCATED_BATCH_GEN = EnvBool(False)
 
@@ -583,10 +582,6 @@ def _convert_SGL_to_SGLANG():
     "SGLANG_ENABLE_FLASHINFER_GEMM",
     "It will be completely removed in 0.5.7. Please use '--fp8-gemm-backend=flashinfer_trtllm' instead.",
 )
-_warn_deprecated_env_to_cli_flag(
-    "SGLANG_SUPPORT_CUTLASS_BLOCK_FP8",
-    "It will be completely removed in 0.5.7. Please use '--fp8-gemm-backend=cutlass' instead.",
-)
 _warn_deprecated_env_to_cli_flag(
     "SGLANG_SCHEDULER_DECREASE_PREFILL_IDLE",
     "Please use '--enable-prefill-delayer' instead.",
diff --git a/python/sglang/srt/layers/quantization/fp8_utils.py b/python/sglang/srt/layers/quantization/fp8_utils.py
index 72137cfec7c3..bdc0430ff3ae 100755
--- a/python/sglang/srt/layers/quantization/fp8_utils.py
+++ b/python/sglang/srt/layers/quantization/fp8_utils.py
@@ -7,7 +7,6 @@
 
 import torch
 
-from sglang.srt.environ import envs
 from sglang.srt.layers import deep_gemm_wrapper
 from sglang.srt.layers.quantization.fp8_kernel import sglang_per_token_group_quant_fp8
 from sglang.srt.layers.quantization.mxfp4_tensor import MXFP4QuantizeUtil
@@ -453,19 +452,6 @@ def initialize_fp8_gemm_config(server_args: ServerArgs) -> None:
     global FP8_GEMM_RUNNER_BACKEND
 
     backend = server_args.fp8_gemm_runner_backend
-
-    # TODO(brayden): Remove env-based overrides in v0.5.7, they will be fully removed in v0.5.7.
-    # Only check environment variables when the server args is not set, server args should take priority.
-    if backend == "auto":
-        if envs.SGLANG_SUPPORT_CUTLASS_BLOCK_FP8.get():
-            backend = "cutlass"
-    else:
-        if envs.SGLANG_SUPPORT_CUTLASS_BLOCK_FP8.get():
-            logger.warning(
-                f"FP8 GEMM backend set to '{backend}' via --fp8-gemm-backend overrides "
-                "SGLANG_SUPPORT_CUTLASS_BLOCK_FP8. Using server argument value."
-            )
-
     if backend == "auto" and is_sm120_supported():
         # TODO(brayden): Verify if CUTLASS can be set by default once SwapAB is supported
         backend = "triton"

From aa0bc906129c6815156665f1b5d3f4c8a9e833b9 Mon Sep 17 00:00:00 2001
From: Baizhou Zhang <sobereddiezhang@gmail.com>
Date: Fri, 27 Mar 2026 01:05:44 -0700
Subject: [PATCH 4/4] upd

---
 python/sglang/srt/environ.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/python/sglang/srt/environ.py b/python/sglang/srt/environ.py
index 7551b0c8402e..e96abfcb1911 100644
--- a/python/sglang/srt/environ.py
+++ b/python/sglang/srt/environ.py
@@ -545,9 +545,6 @@ def _warn_deprecated_env_to_cli_flag(env_name: str, suggestion: str):
 
 def _convert_SGL_to_SGLANG():
     _print_deprecated_env("SGLANG_LOG_GC", "SGLANG_GC_LOG")
-    _print_deprecated_env(
-        "SGLANG_ENABLE_FLASHINFER_FP8_GEMM", "SGLANG_ENABLE_FLASHINFER_GEMM"
-    )
     _print_deprecated_env(
         "SGLANG_MOE_NVFP4_DISPATCH", "SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH"
     )
@@ -578,10 +575,6 @@ def _convert_SGL_to_SGLANG():
 
 
 _convert_SGL_to_SGLANG()
-_warn_deprecated_env_to_cli_flag(
-    "SGLANG_ENABLE_FLASHINFER_GEMM",
-    "It will be completely removed in 0.5.7. Please use '--fp8-gemm-backend=flashinfer_trtllm' instead.",
-)
 _warn_deprecated_env_to_cli_flag(
     "SGLANG_SCHEDULER_DECREASE_PREFILL_IDLE",
     "Please use '--enable-prefill-delayer' instead.",