From 213d49f4fbece3aa4219f8f34347dd4054cca613 Mon Sep 17 00:00:00 2001 From: Baizhou Zhang Date: Fri, 27 Mar 2026 00:59:31 -0700 Subject: [PATCH 1/4] remove fp4 gemm env --- docs/advanced_features/server_arguments.md | 2 +- docs/references/environment_variables.md | 1 - python/sglang/srt/environ.py | 5 ----- .../srt/layers/quantization/fp4_utils.py | 21 ------------------- python/sglang/srt/server_args.py | 4 +--- 5 files changed, 2 insertions(+), 31 deletions(-) diff --git a/docs/advanced_features/server_arguments.md b/docs/advanced_features/server_arguments.md index 61cfe91e07c6..f3add9ea6997 100644 --- a/docs/advanced_features/server_arguments.md +++ b/docs/advanced_features/server_arguments.md @@ -269,7 +269,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s | `--nsa-prefill-backend` | Choose the NSA backend for the prefill stage (overrides `--attention-backend` when running DeepSeek NSA-style attention). | `flashmla_sparse` | `flashmla_sparse`, `flashmla_kv`, `flashmla_auto`, `fa3`, `tilelang`, `aiter`, `trtllm` | | `--nsa-decode-backend` | Choose the NSA backend for the decode stage when running DeepSeek NSA-style attention. Overrides `--attention-backend` for decoding. | `fa3` | `flashmla_sparse`, `flashmla_kv`, `fa3`, `tilelang`, `aiter`, `trtllm` | | `--fp8-gemm-backend` | Choose the runner backend for Blockwise FP8 GEMM operations. Options: 'auto' (default, auto-selects based on hardware), 'deep_gemm' (JIT-compiled; enabled by default on NVIDIA Hopper (SM90) and Blackwell (SM100) when DeepGEMM is installed), 'flashinfer_trtllm' (FlashInfer TRTLLM backend; SM100/SM103 only), 'flashinfer_cutlass' (FlashInfer CUTLASS backend, SM120 only), 'flashinfer_deepgemm' (Hopper SM90 only, uses swapAB optimization for small M dimensions in decoding), 'cutlass' (optimal for Hopper/Blackwell GPUs and high-throughput), 'triton' (fallback, widely compatible), 'aiter' (ROCm only). 
**NOTE**: This replaces the deprecated environment variables SGLANG_ENABLE_FLASHINFER_FP8_GEMM and SGLANG_SUPPORT_CUTLASS_BLOCK_FP8. | `auto` | `auto`, `deep_gemm`, `flashinfer_trtllm`, `flashinfer_cutlass`, `flashinfer_deepgemm`, `cutlass`, `triton`, `aiter` | -| `--fp4-gemm-backend` | Choose the runner backend for NVFP4 GEMM operations. Options: 'flashinfer_cutlass' (default), 'auto' (auto-selects between flashinfer_cudnn/flashinfer_cutlass based on CUDA/cuDNN version), 'flashinfer_cudnn' (FlashInfer cuDNN backend, optimal on CUDA 13+ with cuDNN 9.15+), 'flashinfer_trtllm' (FlashInfer TensorRT-LLM backend, requires different weight preparation with shuffling). All backends are from FlashInfer; when FlashInfer is unavailable, sgl-kernel CUTLASS is used as an automatic fallback. **NOTE**: This replaces the deprecated environment variable SGLANG_FLASHINFER_FP4_GEMM_BACKEND. | `flashinfer_cutlass` | `auto`, `flashinfer_cudnn`, `flashinfer_cutlass`, `flashinfer_trtllm` | +| `--fp4-gemm-backend` | Choose the runner backend for NVFP4 GEMM operations. Options: 'flashinfer_cutlass' (default), 'auto' (auto-selects between flashinfer_cudnn/flashinfer_cutlass based on CUDA/cuDNN version), 'flashinfer_cudnn' (FlashInfer cuDNN backend, optimal on CUDA 13+ with cuDNN 9.15+), 'flashinfer_trtllm' (FlashInfer TensorRT-LLM backend, requires different weight preparation with shuffling). All backends are from FlashInfer; when FlashInfer is unavailable, sgl-kernel CUTLASS is used as an automatic fallback.| `flashinfer_cutlass` | `auto`, `flashinfer_cudnn`, `flashinfer_cutlass`, `flashinfer_trtllm` | | `--disable-flashinfer-autotune` | Flashinfer autotune is enabled by default. Set this flag to disable the autotune. 
| `False` | bool flag (set to enable) | ## Speculative decoding diff --git a/docs/references/environment_variables.md b/docs/references/environment_variables.md index 9d95c4f237aa..ae3051337b49 100644 --- a/docs/references/environment_variables.md +++ b/docs/references/environment_variables.md @@ -119,7 +119,6 @@ SGLang supports various environment variables that can be used to configure its | `SGLANG_INT4_WEIGHT` | Enable INT4 weight quantization | `false` | | `SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2` | Apply per token group quantization kernel with fused silu and mul and masked m | `false` | | `SGLANG_FORCE_FP8_MARLIN` | Force using FP8 MARLIN kernels even if other FP8 kernels are available | `false` | -| `SGLANG_FLASHINFER_FP4_GEMM_BACKEND` (deprecated) | Select backend for `mm_fp4` on Blackwell GPUs. **DEPRECATED**: Please use `--fp4-gemm-backend` instead. | `` | | `SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN` | Quantize q_b_proj from BF16 to FP8 when launching DeepSeek NVFP4 checkpoint | `false` | | `SGLANG_MOE_NVFP4_DISPATCH` | Use nvfp4 for moe dispatch (on flashinfer_cutlass or flashinfer_cutedsl moe runner backend) | `"false"` | | `SGLANG_NVFP4_CKPT_FP8_NEXTN_MOE` | Quantize moe of nextn layer from BF16 to FP8 when launching DeepSeek NVFP4 checkpoint | `false` | diff --git a/python/sglang/srt/environ.py b/python/sglang/srt/environ.py index 4dcf0613bd91..52400234882d 100644 --- a/python/sglang/srt/environ.py +++ b/python/sglang/srt/environ.py @@ -339,7 +339,6 @@ class Envs: SGLANG_IS_FLASHINFER_AVAILABLE = EnvBool(True) SGLANG_ENABLE_FLASHINFER_FP8_GEMM = EnvBool(False) # Default to the pick from flashinfer - SGLANG_FLASHINFER_FP4_GEMM_BACKEND = EnvStr("") SGLANG_FLASHINFER_WORKSPACE_SIZE = EnvInt(384 * 1024 * 1024) # TODO(mmangkad): Remove this once the FlashInfer unified allreduce-fusion # transport issue on GB200/GB300 platforms is fixed and verified resolved. 
@@ -594,10 +593,6 @@ def _convert_SGL_to_SGLANG(): "SGLANG_SUPPORT_CUTLASS_BLOCK_FP8", "It will be completely removed in 0.5.7. Please use '--fp8-gemm-backend=cutlass' instead.", ) -_warn_deprecated_env_to_cli_flag( - "SGLANG_FLASHINFER_FP4_GEMM_BACKEND", - "It will be completely removed in 0.5.9. Please use '--fp4-gemm-backend' instead.", -) _warn_deprecated_env_to_cli_flag( "SGLANG_SCHEDULER_DECREASE_PREFILL_IDLE", "Please use '--enable-prefill-delayer' instead.", diff --git a/python/sglang/srt/layers/quantization/fp4_utils.py b/python/sglang/srt/layers/quantization/fp4_utils.py index 3e913e137f02..0a8b39146efb 100644 --- a/python/sglang/srt/layers/quantization/fp4_utils.py +++ b/python/sglang/srt/layers/quantization/fp4_utils.py @@ -4,7 +4,6 @@ from enum import Enum from typing import TYPE_CHECKING -from sglang.srt.environ import envs from sglang.srt.utils.common import is_sm120_supported if TYPE_CHECKING: @@ -56,26 +55,6 @@ def initialize_fp4_gemm_config(server_args: ServerArgs) -> None: global FP4_GEMM_RUNNER_BACKEND backend = server_args.fp4_gemm_runner_backend - - # Handle deprecated env var for backward compatibility - # TODO: Remove this in a future version - if envs.SGLANG_FLASHINFER_FP4_GEMM_BACKEND.is_set(): - env_backend = envs.SGLANG_FLASHINFER_FP4_GEMM_BACKEND.get() - if backend == "auto": - logger.warning( - "SGLANG_FLASHINFER_FP4_GEMM_BACKEND is deprecated. " - f"Please use '--fp4-gemm-backend={env_backend}' instead." - ) - if not env_backend.startswith("flashinfer_"): - env_backend = "flashinfer_" + env_backend - backend = env_backend - else: - logger.warning( - f"FP4 GEMM backend set to '{backend}' via --fp4-gemm-backend overrides " - "environment variable SGLANG_FLASHINFER_FP4_GEMM_BACKEND. " - "Using server argument value." 
- ) - if backend == "auto": if is_sm120_supported(): # flashinfer_cutlass produces NaN in dense MLP layers with diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index c770f3d161f4..5395b47874b2 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -4646,9 +4646,7 @@ def add_cli_args(parser: argparse.ArgumentParser): "Options: 'auto' (default; selects flashinfer_cudnn on SM120, flashinfer_cutlass otherwise), " "'flashinfer_cutlass' (CUTLASS backend), " "'flashinfer_cudnn' (FlashInfer cuDNN backend, optimal on CUDA 13+ with cuDNN 9.15+), " - "'flashinfer_trtllm' (FlashInfer TensorRT-LLM backend, requires different weight preparation with shuffling). " - "NOTE: This replaces the deprecated environment variable " - "SGLANG_FLASHINFER_FP4_GEMM_BACKEND.", + "'flashinfer_trtllm' (FlashInfer TensorRT-LLM backend, requires different weight preparation with shuffling). ", ) parser.add_argument( "--disable-flashinfer-autotune", From b7b7edd2a5cae2d8aceeb7fef451089256f674bc Mon Sep 17 00:00:00 2001 From: Baizhou Zhang Date: Fri, 27 Mar 2026 01:03:05 -0700 Subject: [PATCH 2/4] remove fp8 gemm environ --- docs/references/environment_variables.md | 1 - python/sglang/srt/environ.py | 6 ------ python/sglang/srt/layers/quantization/fp8_utils.py | 10 ++-------- python/sglang/srt/server_args.py | 4 +--- 4 files changed, 3 insertions(+), 18 deletions(-) diff --git a/docs/references/environment_variables.md b/docs/references/environment_variables.md index ae3051337b49..22d3481b3ffd 100644 --- a/docs/references/environment_variables.md +++ b/docs/references/environment_variables.md @@ -122,7 +122,6 @@ SGLang supports various environment variables that can be used to configure its | `SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN` | Quantize q_b_proj from BF16 to FP8 when launching DeepSeek NVFP4 checkpoint | `false` | | `SGLANG_MOE_NVFP4_DISPATCH` | Use nvfp4 for moe dispatch (on flashinfer_cutlass or flashinfer_cutedsl moe runner 
backend) | `"false"` | | `SGLANG_NVFP4_CKPT_FP8_NEXTN_MOE` | Quantize moe of nextn layer from BF16 to FP8 when launching DeepSeek NVFP4 checkpoint | `false` | -| `SGLANG_ENABLE_FLASHINFER_FP8_GEMM` (deprecated) | Use flashinfer kernels when running blockwise fp8 GEMM on Blackwell GPUs. **DEPRECATED**: Please use `--fp8-gemm-backend=flashinfer_trtllm` (SM100/SM103) or `--fp8-gemm-backend=flashinfer_cutlass` (SM120/SM121 and newer) instead. | `false` | | `SGLANG_SUPPORT_CUTLASS_BLOCK_FP8` (deprecated) | Use Cutlass kernels when running blockwise fp8 GEMM on Hopper or Blackwell GPUs. **DEPRECATED**: Please use `--fp8-gemm-backend=cutlass` instead. | `false` | | `SGLANG_QUANT_ALLOW_DOWNCASTING` | Allow weight dtype downcasting during loading (e.g., fp32 → fp16). By default, SGLang rejects this kind of downcasting when using quantization. | `false` | | `SGLANG_FP8_IGNORED_LAYERS` | A comma-separated list of layer names to ignore during FP8 quantization. For example: `model.layers.0,model.layers.1.,qkv_proj`. | `""` | diff --git a/python/sglang/srt/environ.py b/python/sglang/srt/environ.py index 52400234882d..efba9204145d 100644 --- a/python/sglang/srt/environ.py +++ b/python/sglang/srt/environ.py @@ -337,7 +337,6 @@ class Envs: # Flashinfer SGLANG_IS_FLASHINFER_AVAILABLE = EnvBool(True) - SGLANG_ENABLE_FLASHINFER_FP8_GEMM = EnvBool(False) # Default to the pick from flashinfer SGLANG_FLASHINFER_WORKSPACE_SIZE = EnvInt(384 * 1024 * 1024) # TODO(mmangkad): Remove this once the FlashInfer unified allreduce-fusion @@ -580,11 +579,6 @@ def _convert_SGL_to_SGLANG(): _convert_SGL_to_SGLANG() - -_warn_deprecated_env_to_cli_flag( - "SGLANG_ENABLE_FLASHINFER_FP8_GEMM", - "It will be completely removed in 0.5.7. Please use '--fp8-gemm-backend=flashinfer_trtllm' instead.", -) _warn_deprecated_env_to_cli_flag( "SGLANG_ENABLE_FLASHINFER_GEMM", "It will be completely removed in 0.5.7. 
Please use '--fp8-gemm-backend=flashinfer_trtllm' instead.", diff --git a/python/sglang/srt/layers/quantization/fp8_utils.py b/python/sglang/srt/layers/quantization/fp8_utils.py index 3f26b736a6a9..72137cfec7c3 100755 --- a/python/sglang/srt/layers/quantization/fp8_utils.py +++ b/python/sglang/srt/layers/quantization/fp8_utils.py @@ -457,18 +457,12 @@ def initialize_fp8_gemm_config(server_args: ServerArgs) -> None: # TODO(brayden): Remove env-based overrides in v0.5.7, they will be fully removed in v0.5.7. # Only check environment variables when the server args is not set, server args should take priority. if backend == "auto": - if envs.SGLANG_ENABLE_FLASHINFER_FP8_GEMM.get(): - backend = "flashinfer_trtllm" - elif envs.SGLANG_SUPPORT_CUTLASS_BLOCK_FP8.get(): + if envs.SGLANG_SUPPORT_CUTLASS_BLOCK_FP8.get(): backend = "cutlass" else: - if ( - envs.SGLANG_ENABLE_FLASHINFER_FP8_GEMM.get() - or envs.SGLANG_SUPPORT_CUTLASS_BLOCK_FP8.get() - ): + if envs.SGLANG_SUPPORT_CUTLASS_BLOCK_FP8.get(): logger.warning( f"FP8 GEMM backend set to '{backend}' via --fp8-gemm-backend overrides " - "environment variables SGLANG_ENABLE_FLASHINFER_FP8_GEMM and " "SGLANG_SUPPORT_CUTLASS_BLOCK_FP8. Using server argument value." ) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 5395b47874b2..9958369454c3 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -4632,9 +4632,7 @@ def add_cli_args(parser: argparse.ArgumentParser): "'flashinfer_deepgemm' (Hopper SM90 only; uses swapAB optimization for small M dimensions in decoding), " "'cutlass' (optimal for Hopper/Blackwell GPUs and high-throughput), " "'triton' (fallback, widely compatible), " - "'aiter' (ROCm only). " - "NOTE: This replaces the deprecated environment variables " - "SGLANG_ENABLE_FLASHINFER_FP8_GEMM and SGLANG_SUPPORT_CUTLASS_BLOCK_FP8.", + "'aiter' (ROCm only). 
", ) parser.add_argument( "--fp4-gemm-backend", From 5c63919416175358f0c8c1b9646d50a18939f0f6 Mon Sep 17 00:00:00 2001 From: Baizhou Zhang Date: Fri, 27 Mar 2026 01:04:14 -0700 Subject: [PATCH 3/4] remove more --- docs/advanced_features/server_arguments.md | 2 +- docs/references/environment_variables.md | 1 - python/sglang/srt/environ.py | 5 ----- python/sglang/srt/layers/quantization/fp8_utils.py | 14 -------------- 4 files changed, 1 insertion(+), 21 deletions(-) diff --git a/docs/advanced_features/server_arguments.md b/docs/advanced_features/server_arguments.md index f3add9ea6997..7de47a535a89 100644 --- a/docs/advanced_features/server_arguments.md +++ b/docs/advanced_features/server_arguments.md @@ -268,7 +268,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s | `--mm-attention-backend` | Set multimodal attention backend. | `None` | `sdpa`, `fa3`, `fa4`, `triton_attn`, `ascend_attn`, `aiter_attn` | | `--nsa-prefill-backend` | Choose the NSA backend for the prefill stage (overrides `--attention-backend` when running DeepSeek NSA-style attention). | `flashmla_sparse` | `flashmla_sparse`, `flashmla_kv`, `flashmla_auto`, `fa3`, `tilelang`, `aiter`, `trtllm` | | `--nsa-decode-backend` | Choose the NSA backend for the decode stage when running DeepSeek NSA-style attention. Overrides `--attention-backend` for decoding. | `fa3` | `flashmla_sparse`, `flashmla_kv`, `fa3`, `tilelang`, `aiter`, `trtllm` | -| `--fp8-gemm-backend` | Choose the runner backend for Blockwise FP8 GEMM operations. 
Options: 'auto' (default, auto-selects based on hardware), 'deep_gemm' (JIT-compiled; enabled by default on NVIDIA Hopper (SM90) and Blackwell (SM100) when DeepGEMM is installed), 'flashinfer_trtllm' (FlashInfer TRTLLM backend; SM100/SM103 only), 'flashinfer_cutlass' (FlashInfer CUTLASS backend, SM120 only), 'flashinfer_deepgemm' (Hopper SM90 only, uses swapAB optimization for small M dimensions in decoding), 'cutlass' (optimal for Hopper/Blackwell GPUs and high-throughput), 'triton' (fallback, widely compatible), 'aiter' (ROCm only). **NOTE**: This replaces the deprecated environment variables SGLANG_ENABLE_FLASHINFER_FP8_GEMM and SGLANG_SUPPORT_CUTLASS_BLOCK_FP8. | `auto` | `auto`, `deep_gemm`, `flashinfer_trtllm`, `flashinfer_cutlass`, `flashinfer_deepgemm`, `cutlass`, `triton`, `aiter` | +| `--fp8-gemm-backend` | Choose the runner backend for Blockwise FP8 GEMM operations. Options: 'auto' (default, auto-selects based on hardware), 'deep_gemm' (JIT-compiled; enabled by default on NVIDIA Hopper (SM90) and Blackwell (SM100) when DeepGEMM is installed), 'flashinfer_trtllm' (FlashInfer TRTLLM backend; SM100/SM103 only), 'flashinfer_cutlass' (FlashInfer CUTLASS backend, SM120 only), 'flashinfer_deepgemm' (Hopper SM90 only, uses swapAB optimization for small M dimensions in decoding), 'cutlass' (optimal for Hopper/Blackwell GPUs and high-throughput), 'triton' (fallback, widely compatible), 'aiter' (ROCm only). | `auto` | `auto`, `deep_gemm`, `flashinfer_trtllm`, `flashinfer_cutlass`, `flashinfer_deepgemm`, `cutlass`, `triton`, `aiter` | | `--fp4-gemm-backend` | Choose the runner backend for NVFP4 GEMM operations. Options: 'flashinfer_cutlass' (default), 'auto' (auto-selects between flashinfer_cudnn/flashinfer_cutlass based on CUDA/cuDNN version), 'flashinfer_cudnn' (FlashInfer cuDNN backend, optimal on CUDA 13+ with cuDNN 9.15+), 'flashinfer_trtllm' (FlashInfer TensorRT-LLM backend, requires different weight preparation with shuffling). 
All backends are from FlashInfer; when FlashInfer is unavailable, sgl-kernel CUTLASS is used as an automatic fallback.| `flashinfer_cutlass` | `auto`, `flashinfer_cudnn`, `flashinfer_cutlass`, `flashinfer_trtllm` | | `--disable-flashinfer-autotune` | Flashinfer autotune is enabled by default. Set this flag to disable the autotune. | `False` | bool flag (set to enable) | diff --git a/docs/references/environment_variables.md b/docs/references/environment_variables.md index 22d3481b3ffd..29d6b6962b0d 100644 --- a/docs/references/environment_variables.md +++ b/docs/references/environment_variables.md @@ -122,7 +122,6 @@ SGLang supports various environment variables that can be used to configure its | `SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN` | Quantize q_b_proj from BF16 to FP8 when launching DeepSeek NVFP4 checkpoint | `false` | | `SGLANG_MOE_NVFP4_DISPATCH` | Use nvfp4 for moe dispatch (on flashinfer_cutlass or flashinfer_cutedsl moe runner backend) | `"false"` | | `SGLANG_NVFP4_CKPT_FP8_NEXTN_MOE` | Quantize moe of nextn layer from BF16 to FP8 when launching DeepSeek NVFP4 checkpoint | `false` | -| `SGLANG_SUPPORT_CUTLASS_BLOCK_FP8` (deprecated) | Use Cutlass kernels when running blockwise fp8 GEMM on Hopper or Blackwell GPUs. **DEPRECATED**: Please use `--fp8-gemm-backend=cutlass` instead. | `false` | | `SGLANG_QUANT_ALLOW_DOWNCASTING` | Allow weight dtype downcasting during loading (e.g., fp32 → fp16). By default, SGLang rejects this kind of downcasting when using quantization. | `false` | | `SGLANG_FP8_IGNORED_LAYERS` | A comma-separated list of layer names to ignore during FP8 quantization. For example: `model.layers.0,model.layers.1.,qkv_proj`. 
| `""` | diff --git a/python/sglang/srt/environ.py b/python/sglang/srt/environ.py index efba9204145d..7551b0c8402e 100644 --- a/python/sglang/srt/environ.py +++ b/python/sglang/srt/environ.py @@ -406,7 +406,6 @@ class Envs: DISABLE_OPENAPI_DOC = EnvBool(False) SGLANG_ENABLE_TORCH_INFERENCE_MODE = EnvBool(False) SGLANG_IS_FIRST_RANK_ON_NODE = EnvBool(True) - SGLANG_SUPPORT_CUTLASS_BLOCK_FP8 = EnvBool(False) SGLANG_SYNC_TOKEN_IDS_ACROSS_TP = EnvBool(False) SGLANG_ENABLE_COLOCATED_BATCH_GEN = EnvBool(False) @@ -583,10 +582,6 @@ def _convert_SGL_to_SGLANG(): "SGLANG_ENABLE_FLASHINFER_GEMM", "It will be completely removed in 0.5.7. Please use '--fp8-gemm-backend=flashinfer_trtllm' instead.", ) -_warn_deprecated_env_to_cli_flag( - "SGLANG_SUPPORT_CUTLASS_BLOCK_FP8", - "It will be completely removed in 0.5.7. Please use '--fp8-gemm-backend=cutlass' instead.", -) _warn_deprecated_env_to_cli_flag( "SGLANG_SCHEDULER_DECREASE_PREFILL_IDLE", "Please use '--enable-prefill-delayer' instead.", diff --git a/python/sglang/srt/layers/quantization/fp8_utils.py b/python/sglang/srt/layers/quantization/fp8_utils.py index 72137cfec7c3..bdc0430ff3ae 100755 --- a/python/sglang/srt/layers/quantization/fp8_utils.py +++ b/python/sglang/srt/layers/quantization/fp8_utils.py @@ -7,7 +7,6 @@ import torch -from sglang.srt.environ import envs from sglang.srt.layers import deep_gemm_wrapper from sglang.srt.layers.quantization.fp8_kernel import sglang_per_token_group_quant_fp8 from sglang.srt.layers.quantization.mxfp4_tensor import MXFP4QuantizeUtil @@ -453,19 +452,6 @@ def initialize_fp8_gemm_config(server_args: ServerArgs) -> None: global FP8_GEMM_RUNNER_BACKEND backend = server_args.fp8_gemm_runner_backend - - # TODO(brayden): Remove env-based overrides in v0.5.7, they will be fully removed in v0.5.7. - # Only check environment variables when the server args is not set, server args should take priority. 
- if backend == "auto": - if envs.SGLANG_SUPPORT_CUTLASS_BLOCK_FP8.get(): - backend = "cutlass" - else: - if envs.SGLANG_SUPPORT_CUTLASS_BLOCK_FP8.get(): - logger.warning( - f"FP8 GEMM backend set to '{backend}' via --fp8-gemm-backend overrides " - "SGLANG_SUPPORT_CUTLASS_BLOCK_FP8. Using server argument value." - ) - if backend == "auto" and is_sm120_supported(): # TODO(brayden): Verify if CUTLASS can be set by default once SwapAB is supported backend = "triton" From aa0bc906129c6815156665f1b5d3f4c8a9e833b9 Mon Sep 17 00:00:00 2001 From: Baizhou Zhang Date: Fri, 27 Mar 2026 01:05:44 -0700 Subject: [PATCH 4/4] upd --- python/sglang/srt/environ.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/python/sglang/srt/environ.py b/python/sglang/srt/environ.py index 7551b0c8402e..e96abfcb1911 100644 --- a/python/sglang/srt/environ.py +++ b/python/sglang/srt/environ.py @@ -545,9 +545,6 @@ def _warn_deprecated_env_to_cli_flag(env_name: str, suggestion: str): def _convert_SGL_to_SGLANG(): _print_deprecated_env("SGLANG_LOG_GC", "SGLANG_GC_LOG") - _print_deprecated_env( - "SGLANG_ENABLE_FLASHINFER_FP8_GEMM", "SGLANG_ENABLE_FLASHINFER_GEMM" - ) _print_deprecated_env( "SGLANG_MOE_NVFP4_DISPATCH", "SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH" ) @@ -578,10 +575,6 @@ def _convert_SGL_to_SGLANG(): _convert_SGL_to_SGLANG() -_warn_deprecated_env_to_cli_flag( - "SGLANG_ENABLE_FLASHINFER_GEMM", - "It will be completely removed in 0.5.7. Please use '--fp8-gemm-backend=flashinfer_trtllm' instead.", -) _warn_deprecated_env_to_cli_flag( "SGLANG_SCHEDULER_DECREASE_PREFILL_IDLE", "Please use '--enable-prefill-delayer' instead.",