diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index d4192f947744..a590f8d0891b 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -1487,9 +1487,9 @@ def _handle_gpu_memory_settings(self, gpu_mem):
                 )
                 self.cuda_graph_bs = self._generate_cpu_graph_batch_sizes()
 
-            assert (
-                self.torch_compile_max_bs > 0
-            ), "cuda_graph_bs should contain positive batch sizes"
+            assert self.torch_compile_max_bs > 0, (
+                "cuda_graph_bs should contain positive batch sizes"
+            )
             self.cuda_graph_max_bs = self.torch_compile_max_bs
 
         if self.piecewise_cuda_graph_max_tokens is None:
@@ -1815,12 +1815,12 @@ def _handle_model_specific_adjustments(self):
                         else:
                             self.enable_dp_attention = True
                             self.moe_dense_tp_size = 1
-                            assert (
-                                self.dp_size == 1
-                            ), "For round-robin split mode, dp attention is not supported."
-                        assert (
-                            self.tp_size <= 8
-                        ), "Context parallel only supports single machine (tp_size <= 8). Cross-machine CP has precision issues."
+                            assert self.dp_size == 1, (
+                                "For round-robin split mode, dp attention is not supported."
+                            )
+                        assert self.tp_size <= 8, (
+                            "Context parallel only supports single machine (tp_size <= 8). Cross-machine CP has precision issues."
+                        )
                         self.attn_cp_size = self.tp_size // self.dp_size
 
                         logger.warning(
@@ -1861,9 +1861,9 @@ def _handle_model_specific_adjustments(self):
                     self._set_default_dsa_backends(self.kv_cache_dtype, major)
 
                 if self.enable_dsa_prefill_context_parallel:
-                    assert (
-                        self.disaggregation_mode != "decode"
-                    ), "CP is only supported for prefill when PD disaggregation, please remove --enable-dsa-prefill-context-parallel."
+                    assert self.disaggregation_mode != "decode", (
+                        "CP is only supported for prefill when PD disaggregation, please remove --enable-dsa-prefill-context-parallel."
+                    )
 
             else:
                 # DeepSeek V3/R1/V3.1
@@ -2098,9 +2098,9 @@ def _handle_model_specific_adjustments(self):
                         )
 
             if self.moe_runner_backend == "triton_kernel":
-                assert (
-                    self.ep_size == 1
-                ), "Triton kernel MoE is only supported when ep_size == 1"
+                assert self.ep_size == 1, (
+                    "Triton kernel MoE is only supported when ep_size == 1"
+                )
 
         elif model_arch in MIMO_V2_MODEL_ARCHS:
             if model_arch == "MiMoV2ForCausalLM" and not self.encoder_only:
@@ -2182,7 +2182,9 @@ def _handle_model_specific_adjustments(self):
                 "ascend",
                 "trtllm_mha",
                 "intel_xpu",
-            }, f"fa3, aiter, triton, ascend, trtllm_mha or intel_xpu is required for Llama4 model but got {self.attention_backend}"
+            }, (
+                f"fa3, aiter, triton, ascend, trtllm_mha or intel_xpu is required for Llama4 model but got {self.attention_backend}"
+            )
             if is_sm100_supported() and self.moe_runner_backend == "auto":
                 if self.quantization in {"fp8", "modelopt_fp8"}:
                     self.moe_runner_backend = "flashinfer_trtllm"
@@ -2297,6 +2299,90 @@ def _handle_model_specific_adjustments(self):
                     "(dense Gemma-4; MoE-specific 0.15 override skipped "
                     "to avoid SWA pool starvation)."
                 )
+
+                # Dense Gemma-4 prefill scheduler tuning.
+                #
+                # Measured on H100 TP=2 with gemma-4-31B-it + FROZEN_KV_MTP on
+                # the campaign workload (8000-token random input, 80
+                # concurrent prompts):
+                #
+                # * Default chunked_prefill_size=8192 admits one full prefill
+                #   per scheduler step (one 8k prompt fills the whole chunk),
+                #   leaving the decode batch starved and pinning peak
+                #   #running-req at 11-12.  Reducing the chunk size to 4096
+                #   lets the scheduler pack two partial prefills per step and
+                #   pushes peak #running-req to ~23, lifting summarisation
+                #   throughput from 316 -> 421 tok/s (+33 %) and dropping
+                #   median TTFT from 78.6 s -> 75.7 s, all while preserving
+                #   chat behaviour (median TPOT regresses < 1 ms at conc=80).
+                #
+                # * The default ``mem_fraction_static`` auto-tunes to 0.778
+                #   for this checkpoint on 80 GB H100, leaving ~16 GB on the
+                #   table per GPU after model+CUDA-graph load.  Bumping the
+                #   floor to 0.88 grows ``max_total_num_tokens`` 68k -> 106k
+                #   (+27 %) and brings KV pool into parity with vLLM nightly
+                #   at the same workload (vLLM: 109k tokens, 27.6 GiB KV).
+                #
+                # Both overrides are dense-only because the MoE-only
+                # ``swa_full_tokens_ratio=0.15`` branch above already retunes
+                # MoE Gemma-4 along a different axis (full vs SWA pool
+                # split), and pushing both at once on the smaller MoE budget
+                # causes KV pool starvation under concurrent long-context
+                # loads (the same failure mode that motivated the MoE-only
+                # swa-ratio gate).
+                #
+                # Respect explicit user overrides via the same predicate
+                # pattern used by the swa_ratio override.
+                # ``_handle_gpu_memory_settings`` runs before us and has
+                # already auto-tuned ``chunked_prefill_size`` to 8192 on
+                # this hardware class for the original ``None`` user value.
+                # Detect "still at the auto-tune ceiling" by comparing
+                # against 8192; user-passed lower values (e.g. 2048) stay
+                # respected.  User-passed higher values likewise stay
+                # respected -- this dense-Gemma-4 tune only nudges the
+                # ceiling down.
+                _DENSE_GEMMA4_CHUNK_CEIL = 4096
+                if (
+                    self.chunked_prefill_size is None
+                    or self.chunked_prefill_size >= 8192
+                ):
+                    _prev = self.chunked_prefill_size
+                    self.chunked_prefill_size = _DENSE_GEMMA4_CHUNK_CEIL
+                    logger.info(
+                        "Capping chunked_prefill_size at %s for %s "
+                        "(was %s; dense Gemma-4 with FROZEN_KV_MTP admits "
+                        "only one full prefill per scheduler step at 8192, "
+                        "leaving the decode batch starved at high "
+                        "concurrency; capping at 4096 lifts summ throughput "
+                        "+33%% on the campaign workload).",
+                        _DENSE_GEMMA4_CHUNK_CEIL,
+                        model_arch,
+                        _prev,
+                    )
+                # ``_handle_gpu_memory_settings`` runs before us and may
+                # have auto-tuned the value into a low setting; only nudge
+                # upward (respect user-set higher values).  The auto-tune
+                # path leaves the value below 0.85 on 80 GB H100 TP=2 for
+                # this checkpoint, so a 0.88 floor still leaves headroom for
+                # CUDA-graph capture while substantially growing the KV
+                # pool.
+                _DENSE_GEMMA4_MEM_FLOOR = 0.88
+                if (
+                    self.mem_fraction_static is None
+                    or self.mem_fraction_static < _DENSE_GEMMA4_MEM_FLOOR
+                ):
+                    _prev = self.mem_fraction_static
+                    self.mem_fraction_static = _DENSE_GEMMA4_MEM_FLOOR
+                    logger.info(
+                        "Bumping mem_fraction_static from %s to %s for %s "
+                        "(dense Gemma-4: the auto-tuned ceiling leaves ~16 "
+                        "GB per GPU unused on H100 TP=2; floor grows the "
+                        "KV pool ~27%% to bring parity with vLLM nightly).",
+                        _prev,
+                        _DENSE_GEMMA4_MEM_FLOOR,
+                        model_arch,
+                    )
+
         elif model_arch == "MossVLForConditionalGeneration":
             if self.is_attention_backend_not_set():
                 self.prefill_attention_backend = "flashinfer"
@@ -2316,9 +2402,9 @@ def _handle_model_specific_adjustments(self):
                 self.disable_hybrid_swa_memory = True
                 # https://docs.sglang.ai/advanced_features/attention_backend.html
                 accepted_backends = ["fa3", "triton", "trtllm_mha"]
-                assert (
-                    self.attention_backend in accepted_backends
-                ), f"One of the attention backends in {accepted_backends} is required for {model_arch}, but got {self.attention_backend}"
+                assert self.attention_backend in accepted_backends, (
+                    f"One of the attention backends in {accepted_backends} is required for {model_arch}, but got {self.attention_backend}"
+                )
         elif model_arch in ["Olmo2ForCausalLM"]:
             # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with Olmo3 model.
             logger.warning(
@@ -2337,9 +2423,9 @@ def _handle_model_specific_adjustments(self):
             # Flashinfer appears to degrade performance when sliding window attention
             # is used for the Olmo2 architecture. Olmo2 does not use sliding window attention
             # but Olmo3 does.
-            assert (
-                self.attention_backend != "flashinfer"
-            ), "FlashInfer backend can significantly degrade the performance of Olmo3 models."
+            assert self.attention_backend != "flashinfer", (
+                "FlashInfer backend can significantly degrade the performance of Olmo3 models."
+            )
 
             logger.info(
                 f"Using {self.attention_backend} as attention backend for {model_arch}."
@@ -2576,9 +2662,9 @@ def _handle_mamba_radix_cache(
             return
 
         if not support_mamba_cache_extra_buffer:
-            assert (
-                not self.enable_mamba_extra_buffer()
-            ), f"mamba extra_buffer is not supported for {model_arch} model"
+            assert not self.enable_mamba_extra_buffer(), (
+                f"mamba extra_buffer is not supported for {model_arch} model"
+            )
 
         if self.enable_mamba_extra_buffer():  # extra_buffer
             if self.disable_radix_cache:
@@ -2588,23 +2674,25 @@ def _handle_mamba_radix_cache(
                     "Please use --mamba-scheduler-strategy no_buffer instead."
                 )
 
-            assert (
-                is_cuda() or is_musa() or is_npu()
-            ), "Mamba extra_buffer is only supported on CUDA and MUSA and NPU devices with FLA backend"
+            assert is_cuda() or is_musa() or is_npu(), (
+                "Mamba extra_buffer is only supported on CUDA and MUSA and NPU devices with FLA backend"
+            )
             if self.speculative_num_draft_tokens is not None:
-                assert (
-                    self.mamba_track_interval >= self.speculative_num_draft_tokens
-                ), f"mamba_track_interval {self.mamba_track_interval} must be greater than or equal to speculative_num_draft_tokens {self.speculative_num_draft_tokens}"
+                assert self.mamba_track_interval >= self.speculative_num_draft_tokens, (
+                    f"mamba_track_interval {self.mamba_track_interval} must be greater than or equal to speculative_num_draft_tokens {self.speculative_num_draft_tokens}"
+                )
 
             if self.page_size is not None:
-                assert (
-                    self.mamba_track_interval % self.page_size == 0
-                ), f"mamba_track_interval {self.mamba_track_interval} must be divisible by page_size {self.page_size}"
+                assert self.mamba_track_interval % self.page_size == 0, (
+                    f"mamba_track_interval {self.mamba_track_interval} must be divisible by page_size {self.page_size}"
+                )
                 assert (
                     max(FLA_CHUNK_SIZE, self.page_size)
                     % min(FLA_CHUNK_SIZE, self.page_size)
                     == 0
-                ), f"For SSM models with extra buffer, either FLA_CHUNK_SIZE or page_size must be divisible by the other, got {FLA_CHUNK_SIZE=}, {self.page_size=}"
+                ), (
+                    f"For SSM models with extra buffer, either FLA_CHUNK_SIZE or page_size must be divisible by the other, got {FLA_CHUNK_SIZE=}, {self.page_size=}"
+                )
         elif not self.disable_radix_cache:  # no_buffer
             if self.page_size is not None and self.page_size != 1:
                 logger.warning(
@@ -2749,9 +2837,9 @@ def _handle_attention_backend_compatibility(self):
                 "Cuda graph is disabled because of using torch Flex Attention backend"
             )
             self.disable_cuda_graph = True
-            assert (
-                self.speculative_algorithm is None
-            ), "Speculative decoding is currently not supported with Flex Attention backend"
+            assert self.speculative_algorithm is None, (
+                "Speculative decoding is currently not supported with Flex Attention backend"
+            )
 
         # Whisper's encoder token padding conflicts with prefix caching.
         # Only disable for Whisper; other encoder-decoder models (e.g., mllama) use radix cache.
@@ -3097,40 +3185,40 @@ def _handle_linear_attn_backend(self):
     def _handle_context_parallelism(self):
         if self.attn_cp_size > 1:
             # The tp_size is the world size, not the real tensor parallel size
-            assert (
-                self.tp_size % self.attn_cp_size == 0
-            ), "tp_size must be divisible by attn_cp_size"
-            assert (
-                self.tp_size % (self.dp_size * self.attn_cp_size) == 0
-            ), "tp_size must be divisible by dp_size * attn_cp_size"
+            assert self.tp_size % self.attn_cp_size == 0, (
+                "tp_size must be divisible by attn_cp_size"
+            )
+            assert self.tp_size % (self.dp_size * self.attn_cp_size) == 0, (
+                "tp_size must be divisible by dp_size * attn_cp_size"
+            )
 
-            assert (
-                not self.enable_aiter_allreduce_fusion
-            ), "Aiter allreduce fusion is not supported with context parallelism"
+            assert not self.enable_aiter_allreduce_fusion, (
+                "Aiter allreduce fusion is not supported with context parallelism"
+            )
 
         if self.moe_dp_size > 1:
             # The tp_size is the world size, not the real tensor parallel size
-            assert (
-                self.tp_size % self.moe_dp_size == 0
-            ), "tp_size must be divisible by moe_dp_size"
-            assert (
-                self.ep_size * self.moe_dp_size <= self.tp_size
-            ), "ep_size * moe_dp_size must be less than or equal to tp_size"
+            assert self.tp_size % self.moe_dp_size == 0, (
+                "tp_size must be divisible by moe_dp_size"
+            )
+            assert self.ep_size * self.moe_dp_size <= self.tp_size, (
+                "ep_size * moe_dp_size must be less than or equal to tp_size"
+            )
             assert self.pp_size == 1, "PP is not supported with context parallelism"
 
             if self.ep_size > 1:
-                assert (
-                    self.ep_size * self.moe_dp_size == self.tp_size
-                ), "ep_size * moe_dp_size must be equal to tp_size"
+                assert self.ep_size * self.moe_dp_size == self.tp_size, (
+                    "ep_size * moe_dp_size must be equal to tp_size"
+                )
 
-            assert (
-                not self.enable_aiter_allreduce_fusion
-            ), "Aiter allreduce fusion is not supported with context parallelism"
+            assert not self.enable_aiter_allreduce_fusion, (
+                "Aiter allreduce fusion is not supported with context parallelism"
+            )
 
         if self.attn_cp_size != self.moe_dp_size:
-            assert (
-                self.moe_dp_size == 1
-            ), "attn_cp_size != moe_dp_size is only supported when moe_dp_size == 1"
+            assert self.moe_dp_size == 1, (
+                "attn_cp_size != moe_dp_size is only supported when moe_dp_size == 1"
+            )
 
     def _handle_data_parallelism(self):
         if self.dp_size == 1:
@@ -3146,9 +3234,9 @@ def _handle_data_parallelism(self):
             )
 
         if self.enable_dp_lm_head:
-            assert (
-                self.enable_dp_attention
-            ), "Please enable dp attention when setting enable_dp_lm_head. "
+            assert self.enable_dp_attention, (
+                "Please enable dp attention when setting enable_dp_lm_head. "
+            )
 
     def _handle_moe_kernel_config(self):
         if self.quantization == "mxfp8":
@@ -3172,20 +3260,26 @@ def _handle_moe_kernel_config(self):
                 "modelopt_fp8",
                 "modelopt_mixed",
                 None,
-            ], f"Invalid quantization '{self.quantization}'. \nFlashInfer Cutlass MOE supports only: 'modelopt_fp4', 'modelopt_fp8', 'modelopt_mixed', or bfloat16 (None)."
+            ], (
+                f"Invalid quantization '{self.quantization}'. \nFlashInfer Cutlass MOE supports only: 'modelopt_fp4', 'modelopt_fp8', 'modelopt_mixed', or bfloat16 (None)."
+            )
             assert self.ep_size in [
                 1,
                 self.tp_size,
-            ], "The expert parallel size must be 1 or the same as the tensor parallel size"
+            ], (
+                "The expert parallel size must be 1 or the same as the tensor parallel size"
+            )
 
         if self.moe_runner_backend == "flashinfer_cutedsl":
-            assert self.quantization in [
-                "modelopt_fp4"
-            ], f"Invalid quantization '{self.quantization}'. \nFlashInfer CuteDSL MOE currently supports only: 'modelopt_fp4'."
+            assert self.quantization in ["modelopt_fp4"], (
+                f"Invalid quantization '{self.quantization}'. \nFlashInfer CuteDSL MOE currently supports only: 'modelopt_fp4'."
+            )
             assert self.ep_size in [
                 1,
                 self.tp_size,
-            ], "The expert parallel size must be 1 or the same as the tensor parallel size"
+            ], (
+                "The expert parallel size must be 1 or the same as the tensor parallel size"
+            )
             assert self.moe_a2a_backend in [
                 "none",
                 "deepep",
@@ -3208,7 +3302,9 @@ def _handle_moe_kernel_config(self):
                 "modelopt_mixed",
                 "compressed-tensors",
                 None,
-            ], f"Invalid quantization '{self.quantization}'. \nFlashInfer TRTLLM MOE supports only: 'modelopt_fp4', 'fp8', 'modelopt_fp8', 'modelopt_mixed', 'compressed-tensors', or bfloat16 (None)."
+            ], (
+                f"Invalid quantization '{self.quantization}'. \nFlashInfer TRTLLM MOE supports only: 'modelopt_fp4', 'fp8', 'modelopt_fp8', 'modelopt_mixed', 'compressed-tensors', or bfloat16 (None)."
+            )
             self.disable_shared_experts_fusion = True
             logger.warning(
                 "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
@@ -3220,7 +3316,9 @@ def _handle_moe_kernel_config(self):
                 "mxfp8",
                 "modelopt_fp4",
                 None,
-            ], f"Invalid quantization '{self.quantization}'. \nFlashInfer TRTLLM routed MOE supports only: 'fp8', 'mxfp8', 'modelopt_fp4', or bfloat16 (None)."
+            ], (
+                f"Invalid quantization '{self.quantization}'. \nFlashInfer TRTLLM routed MOE supports only: 'fp8', 'mxfp8', 'modelopt_fp4', or bfloat16 (None)."
+            )
             self.disable_shared_experts_fusion = True
             logger.warning(
                 "FlashInfer TRTLLM routed MoE is enabled. --disable-shared-experts-fusion is automatically set."
@@ -3239,9 +3337,9 @@ def _handle_moe_kernel_config(self):
             "fp8",
             "mxfp8",
         ]:
-            assert (
-                self.ep_size == 1
-            ), "FP8/MXFP8 Cutlass MoE is only supported with ep_size == 1"
+            assert self.ep_size == 1, (
+                "FP8/MXFP8 Cutlass MoE is only supported with ep_size == 1"
+            )
 
         # TODO(yuwei): Fix piecewise cuda graph support for bypassed topk MoE backends.
         # Exception: GptOssForCausalLM wraps the entire MoE block in its own
@@ -3328,13 +3426,13 @@ def _handle_a2a_moe(self):
                     f"Wrong value of {fuse_mode=}, the NPU only support 1 or 2."
                 )
             elif fuse_mode == 2:
-                assert (
-                    self.quantization == "modelslim"
-                ), "When fuse_mode is set to 2, the NPU supports only ModelSlim quantization."
+                assert self.quantization == "modelslim", (
+                    "When fuse_mode is set to 2, the NPU supports only ModelSlim quantization."
+                )
         if self.moe_a2a_backend == "flashinfer":
-            assert (
-                self.enable_dp_attention and self.dp_size == self.tp_size
-            ), "Flashinfer MoE A2A is only supported with dp_size == tp_size and --enable-dp-attention"
+            assert self.enable_dp_attention and self.dp_size == self.tp_size, (
+                "Flashinfer MoE A2A is only supported with dp_size == tp_size and --enable-dp-attention"
+            )
             self.ep_size = self.tp_size
             logger.warning(
                 f"Flashinfer MoE A2A is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
@@ -3353,7 +3451,9 @@ def _handle_a2a_moe(self):
             assert self.moe_runner_backend in [
                 "flashinfer_cutlass",
                 "flashinfer_cutedsl",
-            ], "Flashinfer MoE A2A is only supported with flashinfer_cutlass or flashinfer_cutedsl moe runner backend"
+            ], (
+                "Flashinfer MoE A2A is only supported with flashinfer_cutlass or flashinfer_cutedsl moe runner backend"
+            )
             if (
                 self.moe_runner_backend == "flashinfer_cutedsl"
                 and self.max_prefill_tokens is not None
@@ -3401,7 +3501,9 @@ def _handle_a2a_moe(self):
             if self.chunked_prefill_size > 0 and self.disaggregation_mode != "decode":
                 assert (
                     self.chunked_prefill_size
-                ) <= envs.SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK.get(), "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK (default 4096) must be larger or equal to chunked_prefill_size"
+                ) <= envs.SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK.get(), (
+                    "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK (default 4096) must be larger or equal to chunked_prefill_size"
+                )
 
     def _handle_eplb_and_dispatch(self):
         if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
@@ -3426,7 +3528,9 @@ def _handle_elastic_ep(self):
                 assert self.eplb_algorithm in [
                     "elasticity_aware",
                     "elasticity_aware_hierarchical",
-                ], "Elastic EP requires eplb_algorithm to be set to 'auto' or 'elasticity_aware(_hierarchical)'."
+                ], (
+                    "Elastic EP requires eplb_algorithm to be set to 'auto' or 'elasticity_aware(_hierarchical)'."
+                )
 
             assert self.pp_size == 1, "PP size should be set to 1 under elastic EP"
 
@@ -3435,9 +3539,9 @@ def _handle_elastic_ep(self):
                     self.mooncake_ib_device
                 )
         if self.elastic_ep_rejoin:
-            assert (
-                self.elastic_ep_backend is not None
-            ), "Elastic EP rejoin requires elastic_ep_backend to be set."
+            assert self.elastic_ep_backend is not None, (
+                "Elastic EP rejoin requires elastic_ep_backend to be set."
+            )
 
     def _handle_expert_distribution_metrics(self):
         if self.enable_expert_distribution_metrics and (
@@ -3806,9 +3910,9 @@ def _handle_pd_disaggregation(self):
                     self.mamba_scheduler_strategy = "no_buffer"
 
         elif self.disaggregation_mode == "prefill":
-            assert (
-                self.disaggregation_transfer_backend != "fake"
-            ), "Prefill server does not support 'fake' as the transfer backend"
+            assert self.disaggregation_transfer_backend != "fake", (
+                "Prefill server does not support 'fake' as the transfer backend"
+            )
 
             self.disable_cuda_graph = True
 
@@ -7031,9 +7135,9 @@ def mamba_cache_chunk_size(self) -> int:
 
     def check_server_args(self):
         # Check parallel size constraints
-        assert (
-            self.tp_size * self.pp_size
-        ) % self.nnodes == 0, "tp_size must be divisible by number of nodes"
+        assert (self.tp_size * self.pp_size) % self.nnodes == 0, (
+            "tp_size must be divisible by number of nodes"
+        )
 
         assert (
             self.pp_max_micro_batch_size is None or self.pp_max_micro_batch_size >= 1
@@ -7053,7 +7157,9 @@ def check_server_args(self):
         if self.pp_size > 1:
             assert (
                 self.disable_overlap_schedule and self.speculative_algorithm is None
-            ), "Pipeline parallelism is not compatible with overlap schedule, speculative decoding"
+            ), (
+                "Pipeline parallelism is not compatible with overlap schedule, speculative decoding"
+            )
 
         assert not (
             self.dp_size > 1 and self.nnodes != 1 and not self.enable_dp_attention
@@ -7080,32 +7186,32 @@ def check_server_args(self):
 
         # Check speculative decoding
         if self.speculative_algorithm is not None:
-            assert (
-                not self.enable_mixed_chunk
-            ), "enable_mixed_chunk is required for speculative decoding"
+            assert not self.enable_mixed_chunk, (
+                "enable_mixed_chunk is required for speculative decoding"
+            )
 
         # Check chunked prefill
         # Skip validation if chunked prefill is disabled (i.e., size <= 0).
         # Skip validation if disaggregation mode is decode.
         if self.chunked_prefill_size > 0 and self.disaggregation_mode != "decode":
-            assert (
-                self.chunked_prefill_size % self.page_size == 0
-            ), "chunked_prefill_size must be divisible by page_size"
+            assert self.chunked_prefill_size % self.page_size == 0, (
+                "chunked_prefill_size must be divisible by page_size"
+            )
 
         # Check pdmux
         if self.enable_pdmux:
-            assert (
-                self.pp_size == 1
-            ), "PD-Multiplexing is only supported with pipeline parallelism disabled (pp_size=1)."
-            assert (
-                self.chunked_prefill_size == -1
-            ), "PD-Multiplexing is not compatible with chunked prefill."
-            assert (
-                self.disaggregation_mode == "null"
-            ), "PD-Multiplexing is not compatible with disaggregation mode."
-            assert (
-                self.disable_overlap_schedule
-            ), "PD-Multiplexing is not compatible with overlap schedule."
+            assert self.pp_size == 1, (
+                "PD-Multiplexing is only supported with pipeline parallelism disabled (pp_size=1)."
+            )
+            assert self.chunked_prefill_size == -1, (
+                "PD-Multiplexing is not compatible with chunked prefill."
+            )
+            assert self.disaggregation_mode == "null", (
+                "PD-Multiplexing is not compatible with disaggregation mode."
+            )
+            assert self.disable_overlap_schedule, (
+                "PD-Multiplexing is not compatible with overlap schedule."
+            )
 
             # NOTE: CUDA Green Context may encounter potential issues with CudaGraph on torch 2.7.x – 2.8.x, leading to performance degradation.
             import torch
@@ -7131,7 +7237,9 @@ def check_server_args(self):
             assert self.schedule_policy in [
                 "fcfs",
                 "lof",
-            ], f"To use priority scheduling, schedule_policy must be 'fcfs' or 'lof'. '{self.schedule_policy}' is not supported."
+            ], (
+                f"To use priority scheduling, schedule_policy must be 'fcfs' or 'lof'. '{self.schedule_policy}' is not supported."
+            )
             if self.default_priority_value is None:
                 logger.warning(
                     "--default-priority-value is not set while --enable-priority-scheduling is enabled. "
@@ -7153,9 +7261,9 @@ def check_server_args(self):
 
         validate_hisparse(self)
 
-        assert (
-            self.schedule_conservativeness >= 0
-        ), "schedule_conservativeness must be non-negative"
+        assert self.schedule_conservativeness >= 0, (
+            "schedule_conservativeness must be non-negative"
+        )
 
         if self.model_impl == "mindspore":
             assert is_npu(), "MindSpore model impl is only supported on Ascend npu."
@@ -7274,9 +7382,9 @@ def check_lora_server_args(self):
                                 pinned=False,
                             )
                     elif isinstance(lora_path, dict):
-                        assert (
-                            "lora_name" in lora_path and "lora_path" in lora_path
-                        ), f"When providing LoRA paths as a list of dict, each dict should contain 'lora_name' and 'lora_path' keys. Got: {lora_path}"
+                        assert "lora_name" in lora_path and "lora_path" in lora_path, (
+                            f"When providing LoRA paths as a list of dict, each dict should contain 'lora_name' and 'lora_path' keys. Got: {lora_path}"
+                        )
                         lora_ref = LoRARef(
                             lora_id=LoRARef.deterministic_id(
                                 lora_path["lora_name"], lora_path["lora_path"]
@@ -7314,14 +7422,16 @@ def check_lora_server_args(self):
             if self.lora_target_modules:
                 self.lora_target_modules = set(self.lora_target_modules)
                 if "all" in self.lora_target_modules:
-                    assert (
-                        len(self.lora_target_modules) == 1
-                    ), "If 'all' is specified in --lora-target-modules, it should be the only module specified."
+                    assert len(self.lora_target_modules) == 1, (
+                        "If 'all' is specified in --lora-target-modules, it should be the only module specified."
+                    )
 
             # Ensure sufficient information is provided for LoRA initialization.
             assert self.lora_paths or (
                 self.max_lora_rank and self.lora_target_modules
-            ), "When no initial --lora-paths is provided, you need to specify both --max-lora-rank and --lora-target-modules for LoRA initialization."
+            ), (
+                "When no initial --lora-paths is provided, you need to specify both --max-lora-rank and --lora-target-modules for LoRA initialization."
+            )
 
             # Validate max_loaded_loras
             if self.max_loaded_loras is not None:
@@ -7343,9 +7453,9 @@ def check_lora_server_args(self):
             if self.lora_use_virtual_experts:
                 logger.info("Virtual expert computation enabled.")
 
-            assert (
-                self.lora_drain_wait_threshold >= 0.0
-            ), "--lora-drain-wait-threshold must be non-negative."
+            assert self.lora_drain_wait_threshold >= 0.0, (
+                "--lora-drain-wait-threshold must be non-negative."
+            )
 
     def validate_buckets_rule(self, arg_name: str, buckets_rule: List[str]):
         if not buckets_rule:
@@ -7357,43 +7467,45 @@ def validate_buckets_rule(self, arg_name: str, buckets_rule: List[str]):
             "tse",
             "default",
             "custom",
-        ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', 'custom'"
+        ], (
+            f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', 'custom'"
+        )
 
         if rule == "tse":
-            assert (
-                len(buckets_rule) == 4
-            ), f"{arg_name} TSE rule requires exactly 4 parameters: ['tse', middle, base, count], got {len(buckets_rule)}"
+            assert len(buckets_rule) == 4, (
+                f"{arg_name} TSE rule requires exactly 4 parameters: ['tse', middle, base, count], got {len(buckets_rule)}"
+            )
             try:
                 middle = float(buckets_rule[1])
                 base = float(buckets_rule[2])
                 count = int(buckets_rule[3])
             except (ValueError, IndexError):
-                assert (
-                    False
-                ), f"{arg_name} TSE rule parameters must be: ['tse', <float:middle>, <float:base>, <int:count>]"
+                assert False, (
+                    f"{arg_name} TSE rule parameters must be: ['tse', <float:middle>, <float:base>, <int:count>]"
+                )
             assert base > 1, f"{arg_name} TSE base must be larger than 1, got: {base}"
             assert count > 0, f"{arg_name} TSE count must be positive, got: {count}"
             assert middle > 0, f"{arg_name} TSE middle must be positive, got: {middle}"
 
         elif rule == "default":
-            assert (
-                len(buckets_rule) == 1
-            ), f"{arg_name} default rule should only have one parameter: ['default'], got {len(buckets_rule)}"
+            assert len(buckets_rule) == 1, (
+                f"{arg_name} default rule should only have one parameter: ['default'], got {len(buckets_rule)}"
+            )
 
         elif rule == "custom":
-            assert (
-                len(buckets_rule) >= 2
-            ), f"{arg_name} custom rule requires at least one bucket value: ['custom', value1, ...]"
+            assert len(buckets_rule) >= 2, (
+                f"{arg_name} custom rule requires at least one bucket value: ['custom', value1, ...]"
+            )
             try:
                 bucket_values = [float(x) for x in buckets_rule[1:]]
             except ValueError:
                 assert False, f"{arg_name} custom rule bucket values must be numeric"
-            assert len(set(bucket_values)) == len(
-                bucket_values
-            ), f"{arg_name} custom rule bucket values should not contain duplicates"
-            assert all(
-                val >= 0 for val in bucket_values
-            ), f"{arg_name} custom rule bucket values should be non-negative"
+            assert len(set(bucket_values)) == len(bucket_values), (
+                f"{arg_name} custom rule bucket values should not contain duplicates"
+            )
+            assert all(val >= 0 for val in bucket_values), (
+                f"{arg_name} custom rule bucket values should be non-negative"
+            )
 
     def adjust_mem_fraction_for_vlm(self, model_config):
         vision_config = getattr(model_config.hf_config, "vision_config", None)