sgl-project · Fridge003 · Jan 16, 2026 · Dec 21, 2025 · Jan 8, 2026 · Jan 8, 2026
@@ -19,7 +19,7 @@ ARG PIP_DEFAULT_INDEX
 ARG UBUNTU_MIRROR
 ARG GITHUB_ARTIFACTORY=github.com
 ARG INSTALL_FLASHINFER_JIT_CACHE=0
-ARG FLASHINFER_VERSION=0.5.3
+ARG FLASHINFER_VERSION=0.6.1
 
 ENV DEBIAN_FRONTEND=noninteractive \
     CUDA_HOME=/usr/local/cuda \
@@ -304,7 +304,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 
 # Patching packages for CUDA 12/13 compatibility
 # TODO: Remove when torch version covers these packages
-# TODO: Move cutlass-dsl to pyproject.toml after drivers on CI runners are updated
 RUN --mount=type=cache,target=/root/.cache/pip if [ "${CUDA_VERSION%%.*}" = "12" ]; then \
     python3 -m pip install nvidia-nccl-cu12==2.28.3 --force-reinstall --no-deps ; \
     python3 -m pip install nvidia-cudnn-cu12==9.16.0.29 --force-reinstall --no-deps ; \

@@ -28,8 +28,8 @@ dependencies = [
   "datasets",
   "einops",
   "fastapi",
-  "flashinfer_python==0.5.3", # keep it aligned with jit-cache version in Dockerfile
-  "flashinfer_cubin==0.5.3",
+  "flashinfer_python==0.6.1", # keep it aligned with jit-cache version in Dockerfile
+  "flashinfer_cubin==0.6.1",
   "gguf",
   "hf_transfer",
   "huggingface_hub",

@@ -800,7 +800,7 @@ def _set_envs_and_config(server_args: ServerArgs):
         if server_args.attention_backend == "flashinfer":
             assert_pkg_version(
                 "flashinfer_python",
-                "0.5.3",
+                "0.6.1",
                 "Please uninstall the old version and "
                 "reinstall the latest version by following the instructions "
                 "at https://docs.flashinfer.ai/installation.html.",

@@ -1299,7 +1299,6 @@ def forward_impl(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
             local_expert_offset=self.moe_ep_rank * self.num_local_experts,
             local_num_experts=self.num_local_experts,
             routed_scaling_factor=self.moe_runner_config.routed_scaling_factor,
-            tile_tokens_dim=None,
             # Respect the routing method configured for this layer (e.g., Renormalize for Qwen3),
             # instead of always assuming DeepSeekV3.
             routing_method_type=(

@@ -190,7 +190,6 @@ def fused_experts_none_to_flashinfer_trtllm_fp8(
                     if runner_config.routed_scaling_factor is not None
                     else 1.0
                 ),
-                tile_tokens_dim=None,
                 routing_method_type=routing_method_type,
                 use_shuffled_weight=False,
                 tune_max_num_tokens=next_power_of_2(a_q.shape[0]),

@@ -537,7 +537,6 @@ def apply_with_router_logits(
             local_expert_offset=layer.moe_ep_rank * layer.num_local_experts,
             local_num_experts=layer.num_local_experts,
             routed_scaling_factor=routed_scaling_factor,
-            tile_tokens_dim=None,
             routing_method_type=layer.routing_method_type,
             do_finalize=True,
             tune_max_num_tokens=next_power_of_2(hs_fp4.shape[0]),

@@ -783,7 +783,6 @@ def apply(
                         else 1.0
                     ),
                     use_routing_scales_on_input=use_routing_scales_on_input,
-                    tile_tokens_dim=None,
                     routing_method_type=routing_method_type,
                     tune_max_num_tokens=next_power_of_2(x.shape[0]),
                 )

@@ -674,7 +674,6 @@ def apply(
                 layer.moe_ep_rank * layer.num_local_experts,  # local_expert_offset
                 layer.num_local_experts,  # local num experts
                 None,
-                None,  # tile_tokens_dim
                 1,  # routing_method_type, renormalize
                 True,  # do finalize
                 tune_max_num_tokens=next_power_of_2(x_quant.shape[0]),

diff --git a/python/sglang/srt/utils/common.py b/python/sglang/srt/utils/common.py
@@ -2743,6 +2743,7 @@ def is_fa3_default_architecture(hf_config):
         "Olmo2ForCausalLM",
         "Gemma2ForCausalLM",
         "Gemma3ForConditionalGeneration",
+        "MixtralForCausalLM",
         "Qwen2ForCausalLM",
         "Qwen3ForCausalLM",
         "Qwen3MoeForCausalLM",

diff --git a/scripts/ci/ci_install_dependency.sh b/scripts/ci/ci_install_dependency.sh
@@ -5,7 +5,7 @@ set -euxo pipefail
 # Set up environment variables
 IS_BLACKWELL=${IS_BLACKWELL:-0}
 CU_VERSION="cu129"
-FLASHINFER_VERSION=0.5.3
+FLASHINFER_VERSION=0.6.1
 OPTIONAL_DEPS="${1:-}"
 
 # Detect system architecture