diff --git a/docker/Dockerfile b/docker/Dockerfile
index 2e57ed442e20..ae251746bb9d 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -20,7 +20,7 @@ ARG PIP_DEFAULT_INDEX
 ARG UBUNTU_MIRROR
 ARG GITHUB_ARTIFACTORY=github.com
 ARG INSTALL_FLASHINFER_JIT_CACHE=0
-ARG FLASHINFER_VERSION=0.6.8.post1
+ARG FLASHINFER_VERSION=0.6.11
 ARG MOONCAKE_VERSION=0.3.10.post2
 #if need other arg please add in MOONCAKE_COMPILE_ARG
 ARG MOONCAKE_COMPILE_ARG="-DUSE_HTTP=ON -DUSE_MNNVL=ON -DUSE_CUDA=ON -DWITH_EP=ON"
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 477b2fe8e1f3..59cbd8e45aa8 100755
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -27,8 +27,8 @@ dependencies = [
   "datasets",
   "einops",
   "fastapi",
-  "flashinfer_python==0.6.8.post1", # keep it aligned with jit-cache version in Dockerfile
-  "flashinfer_cubin==0.6.8.post1",
+  "flashinfer_python==0.6.11", # keep it aligned with jit-cache version in Dockerfile
+  "flashinfer_cubin==0.6.11",
   "gguf",
   "interegular",
   "llguidance>=0.7.11,<0.8.0",
@@ -37,7 +37,7 @@ dependencies = [
   "ninja",
   "easydict",  # Required by remote model code (e.g. DeepSeek-OCR) loaded via trust_remote_code; validated by transformers 5.4+ check_imports
   "numpy",
-  "nvidia-cutlass-dsl==4.4.2",
+  "nvidia-cutlass-dsl==4.5.0",
   "nvidia-ml-py",
   "openai-harmony==0.0.4",
   "openai==2.6.1",
@@ -53,7 +53,7 @@ dependencies = [
   "pydantic",
   "python-multipart",
   "pyzmq>=25.1.2",
-  "quack-kernels>=0.3.0",
+  "quack-kernels>=0.4.1",
   "requests",
   "scipy",
   "sentencepiece",
diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py
index a94f9d7ed26d..6ce2867acd57 100644
--- a/python/sglang/srt/entrypoints/engine.py
+++ b/python/sglang/srt/entrypoints/engine.py
@@ -1200,7 +1200,7 @@ def _set_envs_and_config(server_args: ServerArgs):
         if server_args.attention_backend == "flashinfer":
             assert_pkg_version(
                 "flashinfer_python",
-                "0.6.8.post1",
+                "0.6.11",
                 "Please uninstall the old version and "
                 "reinstall the latest version by following the instructions "
                 "at https://docs.flashinfer.ai/installation.html.",
diff --git a/python/sglang/srt/layers/flashinfer_comm_fusion.py b/python/sglang/srt/layers/flashinfer_comm_fusion.py
index bca28f3e211c..c66c2cda4d8a 100644
--- a/python/sglang/srt/layers/flashinfer_comm_fusion.py
+++ b/python/sglang/srt/layers/flashinfer_comm_fusion.py
@@ -383,6 +383,11 @@ def initialize(
                 hidden_dim=hidden_dim,
                 dtype=dtype,
                 force_oneshot_support=bool(use_oneshot),
+                # Pin the symmetric-memory rendezvous to the actual
+                # subgroup. Without this, flashinfer >=0.6.10 falls back
+                # to WORLD and TP/EP/CP subgroup peers get addressed
+                # incorrectly (kernel hangs in cuda-graph warmup).
+                group=device_group,
             )
             if (
                 _TorchDistBackend is not None
@@ -515,8 +520,6 @@ def ensure_workspace_initialized(
     if not is_flashinfer_available() or _flashinfer_comm is None:
         return False
 
-    tp_coordinator = get_tp_group()
-
     if use_attn_tp_group:
         world_size = get_attn_tensor_model_parallel_world_size()
         rank = get_attn_tensor_model_parallel_rank()
@@ -531,17 +534,12 @@ def ensure_workspace_initialized(
             rank = get_moe_tensor_parallel_rank()
             coordinator = get_moe_tp_group()
 
-    # When the sub-group IS the full TP group, pass None so the workspace
-    # uses the default process group directly (no TorchDistBackend needed).
-    # For true sub-groups, use NCCL device_group for GPU/device mapping and
-    # GLOO cpu_group for metadata broadcasts (avoids NCCL collectives that
-    # interfere with CUDA graph capture).
-    if coordinator.device_group is tp_coordinator.device_group:
-        device_group = None
-        cpu_group = None
-    else:
-        device_group = coordinator.device_group
-        cpu_group = coordinator.cpu_group
+    # Always pass the coordinator's groups: flashinfer >=0.6.10 reads the
+    # rendezvous group from `group=...` (falling back to WORLD when None),
+    # so leaving it None silently rendezvouses on WORLD and the kernel ends
+    # up addressing the wrong peers in TP/EP/CP subgroup setups.
+    device_group = coordinator.device_group
+    cpu_group = coordinator.cpu_group
 
     if world_size <= 1:
         return False
diff --git a/python/sglang/srt/layers/quantization/fp4_utils.py b/python/sglang/srt/layers/quantization/fp4_utils.py
index 96409750cbf7..a7a64f25e99e 100644
--- a/python/sglang/srt/layers/quantization/fp4_utils.py
+++ b/python/sglang/srt/layers/quantization/fp4_utils.py
@@ -34,13 +34,13 @@ def _flashinfer_fp4_quantize_impl(
         enable_pdl: Optional[bool] = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         return _flashinfer_fp4_quantize(
-            input,
-            global_scale,
-            sf_vec_size,
-            sf_use_ue8m0,
-            is_sf_swizzled_layout,
-            is_sf_8x4_layout,
-            enable_pdl,
+            input=input,
+            global_scale=global_scale,
+            sf_vec_size=sf_vec_size,
+            sf_use_ue8m0=sf_use_ue8m0,
+            is_sf_swizzled_layout=is_sf_swizzled_layout,
+            is_sf_8x4_layout=is_sf_8x4_layout,
+            enable_pdl=enable_pdl,
             backend=_flashinfer_fp4_quantize_backend,
         )
 
diff --git a/python/sglang/srt/utils/common.py b/python/sglang/srt/utils/common.py
index c91fcdcfbebf..09d86aae35d7 100644
--- a/python/sglang/srt/utils/common.py
+++ b/python/sglang/srt/utils/common.py
@@ -1077,7 +1077,7 @@ def check_pkg_version_at_least(pkg: str, min_version: str) -> bool:
 
     Args:
         pkg: Package name (distribution name, e.g., "flashinfer-python")
-        min_version: Minimum version required (e.g., "0.6.8.post1")
+        min_version: Minimum version required (e.g., "0.6.11")
 
     Returns:
         True if package is installed and version >= min_version, False otherwise
diff --git a/test/registered/moe/test_cutedsl_moe.py b/test/registered/moe/test_cutedsl_moe.py
index fcc0ff0e2911..b29a48e3d9e8 100644
--- a/test/registered/moe/test_cutedsl_moe.py
+++ b/test/registered/moe/test_cutedsl_moe.py
@@ -899,13 +899,14 @@ def test_v1_masked_kernel_bf16_input(self):
                         masked_m.to(hidden_states.device),
                     )
 
+                    a_global_scale = input_global_scale[:1]
                     a_fp4, a_scale_interleaved = fp4_quantize(
-                        hidden_states, input_global_scale
+                        hidden_states, a_global_scale
                     )
                     a_in_dtype = dequantize_nvfp4_to_dtype(
                         a_fp4,
                         a_scale_interleaved,
-                        input_global_scale,
+                        a_global_scale,
                         dtype=hidden_states.dtype,
                         device=hidden_states.device,
                         block_size=16,
@@ -1077,11 +1078,12 @@ def test_v1_masked_kernel_rejects_v2_w13_layout(self):
                 masked_m.to(device),
             )
 
-            a_fp4, a_scale_interleaved = fp4_quantize(hidden_states, input_global_scale)
+            a_global_scale = input_global_scale[:1]
+            a_fp4, a_scale_interleaved = fp4_quantize(hidden_states, a_global_scale)
             a_in_dtype = dequantize_nvfp4_to_dtype(
                 a_fp4,
                 a_scale_interleaved,
-                input_global_scale,
+                a_global_scale,
                 dtype=hidden_states.dtype,
                 device=device,
                 block_size=16,
@@ -1251,11 +1253,12 @@ def test_v1_masked_kernel_fp4_input(self):
             )
 
             # PyTorch reference (same as the bf16 input test)
-            a_fp4, a_scale = fp4_quantize(hidden_states, input_gs)
+            a_gs = input_gs[:1]
+            a_fp4, a_scale = fp4_quantize(hidden_states, a_gs)
             a_deq = dequantize_nvfp4_to_dtype(
                 a_fp4,
                 a_scale,
-                input_gs,
+                a_gs,
                 dtype=torch.bfloat16,
                 device=device,
                 block_size=16,