diff --git a/docker/Dockerfile b/docker/Dockerfile
index 699e978760fd..1e9665a04219 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -19,7 +19,7 @@ ARG PIP_DEFAULT_INDEX
 ARG UBUNTU_MIRROR
 ARG GITHUB_ARTIFACTORY=github.com
 ARG INSTALL_FLASHINFER_JIT_CACHE=0
-ARG FLASHINFER_VERSION=0.6.11.post1
+ARG FLASHINFER_VERSION=0.6.8.post1
 ARG MOONCAKE_VERSION=0.3.10.post2
 #if need other arg please add in MOONCAKE_COMPILE_ARG
 ARG MOONCAKE_COMPILE_ARG="-DUSE_HTTP=ON -DUSE_MNNVL=ON -DUSE_CUDA=ON -DWITH_EP=ON"
diff --git a/python/pyproject.toml b/python/pyproject.toml
index c1a902d7aacb..6e496bad2a38 100755
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -27,8 +27,8 @@ dependencies = [
   "datasets",
   "einops",
   "fastapi",
-  "flashinfer_python==0.6.11.post1", # keep it aligned with jit-cache version in Dockerfile
-  "flashinfer_cubin==0.6.11.post1",
+  "flashinfer_python==0.6.8.post1", # keep it aligned with jit-cache version in Dockerfile
+  "flashinfer_cubin==0.6.8.post1",
   "gguf",
   "interegular",
   "llguidance>=0.7.11,<0.8.0",
@@ -37,7 +37,7 @@ dependencies = [
   "ninja",
   "easydict",  # Required by remote model code (e.g. DeepSeek-OCR) loaded via trust_remote_code; validated by transformers 5.4+ check_imports
   "numpy",
-  "nvidia-cutlass-dsl==4.5.0",
+  "nvidia-cutlass-dsl==4.4.2",
   "nvidia-ml-py",
   "openai-harmony==0.0.4",
   "openai==2.6.1",
@@ -53,7 +53,7 @@ dependencies = [
   "pydantic",
   "python-multipart",
   "pyzmq>=25.1.2",
-  "quack-kernels>=0.4.1",
+  "quack-kernels>=0.3.0",
   "requests",
   "scipy",
   "sentencepiece",
diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py
index cc35d2e3c1d9..f1788a03f00e 100644
--- a/python/sglang/srt/entrypoints/engine.py
+++ b/python/sglang/srt/entrypoints/engine.py
@@ -1201,7 +1201,7 @@ def _set_envs_and_config(server_args: ServerArgs):
         if server_args.attention_backend == "flashinfer":
             assert_pkg_version(
                 "flashinfer_python",
-                "0.6.11.post1",
+                "0.6.8.post1",
                 "Please uninstall the old version and "
                 "reinstall the latest version by following the instructions "
                 "at https://docs.flashinfer.ai/installation.html.",
diff --git a/python/sglang/srt/layers/flashinfer_comm_fusion.py b/python/sglang/srt/layers/flashinfer_comm_fusion.py
index c66c2cda4d8a..bca28f3e211c 100644
--- a/python/sglang/srt/layers/flashinfer_comm_fusion.py
+++ b/python/sglang/srt/layers/flashinfer_comm_fusion.py
@@ -383,11 +383,6 @@ def initialize(
                 hidden_dim=hidden_dim,
                 dtype=dtype,
                 force_oneshot_support=bool(use_oneshot),
-                # Pin the symmetric-memory rendezvous to the actual
-                # subgroup. Without this, flashinfer >=0.6.10 falls back
-                # to WORLD and TP/EP/CP subgroup peers get addressed
-                # incorrectly (kernel hangs in cuda-graph warmup).
-                group=device_group,
             )
             if (
                 _TorchDistBackend is not None
@@ -520,6 +515,8 @@ def ensure_workspace_initialized(
     if not is_flashinfer_available() or _flashinfer_comm is None:
         return False
 
+    tp_coordinator = get_tp_group()
+
     if use_attn_tp_group:
         world_size = get_attn_tensor_model_parallel_world_size()
         rank = get_attn_tensor_model_parallel_rank()
@@ -534,12 +531,17 @@ def ensure_workspace_initialized(
             rank = get_moe_tensor_parallel_rank()
             coordinator = get_moe_tp_group()
 
-    # Always pass the coordinator's groups: flashinfer >=0.6.10 reads the
-    # rendezvous group from `group=...` (falling back to WORLD when None),
-    # so leaving it None silently rendezvouses on WORLD and the kernel ends
-    # up addressing the wrong peers in TP/EP/CP subgroup setups.
-    device_group = coordinator.device_group
-    cpu_group = coordinator.cpu_group
+    # When the sub-group IS the full TP group, pass None so the workspace
+    # uses the default process group directly (no TorchDistBackend needed).
+    # For true sub-groups, use NCCL device_group for GPU/device mapping and
+    # GLOO cpu_group for metadata broadcasts (avoids NCCL collectives that
+    # interfere with CUDA graph capture).
+    if coordinator.device_group is tp_coordinator.device_group:
+        device_group = None
+        cpu_group = None
+    else:
+        device_group = coordinator.device_group
+        cpu_group = coordinator.cpu_group
 
     if world_size <= 1:
         return False
diff --git a/python/sglang/srt/layers/quantization/fp4_utils.py b/python/sglang/srt/layers/quantization/fp4_utils.py
index a7a64f25e99e..96409750cbf7 100644
--- a/python/sglang/srt/layers/quantization/fp4_utils.py
+++ b/python/sglang/srt/layers/quantization/fp4_utils.py
@@ -34,13 +34,13 @@ def _flashinfer_fp4_quantize_impl(
         enable_pdl: Optional[bool] = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         return _flashinfer_fp4_quantize(
-            input=input,
-            global_scale=global_scale,
-            sf_vec_size=sf_vec_size,
-            sf_use_ue8m0=sf_use_ue8m0,
-            is_sf_swizzled_layout=is_sf_swizzled_layout,
-            is_sf_8x4_layout=is_sf_8x4_layout,
-            enable_pdl=enable_pdl,
+            input,
+            global_scale,
+            sf_vec_size,
+            sf_use_ue8m0,
+            is_sf_swizzled_layout,
+            is_sf_8x4_layout,
+            enable_pdl,
             backend=_flashinfer_fp4_quantize_backend,
         )
 
diff --git a/python/sglang/srt/utils/common.py b/python/sglang/srt/utils/common.py
index b21d32db7835..0ac387e3210c 100644
--- a/python/sglang/srt/utils/common.py
+++ b/python/sglang/srt/utils/common.py
@@ -1105,7 +1105,7 @@ def check_pkg_version_at_least(pkg: str, min_version: str) -> bool:
 
     Args:
         pkg: Package name (distribution name, e.g., "flashinfer-python")
-        min_version: Minimum version required (e.g., "0.6.11.post1")
+        min_version: Minimum version required (e.g., "0.6.8.post1")
 
     Returns:
         True if package is installed and version >= min_version, False otherwise
diff --git a/test/registered/moe/test_cutedsl_moe.py b/test/registered/moe/test_cutedsl_moe.py
index 52139ed3c289..3ef21a261b7b 100644
--- a/test/registered/moe/test_cutedsl_moe.py
+++ b/test/registered/moe/test_cutedsl_moe.py
@@ -899,14 +899,13 @@ def test_v1_masked_kernel_bf16_input(self):
                         masked_m.to(hidden_states.device),
                     )
 
-                    a_global_scale = input_global_scale[:1]
                     a_fp4, a_scale_interleaved = fp4_quantize(
-                        hidden_states, a_global_scale
+                        hidden_states, input_global_scale
                     )
                     a_in_dtype = dequantize_nvfp4_to_dtype(
                         a_fp4,
                         a_scale_interleaved,
-                        a_global_scale,
+                        input_global_scale,
                         dtype=hidden_states.dtype,
                         device=hidden_states.device,
                         block_size=16,
@@ -1078,12 +1077,11 @@ def test_v1_masked_kernel_rejects_v2_w13_layout(self):
                 masked_m.to(device),
             )
 
-            a_global_scale = input_global_scale[:1]
-            a_fp4, a_scale_interleaved = fp4_quantize(hidden_states, a_global_scale)
+            a_fp4, a_scale_interleaved = fp4_quantize(hidden_states, input_global_scale)
             a_in_dtype = dequantize_nvfp4_to_dtype(
                 a_fp4,
                 a_scale_interleaved,
-                a_global_scale,
+                input_global_scale,
                 dtype=hidden_states.dtype,
                 device=device,
                 block_size=16,
@@ -1253,12 +1251,11 @@ def test_v1_masked_kernel_fp4_input(self):
             )
 
             # PyTorch reference (same as the bf16 input test)
-            a_gs = input_gs[:1]
-            a_fp4, a_scale = fp4_quantize(hidden_states, a_gs)
+            a_fp4, a_scale = fp4_quantize(hidden_states, input_gs)
             a_deq = dequantize_nvfp4_to_dtype(
                 a_fp4,
                 a_scale,
-                a_gs,
+                input_gs,
                 dtype=torch.bfloat16,
                 device=device,
                 block_size=16,