diff --git a/docker/Dockerfile b/docker/Dockerfile index 699e978760fd..1e9665a04219 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -19,7 +19,7 @@ ARG PIP_DEFAULT_INDEX ARG UBUNTU_MIRROR ARG GITHUB_ARTIFACTORY=github.com ARG INSTALL_FLASHINFER_JIT_CACHE=0 -ARG FLASHINFER_VERSION=0.6.11.post1 +ARG FLASHINFER_VERSION=0.6.8.post1 ARG MOONCAKE_VERSION=0.3.10.post2 #if need other arg please add in MOONCAKE_COMPILE_ARG ARG MOONCAKE_COMPILE_ARG="-DUSE_HTTP=ON -DUSE_MNNVL=ON -DUSE_CUDA=ON -DWITH_EP=ON" diff --git a/python/pyproject.toml b/python/pyproject.toml index c1a902d7aacb..6e496bad2a38 100755 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -27,8 +27,8 @@ dependencies = [ "datasets", "einops", "fastapi", - "flashinfer_python==0.6.11.post1", # keep it aligned with jit-cache version in Dockerfile - "flashinfer_cubin==0.6.11.post1", + "flashinfer_python==0.6.8.post1", # keep it aligned with jit-cache version in Dockerfile + "flashinfer_cubin==0.6.8.post1", "gguf", "interegular", "llguidance>=0.7.11,<0.8.0", @@ -37,7 +37,7 @@ dependencies = [ "ninja", "easydict", # Required by remote model code (e.g. DeepSeek-OCR) loaded via trust_remote_code; validated by transformers 5.4+ check_imports "numpy", - "nvidia-cutlass-dsl==4.5.0", + "nvidia-cutlass-dsl==4.4.2", "nvidia-ml-py", "openai-harmony==0.0.4", "openai==2.6.1", @@ -53,7 +53,7 @@ dependencies = [ "pydantic", "python-multipart", "pyzmq>=25.1.2", - "quack-kernels>=0.4.1", + "quack-kernels>=0.3.0", "requests", "scipy", "sentencepiece", diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py index cc35d2e3c1d9..f1788a03f00e 100644 --- a/python/sglang/srt/entrypoints/engine.py +++ b/python/sglang/srt/entrypoints/engine.py @@ -1201,7 +1201,7 @@ def _set_envs_and_config(server_args: ServerArgs): if server_args.attention_backend == "flashinfer": assert_pkg_version( "flashinfer_python", - "0.6.11.post1", + "0.6.8.post1", "Please uninstall the old version and " "reinstall the latest version by following the instructions " "at https://docs.flashinfer.ai/installation.html.", diff --git a/python/sglang/srt/layers/flashinfer_comm_fusion.py b/python/sglang/srt/layers/flashinfer_comm_fusion.py index c66c2cda4d8a..bca28f3e211c 100644 --- a/python/sglang/srt/layers/flashinfer_comm_fusion.py +++ b/python/sglang/srt/layers/flashinfer_comm_fusion.py @@ -383,11 +383,6 @@ def initialize( hidden_dim=hidden_dim, dtype=dtype, force_oneshot_support=bool(use_oneshot), - # Pin the symmetric-memory rendezvous to the actual - # subgroup. Without this, flashinfer >=0.6.10 falls back - # to WORLD and TP/EP/CP subgroup peers get addressed - # incorrectly (kernel hangs in cuda-graph warmup). - group=device_group, ) if ( _TorchDistBackend is not None @@ -520,6 +515,8 @@ def ensure_workspace_initialized( if not is_flashinfer_available() or _flashinfer_comm is None: return False + tp_coordinator = get_tp_group() + if use_attn_tp_group: world_size = get_attn_tensor_model_parallel_world_size() rank = get_attn_tensor_model_parallel_rank() @@ -534,12 +531,17 @@ def ensure_workspace_initialized( rank = get_moe_tensor_parallel_rank() coordinator = get_moe_tp_group() - # Always pass the coordinator's groups: flashinfer >=0.6.10 reads the - # rendezvous group from `group=...` (falling back to WORLD when None), - # so leaving it None silently rendezvouses on WORLD and the kernel ends - # up addressing the wrong peers in TP/EP/CP subgroup setups. - device_group = coordinator.device_group - cpu_group = coordinator.cpu_group + # When the sub-group IS the full TP group, pass None so the workspace + # uses the default process group directly (no TorchDistBackend needed). + # For true sub-groups, use NCCL device_group for GPU/device mapping and + # GLOO cpu_group for metadata broadcasts (avoids NCCL collectives that + # interfere with CUDA graph capture). + if coordinator.device_group is tp_coordinator.device_group: + device_group = None + cpu_group = None + else: + device_group = coordinator.device_group + cpu_group = coordinator.cpu_group if world_size <= 1: return False diff --git a/python/sglang/srt/layers/quantization/fp4_utils.py b/python/sglang/srt/layers/quantization/fp4_utils.py index a7a64f25e99e..96409750cbf7 100644 --- a/python/sglang/srt/layers/quantization/fp4_utils.py +++ b/python/sglang/srt/layers/quantization/fp4_utils.py @@ -34,13 +34,13 @@ def _flashinfer_fp4_quantize_impl( enable_pdl: Optional[bool] = None, ) -> tuple[torch.Tensor, torch.Tensor]: return _flashinfer_fp4_quantize( - input=input, - global_scale=global_scale, - sf_vec_size=sf_vec_size, - sf_use_ue8m0=sf_use_ue8m0, - is_sf_swizzled_layout=is_sf_swizzled_layout, - is_sf_8x4_layout=is_sf_8x4_layout, - enable_pdl=enable_pdl, + input, + global_scale, + sf_vec_size, + sf_use_ue8m0, + is_sf_swizzled_layout, + is_sf_8x4_layout, + enable_pdl, backend=_flashinfer_fp4_quantize_backend, ) diff --git a/python/sglang/srt/utils/common.py b/python/sglang/srt/utils/common.py index b21d32db7835..0ac387e3210c 100644 --- a/python/sglang/srt/utils/common.py +++ b/python/sglang/srt/utils/common.py @@ -1105,7 +1105,7 @@ def check_pkg_version_at_least(pkg: str, min_version: str) -> bool: Args: pkg: Package name (distribution name, e.g., "flashinfer-python") - min_version: Minimum version required (e.g., "0.6.11.post1") + min_version: Minimum version required (e.g., "0.6.8.post1") Returns: True if package is installed and version >= min_version, False otherwise diff --git a/test/registered/moe/test_cutedsl_moe.py b/test/registered/moe/test_cutedsl_moe.py index 52139ed3c289..3ef21a261b7b 100644 --- a/test/registered/moe/test_cutedsl_moe.py +++ b/test/registered/moe/test_cutedsl_moe.py @@ -899,14 +899,13 @@ def test_v1_masked_kernel_bf16_input(self): masked_m.to(hidden_states.device), ) - a_global_scale = input_global_scale[:1] a_fp4, a_scale_interleaved = fp4_quantize( - hidden_states, a_global_scale + hidden_states, input_global_scale ) a_in_dtype = dequantize_nvfp4_to_dtype( a_fp4, a_scale_interleaved, - a_global_scale, + input_global_scale, dtype=hidden_states.dtype, device=hidden_states.device, block_size=16, @@ -1078,12 +1077,11 @@ def test_v1_masked_kernel_rejects_v2_w13_layout(self): masked_m.to(device), ) - a_global_scale = input_global_scale[:1] - a_fp4, a_scale_interleaved = fp4_quantize(hidden_states, a_global_scale) + a_fp4, a_scale_interleaved = fp4_quantize(hidden_states, input_global_scale) a_in_dtype = dequantize_nvfp4_to_dtype( a_fp4, a_scale_interleaved, - a_global_scale, + input_global_scale, dtype=hidden_states.dtype, device=device, block_size=16, @@ -1253,12 +1251,11 @@ def test_v1_masked_kernel_fp4_input(self): ) # PyTorch reference (same as the bf16 input test) - a_gs = input_gs[:1] - a_fp4, a_scale = fp4_quantize(hidden_states, a_gs) + a_fp4, a_scale = fp4_quantize(hidden_states, input_gs) a_deq = dequantize_nvfp4_to_dtype( a_fp4, a_scale, - a_gs, + input_gs, dtype=torch.bfloat16, device=device, block_size=16,