diff --git a/docker/Dockerfile b/docker/Dockerfile index 2e57ed442e20..ae251746bb9d 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -20,7 +20,7 @@ ARG PIP_DEFAULT_INDEX ARG UBUNTU_MIRROR ARG GITHUB_ARTIFACTORY=github.com ARG INSTALL_FLASHINFER_JIT_CACHE=0 -ARG FLASHINFER_VERSION=0.6.8.post1 +ARG FLASHINFER_VERSION=0.6.11 ARG MOONCAKE_VERSION=0.3.10.post2 #if need other arg please add in MOONCAKE_COMPILE_ARG ARG MOONCAKE_COMPILE_ARG="-DUSE_HTTP=ON -DUSE_MNNVL=ON -DUSE_CUDA=ON -DWITH_EP=ON" diff --git a/python/pyproject.toml b/python/pyproject.toml index 477b2fe8e1f3..59cbd8e45aa8 100755 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -27,8 +27,8 @@ dependencies = [ "datasets", "einops", "fastapi", - "flashinfer_python==0.6.8.post1", # keep it aligned with jit-cache version in Dockerfile - "flashinfer_cubin==0.6.8.post1", + "flashinfer_python==0.6.11", # keep it aligned with jit-cache version in Dockerfile + "flashinfer_cubin==0.6.11", "gguf", "interegular", "llguidance>=0.7.11,<0.8.0", @@ -37,7 +37,7 @@ dependencies = [ "ninja", "easydict", # Required by remote model code (e.g. DeepSeek-OCR) loaded via trust_remote_code; validated by transformers 5.4+ check_imports "numpy", - "nvidia-cutlass-dsl==4.4.2", + "nvidia-cutlass-dsl==4.5.0", "nvidia-ml-py", "openai-harmony==0.0.4", "openai==2.6.1", @@ -53,7 +53,7 @@ dependencies = [ "pydantic", "python-multipart", "pyzmq>=25.1.2", - "quack-kernels>=0.3.0", + "quack-kernels>=0.4.1", "requests", "scipy", "sentencepiece", diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py index a94f9d7ed26d..6ce2867acd57 100644 --- a/python/sglang/srt/entrypoints/engine.py +++ b/python/sglang/srt/entrypoints/engine.py @@ -1200,7 +1200,7 @@ def _set_envs_and_config(server_args: ServerArgs): if server_args.attention_backend == "flashinfer": assert_pkg_version( "flashinfer_python", - "0.6.8.post1", + "0.6.11", "Please uninstall the old version and " "reinstall the latest version by following the instructions " "at https://docs.flashinfer.ai/installation.html.", diff --git a/python/sglang/srt/layers/flashinfer_comm_fusion.py b/python/sglang/srt/layers/flashinfer_comm_fusion.py index bca28f3e211c..c66c2cda4d8a 100644 --- a/python/sglang/srt/layers/flashinfer_comm_fusion.py +++ b/python/sglang/srt/layers/flashinfer_comm_fusion.py @@ -383,6 +383,11 @@ def initialize( hidden_dim=hidden_dim, dtype=dtype, force_oneshot_support=bool(use_oneshot), + # Pin the symmetric-memory rendezvous to the actual + # subgroup. Without this, flashinfer >=0.6.10 falls back + # to WORLD and TP/EP/CP subgroup peers get addressed + # incorrectly (kernel hangs in cuda-graph warmup). + group=device_group, ) if ( _TorchDistBackend is not None @@ -515,8 +520,6 @@ def ensure_workspace_initialized( if not is_flashinfer_available() or _flashinfer_comm is None: return False - tp_coordinator = get_tp_group() - if use_attn_tp_group: world_size = get_attn_tensor_model_parallel_world_size() rank = get_attn_tensor_model_parallel_rank() @@ -531,17 +534,12 @@ def ensure_workspace_initialized( rank = get_moe_tensor_parallel_rank() coordinator = get_moe_tp_group() - # When the sub-group IS the full TP group, pass None so the workspace - # uses the default process group directly (no TorchDistBackend needed). - # For true sub-groups, use NCCL device_group for GPU/device mapping and - # GLOO cpu_group for metadata broadcasts (avoids NCCL collectives that - # interfere with CUDA graph capture). - if coordinator.device_group is tp_coordinator.device_group: - device_group = None - cpu_group = None - else: - device_group = coordinator.device_group - cpu_group = coordinator.cpu_group + # Always pass the coordinator's groups: flashinfer >=0.6.10 reads the + # rendezvous group from `group=...` (falling back to WORLD when None), + # so leaving it None silently rendezvouses on WORLD and the kernel ends + # up addressing the wrong peers in TP/EP/CP subgroup setups. + device_group = coordinator.device_group + cpu_group = coordinator.cpu_group if world_size <= 1: return False diff --git a/python/sglang/srt/layers/quantization/fp4_utils.py b/python/sglang/srt/layers/quantization/fp4_utils.py index 96409750cbf7..a7a64f25e99e 100644 --- a/python/sglang/srt/layers/quantization/fp4_utils.py +++ b/python/sglang/srt/layers/quantization/fp4_utils.py @@ -34,13 +34,13 @@ def _flashinfer_fp4_quantize_impl( enable_pdl: Optional[bool] = None, ) -> tuple[torch.Tensor, torch.Tensor]: return _flashinfer_fp4_quantize( - input, - global_scale, - sf_vec_size, - sf_use_ue8m0, - is_sf_swizzled_layout, - is_sf_8x4_layout, - enable_pdl, + input=input, + global_scale=global_scale, + sf_vec_size=sf_vec_size, + sf_use_ue8m0=sf_use_ue8m0, + is_sf_swizzled_layout=is_sf_swizzled_layout, + is_sf_8x4_layout=is_sf_8x4_layout, + enable_pdl=enable_pdl, backend=_flashinfer_fp4_quantize_backend, ) diff --git a/python/sglang/srt/utils/common.py b/python/sglang/srt/utils/common.py index c91fcdcfbebf..09d86aae35d7 100644 --- a/python/sglang/srt/utils/common.py +++ b/python/sglang/srt/utils/common.py @@ -1077,7 +1077,7 @@ def check_pkg_version_at_least(pkg: str, min_version: str) -> bool: Args: pkg: Package name (distribution name, e.g., "flashinfer-python") - min_version: Minimum version required (e.g., "0.6.8.post1") + min_version: Minimum version required (e.g., "0.6.10.post1") Returns: True if package is installed and version >= min_version, False otherwise diff --git a/test/registered/moe/test_cutedsl_moe.py b/test/registered/moe/test_cutedsl_moe.py index fcc0ff0e2911..b29a48e3d9e8 100644 --- a/test/registered/moe/test_cutedsl_moe.py +++ b/test/registered/moe/test_cutedsl_moe.py @@ -899,13 +899,14 @@ def test_v1_masked_kernel_bf16_input(self): masked_m.to(hidden_states.device), ) + a_global_scale = input_global_scale[:1] a_fp4, a_scale_interleaved = fp4_quantize( - hidden_states, input_global_scale + hidden_states, a_global_scale ) a_in_dtype = dequantize_nvfp4_to_dtype( a_fp4, a_scale_interleaved, - input_global_scale, + a_global_scale, dtype=hidden_states.dtype, device=hidden_states.device, block_size=16, @@ -1077,11 +1078,12 @@ def test_v1_masked_kernel_rejects_v2_w13_layout(self): masked_m.to(device), ) - a_fp4, a_scale_interleaved = fp4_quantize(hidden_states, input_global_scale) + a_global_scale = input_global_scale[:1] + a_fp4, a_scale_interleaved = fp4_quantize(hidden_states, a_global_scale) a_in_dtype = dequantize_nvfp4_to_dtype( a_fp4, a_scale_interleaved, - input_global_scale, + a_global_scale, dtype=hidden_states.dtype, device=device, block_size=16, @@ -1251,11 +1253,12 @@ def test_v1_masked_kernel_fp4_input(self): ) # PyTorch reference (same as the bf16 input test) - a_fp4, a_scale = fp4_quantize(hidden_states, input_gs) + a_gs = input_gs[:1] + a_fp4, a_scale = fp4_quantize(hidden_states, a_gs) a_deq = dequantize_nvfp4_to_dtype( a_fp4, a_scale, - input_gs, + a_gs, dtype=torch.bfloat16, device=device, block_size=16,