docker/Dockerfile (2 changes: 1 addition & 1 deletion)

@@ -19,7 +19,7 @@ ARG PIP_DEFAULT_INDEX
 ARG UBUNTU_MIRROR
 ARG GITHUB_ARTIFACTORY=github.com
 ARG INSTALL_FLASHINFER_JIT_CACHE=0
-ARG FLASHINFER_VERSION=0.6.11.post1
+ARG FLASHINFER_VERSION=0.6.8.post1
 ARG MOONCAKE_VERSION=0.3.10.post2
 #if need other arg please add in MOONCAKE_COMPILE_ARG
 ARG MOONCAKE_COMPILE_ARG="-DUSE_HTTP=ON -DUSE_MNNVL=ON -DUSE_CUDA=ON -DWITH_EP=ON"
python/pyproject.toml (8 changes: 4 additions & 4 deletions)

@@ -27,8 +27,8 @@ dependencies = [
     "datasets",
     "einops",
     "fastapi",
-    "flashinfer_python==0.6.11.post1", # keep it aligned with jit-cache version in Dockerfile
-    "flashinfer_cubin==0.6.11.post1",
+    "flashinfer_python==0.6.8.post1", # keep it aligned with jit-cache version in Dockerfile
+    "flashinfer_cubin==0.6.8.post1",
     "gguf",
     "interegular",
     "llguidance>=0.7.11,<0.8.0",
@@ -37,7 +37,7 @@ dependencies = [
     "ninja",
     "easydict", # Required by remote model code (e.g. DeepSeek-OCR) loaded via trust_remote_code; validated by transformers 5.4+ check_imports
     "numpy",
-    "nvidia-cutlass-dsl==4.5.0",
+    "nvidia-cutlass-dsl==4.4.2",
     "nvidia-ml-py",
     "openai-harmony==0.0.4",
     "openai==2.6.1",
@@ -53,7 +53,7 @@ dependencies = [
     "pydantic",
     "python-multipart",
     "pyzmq>=25.1.2",
-    "quack-kernels>=0.4.1",
+    "quack-kernels>=0.3.0",
     "requests",
     "scipy",
     "sentencepiece",
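The inline comment above asks for the flashinfer_python pin to stay aligned with the jit-cache version baked into docker/Dockerfile. A minimal consistency check one could run in CI is sketched below; the helper name and the hard-coded expected version are illustrative assumptions, not part of this PR.

# Sketch: confirm the installed flashinfer wheels match the version pinned
# in the Dockerfile. The expected string mirrors FLASHINFER_VERSION above.
from importlib.metadata import PackageNotFoundError, version

EXPECTED_FLASHINFER = "0.6.8.post1"

def flashinfer_pins_aligned(expected: str = EXPECTED_FLASHINFER) -> bool:
    for dist_name in ("flashinfer-python", "flashinfer-cubin"):
        try:
            if version(dist_name) != expected:
                return False
        except PackageNotFoundError:
            return False
    return True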
python/sglang/srt/entrypoints/engine.py (2 changes: 1 addition & 1 deletion)

@@ -1201,7 +1201,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.6.11.post1",
+            "0.6.8.post1",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
python/sglang/srt/layers/flashinfer_comm_fusion.py (24 changes: 13 additions & 11 deletions)

@@ -383,11 +383,6 @@ def initialize(
         hidden_dim=hidden_dim,
         dtype=dtype,
         force_oneshot_support=bool(use_oneshot),
-        # Pin the symmetric-memory rendezvous to the actual
-        # subgroup. Without this, flashinfer >=0.6.10 falls back
-        # to WORLD and TP/EP/CP subgroup peers get addressed
-        # incorrectly (kernel hangs in cuda-graph warmup).
-        group=device_group,
     )
     if (
         _TorchDistBackend is not None
@@ -520,6 +515,8 @@ def ensure_workspace_initialized(
     if not is_flashinfer_available() or _flashinfer_comm is None:
         return False

+    tp_coordinator = get_tp_group()
+
     if use_attn_tp_group:
         world_size = get_attn_tensor_model_parallel_world_size()
         rank = get_attn_tensor_model_parallel_rank()
@@ -534,12 +531,17 @@
         rank = get_moe_tensor_parallel_rank()
         coordinator = get_moe_tp_group()

-    # Always pass the coordinator's groups: flashinfer >=0.6.10 reads the
-    # rendezvous group from `group=...` (falling back to WORLD when None),
-    # so leaving it None silently rendezvouses on WORLD and the kernel ends
-    # up addressing the wrong peers in TP/EP/CP subgroup setups.
-    device_group = coordinator.device_group
-    cpu_group = coordinator.cpu_group
+    # When the sub-group IS the full TP group, pass None so the workspace
+    # uses the default process group directly (no TorchDistBackend needed).
+    # For true sub-groups, use NCCL device_group for GPU/device mapping and
+    # GLOO cpu_group for metadata broadcasts (avoids NCCL collectives that
+    # interfere with CUDA graph capture).
+    if coordinator.device_group is tp_coordinator.device_group:
+        device_group = None
+        cpu_group = None
+    else:
+        device_group = coordinator.device_group
+        cpu_group = coordinator.cpu_group

     if world_size <= 1:
         return False
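The replacement comment explains the rule: hand the workspace an explicit group only when the coordinator is a genuine sub-group, and let it fall back to the default process group otherwise. The sketch below illustrates why the group argument matters for collectives in general; it is a standalone torch.distributed example with an assumed 2x2 rank split, not sglang's actual wiring.

# Sketch: a collective with group=None spans WORLD, while passing a sub-group
# handle restricts it to that sub-group's peers. Run with
# torchrun --nproc-per-node=4.
import torch
import torch.distributed as dist

def main() -> None:
    dist.init_process_group(backend="nccl")
    rank = dist.get_rank()
    torch.cuda.set_device(rank)

    # new_group() is itself collective: every rank must create both groups,
    # then pick the one it belongs to.
    groups = [dist.new_group(ranks=[0, 1]), dist.new_group(ranks=[2, 3])]
    sub = groups[rank // 2]

    x = torch.full((1,), float(rank), device="cuda")
    dist.all_reduce(x, group=sub)   # sums only within the 2-rank sub-group
    y = torch.full((1,), float(rank), device="cuda")
    dist.all_reduce(y)              # group=None rendezvouses on WORLD (all 4 ranks)

    print(f"rank {rank}: sub-group sum {x.item()}, world sum {y.item()}")
    dist.destroy_process_group()

if __name__ == "__main__":
    main()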
python/sglang/srt/layers/quantization/fp4_utils.py (14 changes: 7 additions & 7 deletions)

@@ -34,13 +34,13 @@ def _flashinfer_fp4_quantize_impl(
     enable_pdl: Optional[bool] = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     return _flashinfer_fp4_quantize(
-        input=input,
-        global_scale=global_scale,
-        sf_vec_size=sf_vec_size,
-        sf_use_ue8m0=sf_use_ue8m0,
-        is_sf_swizzled_layout=is_sf_swizzled_layout,
-        is_sf_8x4_layout=is_sf_8x4_layout,
-        enable_pdl=enable_pdl,
+        input,
+        global_scale,
+        sf_vec_size,
+        sf_use_ue8m0,
+        is_sf_swizzled_layout,
+        is_sf_8x4_layout,
+        enable_pdl,
         backend=_flashinfer_fp4_quantize_backend,
     )
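Dropping the keyword names keeps this thin wrapper working across flashinfer releases whose leading parameters may be spelled differently, since positional order tends to be more stable than keyword names. One defensive variant of the same idea is sketched below; quantize_fn is a hypothetical stand-in, not the real flashinfer entry point.

# Sketch: prefer the keyword spelling, fall back to positional order when the
# installed release rejects the keyword names. Purely illustrative.
from typing import Any, Callable

def quantize_compat(quantize_fn: Callable[..., Any], x: Any, global_scale: Any, **kwargs: Any) -> Any:
    try:
        return quantize_fn(input=x, global_scale=global_scale, **kwargs)
    except TypeError:
        # Parameter names changed between releases; call positionally instead.
        return quantize_fn(x, global_scale, **kwargs)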
python/sglang/srt/utils/common.py (2 changes: 1 addition & 1 deletion)

@@ -1105,7 +1105,7 @@ def check_pkg_version_at_least(pkg: str, min_version: str) -> bool:
     Args:
         pkg: Package name (distribution name, e.g., "flashinfer-python")
-        min_version: Minimum version required (e.g., "0.6.11.post1")
+        min_version: Minimum version required (e.g., "0.6.8.post1")

     Returns:
         True if package is installed and version >= min_version, False otherwise
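Given the contract in the docstring above (installed version at least min_version, False when the package is missing), a minimal sketch of such a check built on importlib.metadata and packaging follows; it is not the actual sglang implementation.

# Sketch: a >=-version check that treats a missing distribution as False.
# packaging.version handles post-releases such as "0.6.8.post1" correctly.
from importlib.metadata import PackageNotFoundError, version
from packaging.version import Version

def check_pkg_version_at_least(pkg: str, min_version: str) -> bool:
    try:
        return Version(version(pkg)) >= Version(min_version)
    except PackageNotFoundError:
        return False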
test/registered/moe/test_cutedsl_moe.py (15 changes: 6 additions & 9 deletions)

@@ -899,14 +899,13 @@ def test_v1_masked_kernel_bf16_input(self):
             masked_m.to(hidden_states.device),
         )

-        a_global_scale = input_global_scale[:1]
         a_fp4, a_scale_interleaved = fp4_quantize(
-            hidden_states, a_global_scale
+            hidden_states, input_global_scale
         )
         a_in_dtype = dequantize_nvfp4_to_dtype(
             a_fp4,
             a_scale_interleaved,
-            a_global_scale,
+            input_global_scale,
             dtype=hidden_states.dtype,
             device=hidden_states.device,
             block_size=16,
@@ -1078,12 +1077,11 @@ def test_v1_masked_kernel_rejects_v2_w13_layout(self):
             masked_m.to(device),
         )

-        a_global_scale = input_global_scale[:1]
-        a_fp4, a_scale_interleaved = fp4_quantize(hidden_states, a_global_scale)
+        a_fp4, a_scale_interleaved = fp4_quantize(hidden_states, input_global_scale)
         a_in_dtype = dequantize_nvfp4_to_dtype(
             a_fp4,
             a_scale_interleaved,
-            a_global_scale,
+            input_global_scale,
             dtype=hidden_states.dtype,
             device=device,
             block_size=16,
@@ -1253,12 +1251,11 @@ def test_v1_masked_kernel_fp4_input(self):
         )

         # PyTorch reference (same as the bf16 input test)
-        a_gs = input_gs[:1]
-        a_fp4, a_scale = fp4_quantize(hidden_states, a_gs)
+        a_fp4, a_scale = fp4_quantize(hidden_states, input_gs)
         a_deq = dequantize_nvfp4_to_dtype(
             a_fp4,
             a_scale,
-            a_gs,
+            input_gs,
             dtype=torch.bfloat16,
             device=device,
             block_size=16,
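All three hunks drop the [:1] slice on the global scale before calling fp4_quantize, since the global scale is presumably already a one-element FP32 tensor and the slice was redundant. For orientation, a rough sketch of how such a per-tensor NVFP4 global scale is commonly derived follows; the 448 * 6 constant (FP8 E4M3 max times FP4 E2M1 max) is a common convention and an assumption here, not taken from this PR.

# Sketch: per-tensor global scale for NVFP4 quantization.
import torch

def nvfp4_global_scale(x: torch.Tensor) -> torch.Tensor:
    # 448.0 = max representable FP8 E4M3 block scale, 6.0 = max FP4 E2M1 value.
    amax = x.abs().amax().to(torch.float32)
    return ((448.0 * 6.0) / amax.clamp(min=1e-12)).reshape(1)  # one-element tensor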