Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ ARG PIP_DEFAULT_INDEX
ARG UBUNTU_MIRROR
ARG GITHUB_ARTIFACTORY=github.com
ARG INSTALL_FLASHINFER_JIT_CACHE=0
ARG FLASHINFER_VERSION=0.5.3
ARG FLASHINFER_VERSION=0.6.1

ENV DEBIAN_FRONTEND=noninteractive \
CUDA_HOME=/usr/local/cuda \
Expand Down Expand Up @@ -304,7 +304,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \

# Patching packages for CUDA 12/13 compatibility
# TODO: Remove when torch version covers these packages
# TODO: Move cutlass-dsl to pyproject.toml after drivers on CI runners are updated
RUN --mount=type=cache,target=/root/.cache/pip if [ "${CUDA_VERSION%%.*}" = "12" ]; then \
python3 -m pip install nvidia-nccl-cu12==2.28.3 --force-reinstall --no-deps ; \
python3 -m pip install nvidia-cudnn-cu12==9.16.0.29 --force-reinstall --no-deps ; \
Expand Down
4 changes: 2 additions & 2 deletions python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ dependencies = [
"datasets",
"einops",
"fastapi",
"flashinfer_python==0.5.3", # keep it aligned with jit-cache version in Dockerfile
"flashinfer_cubin==0.5.3",
"flashinfer_python==0.6.1", # keep it aligned with jit-cache version in Dockerfile
"flashinfer_cubin==0.6.1",
"gguf",
"hf_transfer",
"huggingface_hub",
Expand Down
2 changes: 1 addition & 1 deletion python/sglang/srt/entrypoints/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -800,7 +800,7 @@ def _set_envs_and_config(server_args: ServerArgs):
if server_args.attention_backend == "flashinfer":
assert_pkg_version(
"flashinfer_python",
"0.5.3",
"0.6.1",
"Please uninstall the old version and "
"reinstall the latest version by following the instructions "
"at https://docs.flashinfer.ai/installation.html.",
Expand Down
1 change: 0 additions & 1 deletion python/sglang/srt/layers/moe/fused_moe_triton/layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1299,7 +1299,6 @@ def forward_impl(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
local_expert_offset=self.moe_ep_rank * self.num_local_experts,
local_num_experts=self.num_local_experts,
routed_scaling_factor=self.moe_runner_config.routed_scaling_factor,
tile_tokens_dim=None,
# Respect the routing method configured for this layer (e.g., Renormalize for Qwen3),
# instead of always assuming DeepSeekV3.
routing_method_type=(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,6 @@ def fused_experts_none_to_flashinfer_trtllm_fp8(
if runner_config.routed_scaling_factor is not None
else 1.0
),
tile_tokens_dim=None,
routing_method_type=routing_method_type,
use_shuffled_weight=False,
tune_max_num_tokens=next_power_of_2(a_q.shape[0]),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -537,7 +537,6 @@ def apply_with_router_logits(
local_expert_offset=layer.moe_ep_rank * layer.num_local_experts,
local_num_experts=layer.num_local_experts,
routed_scaling_factor=routed_scaling_factor,
tile_tokens_dim=None,
routing_method_type=layer.routing_method_type,
do_finalize=True,
tune_max_num_tokens=next_power_of_2(hs_fp4.shape[0]),
Expand Down
1 change: 0 additions & 1 deletion python/sglang/srt/layers/quantization/modelopt_quant.py
Original file line number Diff line number Diff line change
Expand Up @@ -783,7 +783,6 @@ def apply(
else 1.0
),
use_routing_scales_on_input=use_routing_scales_on_input,
tile_tokens_dim=None,
routing_method_type=routing_method_type,
tune_max_num_tokens=next_power_of_2(x.shape[0]),
)
Expand Down
1 change: 0 additions & 1 deletion python/sglang/srt/layers/quantization/mxfp4.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,7 +674,6 @@ def apply(
layer.moe_ep_rank * layer.num_local_experts, # local_expert_offset
layer.num_local_experts, # local num experts
None,
None, # tile_tokens_dim
1, # routing_method_type, renormalize
True, # do finalize
tune_max_num_tokens=next_power_of_2(x_quant.shape[0]),
Expand Down
1 change: 1 addition & 0 deletions python/sglang/srt/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2743,6 +2743,7 @@ def is_fa3_default_architecture(hf_config):
"Olmo2ForCausalLM",
"Gemma2ForCausalLM",
"Gemma3ForConditionalGeneration",
"MixtralForCausalLM",
"Qwen2ForCausalLM",
"Qwen3ForCausalLM",
"Qwen3MoeForCausalLM",
Expand Down
2 changes: 1 addition & 1 deletion scripts/ci/ci_install_dependency.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ set -euxo pipefail
# Set up environment variables
IS_BLACKWELL=${IS_BLACKWELL:-0}
CU_VERSION="cu129"
FLASHINFER_VERSION=0.5.3
FLASHINFER_VERSION=0.6.1
OPTIONAL_DEPS="${1:-}"

# Detect system architecture
Expand Down
Loading