Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,12 @@ RUN --mount=type=cache,target=/root/.cache/pip \
fi && \
TORCH_CUDA_ARCH_LIST="${CHOSEN_TORCH_CUDA_ARCH_LIST}" MAX_JOBS=${BUILD_AND_DOWNLOAD_PARALLEL} pip install --no-build-isolation .

# Install flash_attn if it is not already present
# flash_attn is required by transformers 5.x
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -c "import flash_attn" 2>/dev/null \
|| python3 -m pip install flash_attn --no-build-isolation

# Install essential Python packages
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install \
Expand Down
4 changes: 2 additions & 2 deletions python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,11 @@ dependencies = [
"torch_memory_saver==0.0.9",
"torch==2.9.1",
"torchaudio==2.9.1",
"torchcodec==0.7.0 ; sys_platform != 'linux' or (sys_platform == 'linux' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')", # torchcodec does not exist in those systems. If not provided, transformer will use torchvision instead by default.
"torchcodec==0.8.0 ; sys_platform != 'linux' or (sys_platform == 'linux' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')", # torchcodec is not available on those platforms. If it is not installed, transformers falls back to torchvision by default.
"torchvision",
"torchao==0.9.0",
"tqdm",
"transformers==4.57.1",
"transformers==5.0.0rc0",
"uvicorn",
"uvloop",
"xgrammar==0.1.27",
Expand Down
2 changes: 1 addition & 1 deletion python/pyproject_cpu.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ dependencies = [
"timm==1.0.16",
"torchao==0.9.0",
"tqdm",
"transformers==4.57.1",
"transformers==5.0.0rc0",
"uvicorn",
"uvloop",
"xgrammar==0.1.27",
Expand Down
2 changes: 1 addition & 1 deletion python/pyproject_other.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ runtime_common = [
"timm==1.0.16",
"torchao==0.9.0",
"tqdm",
"transformers==4.57.1",
"transformers==5.0.0rc0",
"uvicorn",
"uvloop",
"xgrammar==0.1.27",
Expand Down
2 changes: 1 addition & 1 deletion python/pyproject_xpu.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ dependencies = [
"timm==1.0.16",
"torchao==0.9.0",
"tqdm",
"transformers==4.57.1",
"transformers==5.0.0rc0",
"uvicorn",
"uvloop",
# "xgrammar==0.1.24",  # excluded: xgrammar depends on CUDA PyTorch and Triton only
Expand Down
4 changes: 0 additions & 4 deletions python/sglang/srt/configs/qwen3_omni.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from transformers import PretrainedConfig
from transformers.configuration_utils import layer_type_validation
from transformers.modeling_rope_utils import rope_config_validation

from sglang.utils import logger

Expand Down Expand Up @@ -168,7 +167,6 @@ def __init__(
# BC: if there is a 'type' field, move it to 'rope_type'.
if self.rope_scaling is not None and "type" in self.rope_scaling:
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
rope_config_validation(self)

# MoE arguments
self.decoder_sparse_step = decoder_sparse_step
Expand Down Expand Up @@ -311,7 +309,6 @@ def __init__(
# BC: if there is a 'type' field, move it to 'rope_type'.
if self.rope_scaling is not None and "type" in self.rope_scaling:
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
rope_config_validation(self)

self.layer_types = layer_types
if self.layer_types is None:
Expand Down Expand Up @@ -405,7 +402,6 @@ def __init__(
# BC: if there is a 'type' field, move it to 'rope_type'.
if self.rope_scaling is not None and "type" in self.rope_scaling:
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
rope_config_validation(self)

# MoE arguments
self.decoder_sparse_step = decoder_sparse_step
Expand Down
7 changes: 4 additions & 3 deletions python/sglang/srt/configs/qwen3_vl.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from transformers import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation

# from transformers.modeling_rope_utils import rope_config_validation


class Qwen3VLVisionConfig(PretrainedConfig):
Expand Down Expand Up @@ -187,7 +188,7 @@ def __init__(
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout

rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"})
# rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"})

super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)

Expand Down Expand Up @@ -450,7 +451,7 @@ def __init__(
self.rope_scaling = rope_scaling
self.head_dim = head_dim or hidden_size // num_attention_heads

rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"})
# rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"})

# MoE arguments
self.decoder_sparse_step = decoder_sparse_step
Expand Down
15 changes: 9 additions & 6 deletions scripts/ci/ci_install_dependency.sh
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ if [ "$IS_BLACKWELL" = "1" ]; then

# Clean up existing installations
$PIP_CMD uninstall -y sgl-kernel sglang $PIP_INSTALL_SUFFIX || true
$PIP_CMD uninstall -y flash_attn || true
$PIP_CMD uninstall -y flashinfer-python flashinfer-cubin flashinfer-jit-cache $PIP_INSTALL_SUFFIX || true
else
# In normal cases, we use uv, which is much faster than pip.
Expand All @@ -91,9 +92,16 @@ else

# Clean up existing installations
$PIP_CMD uninstall sgl-kernel sglang || true
$PIP_CMD uninstall flash_attn || true
$PIP_CMD uninstall flashinfer-python flashinfer-cubin flashinfer-jit-cache || true
fi

if [ "$IS_BLACKWELL" != "1" ]; then
# For lmms_evals evaluating MMMU
git clone --branch v0.5 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
$PIP_CMD install -e lmms-eval/ $PIP_INSTALL_SUFFIX
fi

EXTRAS="dev"
if [ -n "$OPTIONAL_DEPS" ]; then
EXTRAS="dev,${OPTIONAL_DEPS}"
Expand Down Expand Up @@ -129,17 +137,12 @@ $PIP_CMD list

$PIP_CMD install mooncake-transfer-engine==0.3.7.post2 "${NVRTC_SPEC}" py-spy scipy huggingface_hub[hf_xet] pytest $PIP_INSTALL_SUFFIX

if [ "$IS_BLACKWELL" != "1" ]; then
# For lmms_evals evaluating MMMU
git clone --branch v0.5 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
$PIP_CMD install -e lmms-eval/ $PIP_INSTALL_SUFFIX
fi

# DeepEP depends on nvshmem 3.4.5
$PIP_CMD install nvidia-nvshmem-cu12==3.4.5 --force-reinstall $PIP_INSTALL_SUFFIX

# Cudnn with version less than 9.16.0.29 will cause performance regression on Conv3D kernel
$PIP_CMD install nvidia-cudnn-cu12==9.16.0.29 --force-reinstall $PIP_INSTALL_SUFFIX

$PIP_CMD uninstall xformers || true
# Show current packages
$PIP_CMD list
Expand Down
Loading