diff --git a/docker/Dockerfile b/docker/Dockerfile index e9fc59619b2f..37c78736dd1e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -265,6 +265,12 @@ RUN --mount=type=cache,target=/root/.cache/pip \ fi && \ TORCH_CUDA_ARCH_LIST="${CHOSEN_TORCH_CUDA_ARCH_LIST}" MAX_JOBS=${BUILD_AND_DOWNLOAD_PARALLEL} pip install --no-build-isolation . +# Install flash_attn +# flash_attn is required by transformers 5.x; install it only if not already present +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -c "import flash_attn" 2>/dev/null \ + || python3 -m pip install flash_attn --no-build-isolation + # Install essential Python packages RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install \ diff --git a/python/pyproject.toml b/python/pyproject.toml index 85428cba02d8..55d11350adc2 100755 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -65,11 +65,11 @@ dependencies = [ "torch_memory_saver==0.0.9", "torch==2.9.1", "torchaudio==2.9.1", - "torchcodec==0.7.0 ; sys_platform != 'linux' or (sys_platform == 'linux' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')", # torchcodec does not exist in those systems. If not provided, transformer will use torchvision instead by default. + "torchcodec==0.8.0 ; sys_platform != 'linux' or (sys_platform == 'linux' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')", # torchcodec does not exist in those systems. If not provided, transformer will use torchvision instead by default. 
"torchvision", "torchao==0.9.0", "tqdm", - "transformers==4.57.1", + "transformers==5.0.0rc0", "uvicorn", "uvloop", "xgrammar==0.1.27", diff --git a/python/pyproject_cpu.toml b/python/pyproject_cpu.toml index a537bc3607ef..5421f2b7cfbf 100644 --- a/python/pyproject_cpu.toml +++ b/python/pyproject_cpu.toml @@ -59,7 +59,7 @@ dependencies = [ "timm==1.0.16", "torchao==0.9.0", "tqdm", - "transformers==4.57.1", + "transformers==5.0.0rc0", "uvicorn", "uvloop", "xgrammar==0.1.27", diff --git a/python/pyproject_other.toml b/python/pyproject_other.toml index 3e854caa726a..ae1622363c41 100755 --- a/python/pyproject_other.toml +++ b/python/pyproject_other.toml @@ -59,7 +59,7 @@ runtime_common = [ "timm==1.0.16", "torchao==0.9.0", "tqdm", - "transformers==4.57.1", + "transformers==5.0.0rc0", "uvicorn", "uvloop", "xgrammar==0.1.27", diff --git a/python/pyproject_xpu.toml b/python/pyproject_xpu.toml index a058e6f48a82..3d7dc969a4b5 100644 --- a/python/pyproject_xpu.toml +++ b/python/pyproject_xpu.toml @@ -63,7 +63,7 @@ dependencies = [ "timm==1.0.16", "torchao==0.9.0", "tqdm", - "transformers==4.57.1", + "transformers==5.0.0rc0", "uvicorn", "uvloop", # "xgrammar==0.1.24", , xgrammar depends on CUDA PyTorch and Triton only diff --git a/python/sglang/srt/configs/qwen3_omni.py b/python/sglang/srt/configs/qwen3_omni.py index d42e98a9a07b..8baea892335d 100644 --- a/python/sglang/srt/configs/qwen3_omni.py +++ b/python/sglang/srt/configs/qwen3_omni.py @@ -1,6 +1,5 @@ from transformers import PretrainedConfig from transformers.configuration_utils import layer_type_validation -from transformers.modeling_rope_utils import rope_config_validation from sglang.utils import logger @@ -168,7 +167,6 @@ def __init__( # BC: if there is a 'type' field, move it to 'rope_type'. 
if self.rope_scaling is not None and "type" in self.rope_scaling: self.rope_scaling["rope_type"] = self.rope_scaling["type"] - rope_config_validation(self) # MoE arguments self.decoder_sparse_step = decoder_sparse_step @@ -311,7 +309,6 @@ def __init__( # BC: if there is a 'type' field, move it to 'rope_type'. if self.rope_scaling is not None and "type" in self.rope_scaling: self.rope_scaling["rope_type"] = self.rope_scaling["type"] - rope_config_validation(self) self.layer_types = layer_types if self.layer_types is None: @@ -405,7 +402,6 @@ def __init__( # BC: if there is a 'type' field, move it to 'rope_type'. if self.rope_scaling is not None and "type" in self.rope_scaling: self.rope_scaling["rope_type"] = self.rope_scaling["type"] - rope_config_validation(self) # MoE arguments self.decoder_sparse_step = decoder_sparse_step diff --git a/python/sglang/srt/configs/qwen3_vl.py b/python/sglang/srt/configs/qwen3_vl.py index a758d1f4e45e..5a0eaf75ff36 100644 --- a/python/sglang/srt/configs/qwen3_vl.py +++ b/python/sglang/srt/configs/qwen3_vl.py @@ -1,5 +1,6 @@ from transformers import PretrainedConfig -from transformers.modeling_rope_utils import rope_config_validation + +# from transformers.modeling_rope_utils import rope_config_validation class Qwen3VLVisionConfig(PretrainedConfig): @@ -187,7 +188,7 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout - rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"}) + # rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"}) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) @@ -450,7 +451,7 @@ def __init__( self.rope_scaling = rope_scaling self.head_dim = head_dim or hidden_size // num_attention_heads - rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"}) + # rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"}) # MoE arguments self.decoder_sparse_step = 
decoder_sparse_step diff --git a/scripts/ci/ci_install_dependency.sh b/scripts/ci/ci_install_dependency.sh index 7d0735602452..b97adca46e2f 100755 --- a/scripts/ci/ci_install_dependency.sh +++ b/scripts/ci/ci_install_dependency.sh @@ -79,6 +79,7 @@ if [ "$IS_BLACKWELL" = "1" ]; then # Clean up existing installations $PIP_CMD uninstall -y sgl-kernel sglang $PIP_INSTALL_SUFFIX || true + $PIP_CMD uninstall -y flash_attn || true $PIP_CMD uninstall -y flashinfer-python flashinfer-cubin flashinfer-jit-cache $PIP_INSTALL_SUFFIX || true else # In normal cases, we use uv, which is much faster than pip. @@ -91,9 +92,16 @@ else # Clean up existing installations $PIP_CMD uninstall sgl-kernel sglang || true + $PIP_CMD uninstall flash_attn || true $PIP_CMD uninstall flashinfer-python flashinfer-cubin flashinfer-jit-cache || true fi +if [ "$IS_BLACKWELL" != "1" ]; then + # For lmms_evals evaluating MMMU + git clone --branch v0.5 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git + $PIP_CMD install -e lmms-eval/ $PIP_INSTALL_SUFFIX +fi + EXTRAS="dev" if [ -n "$OPTIONAL_DEPS" ]; then EXTRAS="dev,${OPTIONAL_DEPS}" @@ -129,17 +137,12 @@ $PIP_CMD list $PIP_CMD install mooncake-transfer-engine==0.3.7.post2 "${NVRTC_SPEC}" py-spy scipy huggingface_hub[hf_xet] pytest $PIP_INSTALL_SUFFIX -if [ "$IS_BLACKWELL" != "1" ]; then - # For lmms_evals evaluating MMMU - git clone --branch v0.5 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git - $PIP_CMD install -e lmms-eval/ $PIP_INSTALL_SUFFIX -fi - # DeepEP depends on nvshmem 3.4.5 $PIP_CMD install nvidia-nvshmem-cu12==3.4.5 --force-reinstall $PIP_INSTALL_SUFFIX # Cudnn with version less than 9.16.0.29 will cause performance regression on Conv3D kernel $PIP_CMD install nvidia-cudnn-cu12==9.16.0.29 --force-reinstall $PIP_INSTALL_SUFFIX + $PIP_CMD uninstall xformers || true # Show current packages $PIP_CMD list