diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index db75ad3083b2..27ec0068668f 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -16,5 +16,5 @@ echo "--- :docker: Building Docker image"
 docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
 
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \
+docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 -e VLLM_CPU_ATTN_SPLIT_KV=0 --shm-size=4g "$IMAGE_NAME" \
         timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml
index 10b038d8b8a8..ed782c061fa3 100644
--- a/.buildkite/test_areas/models_basic.yaml
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -4,7 +4,6 @@ depends_on:
 steps:
 - label: Basic Models Tests (Initialization)
   timeout_in_minutes: 45
-  device: h200_18gb
   torch_nightly: true
   source_file_dependencies:
   - vllm/
@@ -73,3 +72,18 @@ steps:
     - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
     # Whisper needs spawn method to avoid deadlock
     - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
+
+- label: Transformers Backward Compatibility Models Test
+  working_dir: "/vllm-workspace/"
+  optional: true
+  soft_fail: true
+  commands:
+    - pip install transformers==4.57.5
+    - pytest -v -s tests/models/test_initialization.py
+    - pytest -v -s tests/models/test_transformers.py
+    - pytest -v -s tests/models/multimodal/processing/
+    - pytest -v -s tests/models/multimodal/test_mapping.py
+    - python3 examples/offline_inference/basic/chat.py
+    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+    # Whisper needs spawn method to avoid deadlock
+    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 12942b5c807b..3081d7ef1388 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -642,7 +642,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     else \
         BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_X86}"; \
     fi; \
-    uv pip install --system accelerate hf_transfer modelscope \
+    uv pip install --system accelerate modelscope \
         "bitsandbytes>=${BITSANDBYTES_VERSION}" "timm${TIMM_VERSION}" "runai-model-streamer[s3,gcs,azure]${RUNAI_MODEL_STREAMER_VERSION}"
 
 # ============================================================
@@ -756,9 +756,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system -e tests/vllm_test_utils
 
 # enable fast downloads from hf (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system hf_transfer
-ENV HF_HUB_ENABLE_HF_TRANSFER 1
+ENV HF_XET_HIGH_PERFORMANCE 1
+
+# increase timeout for hf downloads (for testing)
+ENV HF_HUB_DOWNLOAD_TIMEOUT 60
 
 # Copy in the v1 package for testing (it isn't distributed yet)
 COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu
index 0600f7da82f9..77b449625dd9 100644
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -197,6 +197,12 @@ ADD ./.buildkite/ ./.buildkite/
 RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install -e tests/vllm_test_utils
 
+# enable fast downloads from hf (for testing)
+ENV HF_XET_HIGH_PERFORMANCE 1
+
+# increase timeout for hf downloads (for testing)
+ENV HF_HUB_DOWNLOAD_TIMEOUT 60
+
 ######################### RELEASE IMAGE #########################
 FROM base AS vllm-openai
 
diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch
index 39e1cc187592..0733509a0eb9 100644
--- a/docker/Dockerfile.nightly_torch
+++ b/docker/Dockerfile.nightly_torch
@@ -272,9 +272,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system -e tests/vllm_test_utils
 
 # enable fast downloads from hf (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system hf_transfer
-ENV HF_HUB_ENABLE_HF_TRANSFER 1
+ENV HF_XET_HIGH_PERFORMANCE 1
+
+# increase timeout for hf downloads (for testing)
+ENV HF_HUB_DOWNLOAD_TIMEOUT 60
 
 RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system -r requirements/test/nightly-torch.txt
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 801847d4999d..fa7a5846edcb 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -365,9 +365,10 @@ RUN cd /vllm-workspace \
     && python3 -m pip install pytest-shard
 
 # enable fast downloads from hf (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system hf_transfer
-ENV HF_HUB_ENABLE_HF_TRANSFER=1
+ENV HF_XET_HIGH_PERFORMANCE=1
+
+# increase timeout for hf downloads (for testing)
+ENV HF_HUB_DOWNLOAD_TIMEOUT 60
 
 # install audio decode package `torchcodec` from source (required due to 
 # ROCm and torch version mismatch) for tests with datasets package
diff --git a/docs/getting_started/installation/gpu.rocm.inc.md b/docs/getting_started/installation/gpu.rocm.inc.md
index 4ab01ee8c687..f8385997eea3 100644
--- a/docs/getting_started/installation/gpu.rocm.inc.md
+++ b/docs/getting_started/installation/gpu.rocm.inc.md
@@ -240,7 +240,7 @@ uv pip install vllm==${VLLM_VERSION} \
         # Install dependencies
         pip install --upgrade numba \
             scipy \
-            huggingface-hub[cli,hf_transfer] \
+            huggingface-hub[cli] \
             setuptools_scm
         pip install -r requirements/rocm.txt
 
diff --git a/requirements/common.txt b/requirements/common.txt
index b610fd678687..299ec734ff34 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -7,7 +7,7 @@ requests >= 2.26.0
 tqdm
 blake3
 py-cpuinfo
-transformers >= 4.56.0, < 5
+transformers >= 4.56.0, != 5.0.*, != 5.1.*, != 5.2.*, != 5.3.*, != 5.4.*, != 5.5.0
 tokenizers >= 0.21.1  # Required for fast incremental detokenization.
 protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* # Required by LlamaTokenizer, gRPC. CVE-2026-0994
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
@@ -37,7 +37,7 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.14.0.1 # required for compressed-tensors
+compressed-tensors == 0.15.0.1 # required for compressed-tensors
 depyf==0.20.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
diff --git a/requirements/test/cuda.in b/requirements/test/cuda.in
index 378ecf94222e..5cf3a69e1fbf 100644
--- a/requirements/test/cuda.in
+++ b/requirements/test/cuda.in
@@ -18,7 +18,7 @@ httpx
 librosa # required for audio tests
 vector_quantize_pytorch # required for minicpmo_26 test
 vocos # required for minicpmo_26 test
-peft>=0.15.0 # required for phi-4-mm test
+peft>=0.18.1 # required for phi-4-mm test
 pqdm
 ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
 resampy # required for audio tests
@@ -39,8 +39,8 @@ opencv-python-headless >= 4.13.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]>=0.4.11 # required for model evaluation test
 mteb[bm25s]>=2, <3 # required for mteb test
-transformers==4.57.5
-tokenizers==0.22.0
+transformers==5.5.3
+tokenizers==0.22.2
 schemathesis>=3.39.15 # Required for openai schema test.
 # quantization
 bitsandbytes==0.49.2
diff --git a/requirements/test/cuda.txt b/requirements/test/cuda.txt
index 548ca9310ff8..ed67685e6ebd 100644
--- a/requirements/test/cuda.txt
+++ b/requirements/test/cuda.txt
@@ -4,7 +4,7 @@ absl-py==2.1.0
     # via
     #   rouge-score
     #   tensorboard
-accelerate==1.0.1
+accelerate==1.13.0
     # via peft
 aenum==3.1.16
     # via lightly
@@ -248,7 +248,6 @@ filelock==3.16.1
     #   huggingface-hub
     #   ray
     #   torch
-    #   transformers
     #   virtualenv
 fiona==1.10.1
     # via torchgeo
@@ -331,7 +330,7 @@ h5py==3.13.0
     # via terratorch
 harfile==0.3.0
     # via schemathesis
-hf-xet==1.1.7
+hf-xet==1.4.3
     # via huggingface-hub
 hiredis==3.0.0
     # via tensorizer
@@ -345,9 +344,10 @@ httpx==0.27.2
     # via
     #   -r requirements/test/cuda.in
     #   diffusers
+    #   huggingface-hub
     #   perceptron
     #   schemathesis
-huggingface-hub==0.36.2
+huggingface-hub==1.10.2
     # via
     #   accelerate
     #   datasets
@@ -756,7 +756,7 @@ pathvalidate==3.2.1
     # via pytablewriter
 patsy==1.0.1
     # via statsmodels
-peft==0.16.0
+peft==0.18.1
     # via -r requirements/test/cuda.in
 perceptron==0.1.4
     # via -r requirements/test/cuda.in
@@ -982,7 +982,7 @@ referencing==0.35.1
     # via
     #   jsonschema
     #   jsonschema-specifications
-regex==2024.9.11
+regex==2026.2.28
     # via
     #   diffusers
     #   nltk
@@ -1002,7 +1002,6 @@ requests==2.32.3
     #   google-api-core
     #   google-cloud-storage
     #   gpt-oss
-    #   huggingface-hub
     #   lightly
     #   lm-eval
     #   mistral-common
@@ -1015,7 +1014,6 @@ requests==2.32.3
     #   starlette-testclient
     #   tacoreader
     #   tiktoken
-    #   transformers
     #   wandb
 resampy==0.4.3
     # via -r requirements/test/cuda.in
@@ -1216,7 +1214,7 @@ timm==1.0.17
     #   segmentation-models-pytorch
     #   terratorch
     #   torchgeo
-tokenizers==0.22.0
+tokenizers==0.22.2
     # via
     #   -c requirements/common.txt
     #   -r requirements/test/cuda.in
@@ -1295,7 +1293,7 @@ tqdm==4.67.3
     #   tacoreader
     #   terratorch
     #   transformers
-transformers==4.57.5
+transformers==5.5.3
     # via
     #   -c requirements/common.txt
     #   -r requirements/test/cuda.in
@@ -1317,7 +1315,9 @@ typepy==1.3.2
 typer==0.15.2
     # via
     #   fastsafetensors
+    #   huggingface-hub
     #   perceptron
+    #   transformers
 types-python-dateutil==2.9.0.20241206
     # via arrow
 typeshed-client==2.8.2
diff --git a/requirements/test/nightly-torch.txt b/requirements/test/nightly-torch.txt
index e0eb7e114116..420fb496a718 100644
--- a/requirements/test/nightly-torch.txt
+++ b/requirements/test/nightly-torch.txt
@@ -29,8 +29,8 @@ opencv-python-headless >= 4.13.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]>=0.4.11 # required for model evaluation test
 mteb[bm25s]>=2, <3 # required for mteb test
-transformers==4.57.5
-tokenizers==0.22.0
+transformers==5.5.3
+tokenizers==0.22.2
 schemathesis>=3.39.15 # Required for openai schema test.
 # quantization
 bitsandbytes>=0.49.2
diff --git a/requirements/test/rocm.in b/requirements/test/rocm.in
index b5a9451b36f7..dbb1500edcf7 100644
--- a/requirements/test/rocm.in
+++ b/requirements/test/rocm.in
@@ -38,8 +38,8 @@ opencv-python-headless>=4.13.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]>=0.4.11 # required for model evaluation test
 mteb[bm25s]>=2, <3 # required for mteb test
-transformers==4.57.5
-tokenizers==0.22.0
+transformers==5.5.3
+tokenizers==0.22.2
 schemathesis>=3.39.15 # Required for openai schema test
 # quantization
 bitsandbytes==0.49.2
@@ -82,4 +82,3 @@ plotly # required for perf comparison html report
 rapidfuzz
 torchgeo==0.7.0
 multiprocess==0.70.16
-huggingface-hub==0.36.2
diff --git a/requirements/test/rocm.txt b/requirements/test/rocm.txt
index e1efae912ee4..ba9cd3dfdcf3 100644
--- a/requirements/test/rocm.txt
+++ b/requirements/test/rocm.txt
@@ -39,7 +39,7 @@ annotated-doc==0.0.4
     #   typer
 annotated-types==0.7.0
     # via pydantic
-anthropic==0.89.0
+anthropic==0.93.0
     # via
     #   -c requirements/common.txt
     #   -r requirements/test/../common.txt
@@ -172,7 +172,7 @@ colorful==0.5.8
     # via ray
 colorlog==6.10.1
     # via optuna
-compressed-tensors==0.14.0.1
+compressed-tensors==0.15.0.1
     # via
     #   -c requirements/common.txt
     #   -r requirements/test/../common.txt
@@ -269,9 +269,9 @@ fastapi==0.135.2
     #   model-hosting-container-standards
 fastapi-cli==0.0.24
     # via fastapi
-fastapi-cloud-cli==0.15.1
+fastapi-cloud-cli==0.16.1
     # via fastapi-cli
-fastar==0.9.0
+fastar==0.10.0
     # via fastapi-cloud-cli
 fastparquet==2026.3.0
     # via genai-perf
@@ -290,7 +290,6 @@ filelock==3.25.2
     #   python-discovery
     #   ray
     #   torch
-    #   transformers
     #   virtualenv
 fiona==1.10.1
     # via torchgeo
@@ -384,7 +383,7 @@ h5py==3.16.0
     # via terratorch
 harfile==0.4.0
     # via schemathesis
-hf-xet==1.4.2
+hf-xet==1.4.3
     # via huggingface-hub
 hiredis==3.3.1
     # via tensorizer
@@ -403,6 +402,7 @@ httpx==0.27.2
     #   diffusers
     #   fastapi
     #   fastapi-cloud-cli
+    #   huggingface-hub
     #   mcp
     #   model-hosting-container-standards
     #   openai
@@ -410,9 +410,8 @@ httpx==0.27.2
     #   schemathesis
 httpx-sse==0.4.3
     # via mcp
-huggingface-hub==0.36.2
+huggingface-hub==1.10.2
     # via
-    #   -r requirements/test/rocm.in
     #   accelerate
     #   datasets
     #   diffusers
@@ -484,7 +483,7 @@ jinja2==3.1.6
     #   genai-perf
     #   lm-eval
     #   torch
-jiter==0.13.0
+jiter==0.14.0
     # via
     #   anthropic
     #   openai
@@ -631,7 +630,7 @@ msgpack==1.1.2
     # via
     #   librosa
     #   ray
-msgspec==0.20.0
+msgspec==0.21.0
     # via -r requirements/test/../common.txt
 mteb==2.11.5
     # via -r requirements/test/rocm.in
@@ -742,7 +741,7 @@ omegaconf==2.3.0
     #   lightning
 open-clip-torch==2.32.0
     # via -r requirements/test/rocm.in
-openai==2.30.0
+openai==2.31.0
     # via
     #   -c requirements/common.txt
     #   -r requirements/test/../common.txt
@@ -1093,7 +1092,7 @@ python-dotenv==1.2.2
     #   uvicorn
 python-json-logger==4.1.0
     # via -r requirements/test/../common.txt
-python-multipart==0.0.22
+python-multipart==0.0.26
     # via
     #   fastapi
     #   mcp
@@ -1180,7 +1179,6 @@ requests==2.32.5
     #   google-api-core
     #   google-cloud-storage
     #   gpt-oss
-    #   huggingface-hub
     #   lightly
     #   lm-eval
     #   mistral-common
@@ -1194,7 +1192,6 @@ requests==2.32.5
     #   starlette-testclient
     #   tacoreader
     #   tiktoken
-    #   transformers
     #   wandb
 resampy==0.4.3
     # via -r requirements/test/rocm.in
@@ -1428,7 +1425,7 @@ timm==1.0.17
     #   segmentation-models-pytorch
     #   terratorch
     #   torchgeo
-tokenizers==0.22.0
+tokenizers==0.22.2
     # via
     #   -c requirements/common.txt
     #   -r requirements/test/../common.txt
@@ -1471,7 +1468,7 @@ tqdm==4.67.3
     #   tacoreader
     #   terratorch
     #   transformers
-transformers==4.57.5
+transformers==5.5.3
     # via
     #   -c requirements/common.txt
     #   -r requirements/test/../common.txt
@@ -1498,7 +1495,9 @@ typer==0.24.1
     #   fastapi-cli
     #   fastapi-cloud-cli
     #   fastsafetensors
+    #   huggingface-hub
     #   perceptron
+    #   transformers
 typeshed-client==2.9.0
     # via jsonargparse
 typing-extensions==4.15.0
diff --git a/requirements/test/xpu.in b/requirements/test/xpu.in
index 0e4ca1d99dca..94ffc249395a 100644
--- a/requirements/test/xpu.in
+++ b/requirements/test/xpu.in
@@ -13,7 +13,6 @@ pytest-shard
 absl-py
 accelerate
 arctic-inference
-hf_transfer
 lm_eval[api]
 modelscope
 
diff --git a/requirements/test/xpu.txt b/requirements/test/xpu.txt
index 51810592c46f..4ddc0aa1c922 100644
--- a/requirements/test/xpu.txt
+++ b/requirements/test/xpu.txt
@@ -19,7 +19,9 @@ aiosignal==1.4.0
 albumentations==1.4.6
     # via -r requirements/test/xpu.in
 annotated-doc==0.0.4
-    # via fastapi
+    # via
+    #   fastapi
+    #   typer
 annotated-types==0.7.0
     # via pydantic
 anyio==4.13.0
@@ -64,6 +66,7 @@ click==8.3.1
     #   jiwer
     #   nltk
     #   schemathesis
+    #   typer
     #   uvicorn
 colorama==0.4.6
     # via sacrebleu
@@ -112,7 +115,6 @@ filelock==3.25.2
     #   huggingface-hub
     #   modelscope
     #   torch
-    #   transformers
 frozenlist==1.8.0
     # via
     #   aiohttp
@@ -133,9 +135,7 @@ h11==0.16.0
     #   uvicorn
 harfile==0.4.0
     # via schemathesis
-hf-transfer==0.1.9
-    # via -r requirements/test/xpu.in
-hf-xet==1.4.2
+hf-xet==1.4.3
     # via huggingface-hub
 html2text==2025.4.15
     # via gpt-oss
@@ -144,8 +144,9 @@ httpcore==1.0.9
 httpx==0.28.1
     # via
     #   datasets
+    #   huggingface-hub
     #   schemathesis
-huggingface-hub==0.36.2
+huggingface-hub==1.10.2
     # via
     #   accelerate
     #   datasets
@@ -515,7 +516,6 @@ requests==2.33.1
     #   docker
     #   evaluate
     #   gpt-oss
-    #   huggingface-hub
     #   lm-eval
     #   mistral-common
     #   modelscope
@@ -524,11 +524,11 @@ requests==2.33.1
     #   schemathesis
     #   starlette-testclient
     #   tiktoken
-    #   transformers
 rich==14.3.3
     # via
     #   mteb
     #   schemathesis
+    #   typer
 rouge-score==0.1.2
     # via lm-eval
 rpds-py==0.30.0
@@ -572,6 +572,8 @@ setuptools==80.10.2
     #   modelscope
     #   pytablewriter
     #   torch
+shellingham==1.5.4
+    # via typer
 six==1.17.0
     # via
     #   -c requirements/common.txt
@@ -665,7 +667,7 @@ tqdm==4.67.3
     #   pqdm
     #   sentence-transformers
     #   transformers
-transformers==4.57.6
+transformers==5.5.3
     # via
     #   -c requirements/common.txt
     #   sentence-transformers
@@ -676,6 +678,10 @@ typepy==1.3.4
     #   dataproperty
     #   pytablewriter
     #   tabledata
+typer==0.24.1
+    # via
+    #   huggingface-hub
+    #   transformers
 typing-extensions==4.15.0
     # via
     #   -c requirements/common.txt
diff --git a/tests/conftest.py b/tests/conftest.py
index a666c5a86637..bc657ff1ca79 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -410,6 +410,15 @@ def _init(
             model_name,
             trust_remote_code=trust_remote_code,
         )
+        # HF runner should use the HF config so that it's consistent with the HF model
+        if self.config.__module__.startswith("vllm.transformers_utils.configs"):
+            from transformers.models.auto.configuration_auto import CONFIG_MAPPING
+
+            del CONFIG_MAPPING._extra_content[self.config.model_type]
+            self.config = AutoConfig.from_pretrained(
+                model_name,
+                trust_remote_code=trust_remote_code,
+            )
         self.device = self.get_default_device()
         self.dtype = dtype = _get_and_verify_dtype(
             self.model_name,
diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 20944a9111e0..169ddbf7ce5c 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -3,6 +3,7 @@
 
 import tempfile
 from collections import OrderedDict
+from importlib import reload
 from unittest.mock import MagicMock
 
 import pytest
@@ -47,6 +48,11 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
 def maybe_enable_lora_dual_stream(monkeypatch: pytest.MonkeyPatch):
     if current_platform.is_cuda():
         monkeypatch.setenv("VLLM_LORA_ENABLE_DUAL_STREAM", "1")
+        import vllm.lora.layers.base_linear
+
+        if not hasattr(vllm.lora.layers.base_linear, "lora_linear_async"):
+            # Reload the module to ensure the environment variable takes effect.
+            reload(vllm.lora.layers.base_linear)
     yield
 
 
diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py
index e430826461a1..3d6484a710a6 100644
--- a/tests/lora/test_minicpmv_tp.py
+++ b/tests/lora/test_minicpmv_tp.py
@@ -1,7 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from importlib.metadata import version
+
 import pytest
+from packaging.version import Version
 
 import vllm
 from vllm.assets.image import ImageAsset
@@ -10,6 +13,14 @@
 
 from ..utils import multi_gpu_test
 
+pytestmark = pytest.mark.skipif(
+    Version("5.0") <= Version(version("transformers")),
+    reason=(
+        "MiniCPMV custom processor uses tokenizer.im_start_id which is not "
+        "available on TokenizersBackend in transformers v5.0+"
+    ),
+)
+
 MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
 
 PROMPT_TEMPLATE = (
diff --git a/tests/model_executor/test_weight_utils.py b/tests/model_executor/test_weight_utils.py
index 93535ae0aacd..260ebdcefb3b 100644
--- a/tests/model_executor/test_weight_utils.py
+++ b/tests/model_executor/test_weight_utils.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import os
 import tempfile
 
 import huggingface_hub.constants
@@ -10,26 +9,10 @@
 
 from vllm.model_executor.model_loader.weight_utils import (
     download_weights_from_hf,
-    enable_hf_transfer,
     maybe_remap_kv_scale_name,
 )
 
 
-def test_hf_transfer_auto_activation():
-    if "HF_HUB_ENABLE_HF_TRANSFER" in os.environ:
-        # in case it is already set, we can't test the auto activation
-        pytest.skip("HF_HUB_ENABLE_HF_TRANSFER is set, can't test auto activation")
-    enable_hf_transfer()
-    try:
-        # enable hf hub transfer if available
-        import hf_transfer  # type: ignore # noqa
-
-        HF_TRANSFER_ACTIVE = True
-    except ImportError:
-        HF_TRANSFER_ACTIVE = False
-    assert huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER == HF_TRANSFER_ACTIVE
-
-
 def test_download_weights_from_hf():
     with tempfile.TemporaryDirectory() as tmpdir:
         # assert LocalEntryNotFoundError error is thrown
@@ -178,5 +161,4 @@ def test_missing_target_returns_none(self):
 
 
 if __name__ == "__main__":
-    test_hf_transfer_auto_activation()
     test_download_weights_from_hf()
diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py
index c524480839bc..b276f37a2a33 100644
--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -143,6 +143,11 @@ def test_models(
         # in parts of the operators
         pytest.skip(f"Skipping '{model}' model test with AITER kernel.")
 
+    if current_platform.is_cpu() and model == "TitanML/tiny-mixtral":
+        # This untrained model is sensitive to the rounding error
+        # Fuse ops to reduce bfloat16 rounding
+        monkeypatch.setenv("VLLM_CPU_CI_ENV", "0")
+
     with hf_runner(model) as hf_model:
         hf_outputs = hf_model.generate_greedy_logprobs_limit(
             example_prompts, max_tokens, num_logprobs
diff --git a/tests/models/language/pooling_mteb_test/test_baai.py b/tests/models/language/pooling_mteb_test/test_baai.py
index 1199393d4b74..ec11960fda07 100644
--- a/tests/models/language/pooling_mteb_test/test_baai.py
+++ b/tests/models/language/pooling_mteb_test/test_baai.py
@@ -69,7 +69,10 @@
         attn_type="decoder",
         is_prefix_caching_supported=True,
         is_chunked_prefill_supported=True,
-        enable_test=True,
+        # Skip: model's custom tokenizer on HF hub is incompatible with
+        # transformers v5 (sets attrs before super().__init__, triggering
+        # AttributeError on 'verbose' in __getattr__).
+        enable_test=False,
     ),
 ]
 
diff --git a/tests/models/language/pooling_mteb_test/test_gte.py b/tests/models/language/pooling_mteb_test/test_gte.py
index 0c35d66c3667..0a54262e124f 100644
--- a/tests/models/language/pooling_mteb_test/test_gte.py
+++ b/tests/models/language/pooling_mteb_test/test_gte.py
@@ -72,7 +72,8 @@
         attn_type="encoder_only",
         is_prefix_caching_supported=False,
         is_chunked_prefill_supported=False,
-        enable_test=True,
+        # Skip: numerical regression with transformers v5.
+        enable_test=False,
     ),
     ########## ModernBertModel
     EmbedModelInfo(
diff --git a/tests/models/language/pooling_mteb_test/test_jina.py b/tests/models/language/pooling_mteb_test/test_jina.py
index 627cc0431943..d75ec2a2acec 100644
--- a/tests/models/language/pooling_mteb_test/test_jina.py
+++ b/tests/models/language/pooling_mteb_test/test_jina.py
@@ -75,6 +75,10 @@ def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
     mteb_test_rerank_models(vllm_runner, model_info)
 
 
+@pytest.mark.skip(
+    reason="jinaai/jina-embeddings-v3 custom XLMRobertaLoRA model on HF hub "
+    "is incompatible with transformers v5 (missing all_tied_weights_keys)"
+)
 @pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("dimensions", [16, 32])
diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index bf5119cf44f4..1147ccef35b4 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -186,7 +186,14 @@
         max_num_seqs=2,
         auto_cls=AutoModel,
         hf_output_post_proc=model_utils.ultravox_trunc_hf_output,
-        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        marks=[
+            pytest.mark.core_model,
+            pytest.mark.cpu_model,
+            # TODO: Remove skip once model has been upstreamed to Transformers
+            pytest.mark.skip(
+                reason="Custom model code is not compatible with Transformers v5"
+            ),
+        ],
     ),
     #### Transformers fallback to test
     ## To reduce test burden, we only test batching arbitrary image size
@@ -397,14 +404,14 @@
     "gemma4": VLMTestInfo(
         models=["google/gemma-4-E2B-it"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<bos><|turn>user\n{img_prompt}<turn|>\n<|turn>model\n",  # noqa: E501
         single_image_prompts=IMAGE_ASSETS.prompts(
             {
-                "stop_sign": "What's the content in the center of the image?",
-                "cherry_blossom": "What is the season?",
+                "stop_sign": "<|image|>What's the content in the center of the image?",  # noqa: E501
+                "cherry_blossom": "<|image|>What is the season?",
             }
         ),
-        multi_image_prompt="Describe the two images in detail.",
+        multi_image_prompt="<|image|><|image|>Describe the two images in detail.",  # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
@@ -533,6 +540,12 @@
         max_model_len=4096,
         use_tokenizer_eos=True,
         patch_hf_runner=model_utils.internvl_patch_hf_runner,
+        # TODO: Remove skip once model has been upstreamed to Transformers
+        marks=[
+            pytest.mark.skip(
+                reason="Custom model code tries to access data from meta-tensor"
+            )
+        ],
     ),
     "intern_vl-video": VLMTestInfo(
         models=[
@@ -545,6 +558,12 @@
         use_tokenizer_eos=True,
         patch_hf_runner=model_utils.internvl_patch_hf_runner,
         num_logprobs=10 if current_platform.is_rocm() else 5,
+        # TODO: Remove skip once model has been upstreamed to Transformers
+        marks=[
+            pytest.mark.skip(
+                reason="Custom model code tries to access data from meta-tensor"
+            )
+        ],
     ),
     "intern_vl-hf": VLMTestInfo(
         models=["OpenGVLab/InternVL3-1B-hf"],
@@ -591,6 +610,8 @@
         hf_model_kwargs={"device_map": "auto"},
         patch_hf_runner=model_utils.isaac_patch_hf_runner,
         image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        # TODO: Remove skip once model has been upstreamed to Transformers
+        marks=[pytest.mark.skip(reason="Custom model imports deleted object")],  # noqa: E501
     ),
     "kimi_vl": VLMTestInfo(
         models=["moonshotai/Kimi-VL-A3B-Instruct"],
@@ -806,7 +827,12 @@
             pytest.mark.skipif(
                 Version(TRANSFORMERS_VERSION) == Version("4.57.3"),
                 reason="This model is broken in Transformers v4.57.3",
-            )
+            ),
+            pytest.mark.skipif(
+                Version(TRANSFORMERS_VERSION) >= Version("5.0.0"),
+                reason="Model's custom code uses ROPE_INIT_FUNCTIONS"
+                "['default'] which was removed in transformers v5",
+            ),
         ],
     ),
     "phi3v": VLMTestInfo(
@@ -960,6 +986,12 @@
             )
             for inp in custom_inputs.different_patch_input_cases_internvl()
         ],
+        # TODO: Remove skip once model has been upstreamed to Transformers
+        marks=[
+            pytest.mark.skip(
+                reason="Custom model code tries to access data from meta-tensor"
+            )
+        ],
     ),
     "llava_onevision-multiple-images": VLMTestInfo(
         models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
diff --git a/tests/models/multimodal/generation/test_nemotron_parse.py b/tests/models/multimodal/generation/test_nemotron_parse.py
index e224f31e6df9..8159cc9a8dae 100644
--- a/tests/models/multimodal/generation/test_nemotron_parse.py
+++ b/tests/models/multimodal/generation/test_nemotron_parse.py
@@ -103,6 +103,10 @@ def run_test(
         )
 
 
+@pytest.mark.skip(
+    reason="Model's custom MBart decoder has head count mismatch with "
+    "transformers v5's GQA-aware cross-attention (8 vs 16 heads)"
+)
 @pytest.mark.parametrize("model", ["nvidia/NVIDIA-Nemotron-Parse-v1.1"])
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("num_logprobs", [5])
diff --git a/tests/models/multimodal/generation/test_phi4siglip.py b/tests/models/multimodal/generation/test_phi4siglip.py
index e8f4ba829250..f80b16c341b6 100644
--- a/tests/models/multimodal/generation/test_phi4siglip.py
+++ b/tests/models/multimodal/generation/test_phi4siglip.py
@@ -2,9 +2,11 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
+from importlib.metadata import version
 
 import pytest
 import regex as re
+from packaging.version import Version
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from vllm.logprobs import SampleLogprobs
@@ -19,6 +21,15 @@
 from ....utils import multi_gpu_test
 from ...utils import check_logprobs_close
 
+pytestmark = pytest.mark.skipif(
+    Version("5.0") <= Version(version("transformers")),
+    reason=(
+        "vllm upgraded transformers above v5.4 where HF model custom code uses siglip2 "
+        "internals (filter_out_non_signature_kwargs) removed by "
+        "huggingface/transformers#43514"
+    ),
+)
+
 MODEL_ID = "microsoft/Phi-4-reasoning-vision-15B"
 
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
diff --git a/tests/models/multimodal/generation/test_voxtral.py b/tests/models/multimodal/generation/test_voxtral.py
index 590b549dcf59..82db1dc6812c 100644
--- a/tests/models/multimodal/generation/test_voxtral.py
+++ b/tests/models/multimodal/generation/test_voxtral.py
@@ -149,6 +149,10 @@ def _asset_to_openai_chunk(asset):
     )
 
 
+@pytest.mark.skip(
+    reason="VoxtralProcessor.apply_chat_template() in transformers v5 "
+    "doesn't resolve chat_template=None to the default template"
+)
 def test_hf_reference(hf_runner, vllm_runner, audio_assets: AudioTestAssets):
     """Compare vLLM Mistral-format output against HF Transformers reference.
 
diff --git a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py
index 3de4ca209a6f..ae95f39586c0 100644
--- a/tests/models/multimodal/generation/vlm_utils/core.py
+++ b/tests/models/multimodal/generation/vlm_utils/core.py
@@ -80,6 +80,11 @@ def run_test(
     if vllm_runner_kwargs:
         vllm_runner_kwargs_.update(vllm_runner_kwargs)
 
+    # Avoid passing limit_mm_per_prompt twice when vllm_runner_kwargs
+    # already contains it (e.g. gemma4 sets it via vllm_runner_kwargs).
+    if "limit_mm_per_prompt" in vllm_runner_kwargs_:
+        limit_mm_per_prompt = vllm_runner_kwargs_.pop("limit_mm_per_prompt")
+
     with vllm_runner(
         model,
         max_model_len=max_model_len,
diff --git a/tests/models/multimodal/pooling/test_colqwen3.py b/tests/models/multimodal/pooling/test_colqwen3.py
index 2faac7fbfb61..9eefedc153c2 100644
--- a/tests/models/multimodal/pooling/test_colqwen3.py
+++ b/tests/models/multimodal/pooling/test_colqwen3.py
@@ -22,6 +22,11 @@
 
 from ....conftest import VllmRunner
 
+pytestmark = pytest.mark.skip(
+    reason="ColQwen3 model's weight tying is incompatible with "
+    "transformers v5 (missing all_tied_weights_keys)"
+)
+
 MODELS = [
     "TomoroAI/tomoro-colqwen3-embed-4b",
     "OpenSearch-AI/Ops-Colqwen3-4B",
diff --git a/tests/models/multimodal/pooling/test_intern_vit.py b/tests/models/multimodal/pooling/test_intern_vit.py
index c3f7c81b78bd..d7b67b8bdb6a 100644
--- a/tests/models/multimodal/pooling/test_intern_vit.py
+++ b/tests/models/multimodal/pooling/test_intern_vit.py
@@ -12,6 +12,11 @@
 
 from ....conftest import ImageTestAssets
 
+pytestmark = pytest.mark.skip(
+    reason="InternVisionModel's custom code is incompatible with "
+    "transformers v5 (missing all_tied_weights_keys)"
+)
+
 # we use snapshot_download to prevent conflicts between
 # dynamic_module and trust_remote_code for hf_runner
 DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]
diff --git a/tests/models/multimodal/pooling/test_jinavl_reranker.py b/tests/models/multimodal/pooling/test_jinavl_reranker.py
index 035ca62058a8..18a02625ea44 100644
--- a/tests/models/multimodal/pooling/test_jinavl_reranker.py
+++ b/tests/models/multimodal/pooling/test_jinavl_reranker.py
@@ -15,6 +15,11 @@
 
 from ....conftest import HfRunner, VllmRunner
 
+pytestmark = pytest.mark.skip(
+    reason="jinaai/jina-reranker-m0 custom code is incompatible with "
+    "transformers v5 (missing all_tied_weights_keys)"
+)
+
 MODELS = ["jinaai/jina-reranker-m0"]
 
 MM_PROCESSOR_KWARGS = {
diff --git a/tests/models/multimodal/processing/test_musicflamingo.py b/tests/models/multimodal/processing/test_musicflamingo.py
index 625e1ad8d29b..ba14b7760299 100644
--- a/tests/models/multimodal/processing/test_musicflamingo.py
+++ b/tests/models/multimodal/processing/test_musicflamingo.py
@@ -17,11 +17,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from importlib.metadata import version
 from unittest.mock import MagicMock
 
 import numpy as np
 import pytest
 import torch
+from packaging.version import Version
 from transformers import PretrainedConfig
 
 from tests.models.registry import HF_EXAMPLE_MODELS
@@ -122,6 +124,11 @@ def test_musicflamingo_dummy_text_uses_plain_audio_tokens(mock_ctx):
     assert builder.get_dummy_text({"audio": 2}) == "<sound><sound>"
 
 
+@pytest.mark.skipif(
+    Version(version("transformers")) >= Version("5.5"),
+    reason="transformers v5.5 added native MusicFlamingoForConditionalGeneration "
+    "with a different get_audio_features signature (requires input_ids)",
+)
 def test_musicflamingo_audio_feature_pipeline_matches_hf_small_config():
     from transformers.models.musicflamingo import (
         modeling_musicflamingo as hf_musicflamingo_modeling,
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 92ebac018412..a93dc26307b0 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -335,7 +335,15 @@ def check_available_online(
         "internlm/internlm2-chat-7b", trust_remote_code=True
     ),
     "InternLM2VEForCausalLM": _HfExamplesInfo(
-        "OpenGVLab/Mono-InternVL-2B", trust_remote_code=True
+        "OpenGVLab/Mono-InternVL-2B",
+        trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": (
+                "Custom config cannot be loaded with Transformers "
+                "v5 because `vision_config` is not always set"
+            )
+        },
     ),
     "InternLM3ForCausalLM": _HfExamplesInfo(
         "internlm/internlm3-8b-instruct", trust_remote_code=True
@@ -475,6 +483,13 @@ def check_available_online(
     "Plamo2ForCausalLM": _HfExamplesInfo(
         "pfnet/plamo-2-1b",
         trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "hf": (
+                "Custom model code uses `_tied_weight_keys: list[str]` but "
+                "Transformers v5 now expects `_tied_weight_keys: dict[str, str]`"
+            )
+        },
     ),
     "Plamo3ForCausalLM": _HfExamplesInfo(
         "pfnet/plamo-3-nict-2b-base",
@@ -515,6 +530,13 @@ def check_available_online(
         trust_remote_code=True,
         max_model_len=4096,
         is_available_online=True,
+        max_transformers_version="5.3",
+        transformers_version_reason={
+            "vllm": (
+                "vllm upgraded transformers above v5.4 where "
+                "validate_rope() no longer accepts ignore_keys param"
+            )
+        },
     ),
     "SeedOssForCausalLM": _HfExamplesInfo(
         "ByteDance-Seed/Seed-OSS-36B-Instruct",
@@ -553,6 +575,11 @@ def check_available_online(
         "xverse/XVERSE-7B-Chat",
         tokenizer="meta-llama/Llama-2-7b",
         trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": "XVERSE tokenizer is incompatible with transformers v5 "
+            "(add_prefix_space / prepend_scheme mismatch).",
+        },
     ),
     "Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct"),
     "MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", trust_remote_code=True),
@@ -763,10 +790,18 @@ def check_available_online(
     # [Decoder-only]
     "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
     "AudioFlamingo3ForConditionalGeneration": _HfExamplesInfo(
-        "nvidia/audio-flamingo-3-hf", min_transformers_version="5.0.0"
+        "nvidia/audio-flamingo-3-hf",
+        min_transformers_version="5.3.0",
+        transformers_version_reason={
+            "vllm": "Needs https://github.com/huggingface/transformers/pull/43538"
+        },
     ),
     "MusicFlamingoForConditionalGeneration": _HfExamplesInfo(
-        "nvidia/music-flamingo-2601-hf", min_transformers_version="5.3.0"
+        "nvidia/music-flamingo-2601-hf",
+        min_transformers_version="5.3.0",
+        transformers_version_reason={
+            "vllm": "Needs https://github.com/huggingface/transformers/pull/43538"
+        },
     ),
     "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/aya-vision-8b"),
     "BagelForConditionalGeneration": _HfExamplesInfo("ByteDance-Seed/BAGEL-7B-MoT"),
@@ -821,12 +856,30 @@ def check_available_online(
     ),
     "FireRedASR2ForConditionalGeneration": _HfExamplesInfo(
         "allendou/FireRedASR2-LLM-vllm",
+        trust_remote_code=True,
+        max_transformers_version="5.1",
+        transformers_version_reason={
+            "vllm": "Incompatible with transformers v5.2+ "
+            "(dict object has no attribute '__name__').",
+        },
     ),
     "FireRedLIDForConditionalGeneration": _HfExamplesInfo(
         "PatchyTisa/FireRedLID-vllm",
+        trust_remote_code=True,
+        max_transformers_version="5.1",
+        transformers_version_reason={
+            "vllm": "Incompatible with transformers v5.2+ "
+            "(dict object has no attribute '__name__').",
+        },
     ),
     "FunASRForConditionalGeneration": _HfExamplesInfo(
         "allendou/Fun-ASR-Nano-2512-vllm",
+        trust_remote_code=True,
+        max_transformers_version="5.1",
+        transformers_version_reason={
+            "vllm": "Incompatible with transformers v5.2+ "
+            "(dict object has no attribute '__name__').",
+        },
     ),
     "FunAudioChatForConditionalGeneration": _HfExamplesInfo(
         "funaudiochat", is_available_online=False
@@ -868,6 +921,13 @@ def check_available_online(
     "HCXVisionForCausalLM": _HfExamplesInfo(
         "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
         trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": (
+                "Custom config cannot be loaded with Transformers "
+                "v5 because `text_config` is not always set"
+            )
+        },
     ),
     "HCXVisionV2ForCausalLM": _HfExamplesInfo(
         "naver-hyperclovax/HyperCLOVAX-SEED-Think-32B",
@@ -887,7 +947,12 @@ def check_available_online(
         extras={"0.2-2B-Preview": "PerceptronAI/Isaac-0.2-2B-Preview"},
     ),
     "InternS1ForConditionalGeneration": _HfExamplesInfo(
-        "internlm/Intern-S1", trust_remote_code=True
+        "internlm/Intern-S1",
+        trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": "Custom tokenizer code is not compatible with Transformers v5."
+        },
     ),
     "InternS1ProForConditionalGeneration": _HfExamplesInfo(
         "internlm/Intern-S1-Pro",
@@ -976,7 +1041,14 @@ def check_available_online(
     "MiDashengLMModel": _HfExamplesInfo(
         "mispeech/midashenglm-7b", trust_remote_code=True
     ),
-    "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6", trust_remote_code=True),
+    "MiniCPMO": _HfExamplesInfo(
+        "openbmb/MiniCPM-o-2_6",
+        trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "hf": "Custom processor code is not compatible with Transformers v5."
+        },
+    ),
     "MiniCPMV": _HfExamplesInfo(
         "openbmb/MiniCPM-Llama3-V-2_5",
         extras={
@@ -984,6 +1056,13 @@ def check_available_online(
             "4.0": "openbmb/MiniCPM-V-4",
             "4.5": "openbmb/MiniCPM-V-4_5",
         },
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": (
+                "MiniCPMVBatchFeature is incompatible with its base class in "
+                "Transformers v5. See https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5/discussions/78"
+            )
+        },
         trust_remote_code=True,
     ),
     "MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo(
@@ -1039,13 +1118,25 @@ def check_available_online(
         trust_remote_code=True,
     ),
     "OpenCUAForConditionalGeneration": _HfExamplesInfo(
-        "xlangai/OpenCUA-7B", trust_remote_code=True
+        "xlangai/OpenCUA-7B",
+        trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": "Tokenizer cannot be initialised in Transformers v5."
+        },
     ),
     "OpenPanguVLForConditionalGeneration": _HfExamplesInfo(
         "FreedomIntelligence/openPangu-VL-7B",
         trust_remote_code=True,
         max_model_len=4096,
         enforce_eager=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": (
+                "OpenPanguVLVideoProcessorInitKwargs does not specify total=False, "
+                "making all kwargs required. See https://huggingface.co/FreedomIntelligence/openPangu-VL-7B/discussions/2"
+            )
+        },
     ),
     "Ovis": _HfExamplesInfo(
         "AIDC-AI/Ovis2-1B",
@@ -1057,12 +1148,24 @@ def check_available_online(
             "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B",
         },
     ),
-    "Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", trust_remote_code=True),
+    "Ovis2_5": _HfExamplesInfo(
+        "AIDC-AI/Ovis2.5-2B",
+        trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": "Custom processor code is not compatible with Transformers v5."
+        },
+    ),
     "Ovis2_6ForCausalLM": _HfExamplesInfo(
         "AIDC-AI/Ovis2.6-2B", is_available_online=False, trust_remote_code=True
     ),
     "Ovis2_6_MoeForCausalLM": _HfExamplesInfo(
-        "AIDC-AI/Ovis2.6-30B-A3B", trust_remote_code=True
+        "AIDC-AI/Ovis2.6-30B-A3B",
+        trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": "Custom processor code is not compatible with Transformers v5."
+        },
     ),
     "PaddleOCRVLForConditionalGeneration": _HfExamplesInfo(
         "PaddlePaddle/PaddleOCR-VL",
@@ -1082,7 +1185,17 @@ def check_available_online(
         extras={"phi3.5": "microsoft/Phi-3.5-vision-instruct"},
     ),
     "Phi4ForCausalLMV": _HfExamplesInfo(
-        "microsoft/Phi-4-reasoning-vision-15B", trust_remote_code=True
+        "microsoft/Phi-4-reasoning-vision-15B",
+        trust_remote_code=True,
+        max_transformers_version="5.3",
+        transformers_version_reason={
+            "vllm": (
+                "vllm upgraded transformers above v5.4 where HF model "
+                "custom code uses siglip2 internals "
+                "(filter_out_non_signature_kwargs) removed "
+                "by huggingface/transformers#43514"
+            )
+        },
     ),
     "Phi4MMForCausalLM": _HfExamplesInfo(
         "microsoft/Phi-4-multimodal-instruct", trust_remote_code=True
@@ -1179,6 +1292,14 @@ def check_available_online(
             "architectures": ["Tarsier2ForConditionalGeneration"],
             "model_type": "tarsier2",
         },
+        max_transformers_version="5.3",
+        transformers_version_reason={
+            "vllm": (
+                "Qwen2VLConfig was split into Qwen2VLConfig + "
+                "Qwen2VLTextConfig in transformers v5, breaking "
+                "attribute access (num_attention_heads, hidden_size, etc.)"
+            )
+        },
     ),
     "VoxtralForConditionalGeneration": _HfExamplesInfo(
         "mistralai/Voxtral-Mini-3B-2507",
diff --git a/tests/models/utils.py b/tests/models/utils.py
index 3b94f34fab08..b93beee6aa3a 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -476,7 +476,16 @@ def dummy_hf_overrides(
     else:
         # Use minimal layers for testing
         num_layers = 1
-        num_hidden_layers = 3 if model_arch == "Gemma3nForConditionalGeneration" else 1
+        num_hidden_layers = (
+            3
+            if model_arch
+            in (
+                "Gemma3nForConditionalGeneration",
+                "Gemma4ForCausalLM",
+                "Gemma4ForConditionalGeneration",
+            )
+            else 1
+        )
 
     update_dict = {
         "num_layers": num_layers,
diff --git a/tests/reasoning/test_step3p5_reasoning_parser.py b/tests/reasoning/test_step3p5_reasoning_parser.py
index 2196d247cb45..8f62e7a2cb4d 100644
--- a/tests/reasoning/test_step3p5_reasoning_parser.py
+++ b/tests/reasoning/test_step3p5_reasoning_parser.py
@@ -2,10 +2,10 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
-from transformers import AutoTokenizer
 
 from tests.reasoning.utils import run_reasoning_extraction
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
+from vllm.tokenizers import get_tokenizer
 
 parser_name = "step3p5"
 start_token = "<think>"
@@ -16,7 +16,7 @@
 
 @pytest.fixture(scope="module")
 def step3p5_tokenizer():
-    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
+    return get_tokenizer(tokenizer_name=REASONING_MODEL_NAME)
 
 
 SIMPLE_REASONING = {
diff --git a/tests/v1/e2e/spec_decode/test_spec_decode.py b/tests/v1/e2e/spec_decode/test_spec_decode.py
index c11bdbc50f70..a8fed7665282 100644
--- a/tests/v1/e2e/spec_decode/test_spec_decode.py
+++ b/tests/v1/e2e/spec_decode/test_spec_decode.py
@@ -557,12 +557,16 @@ def test_eagle_correctness_light(
             "auto",
             0.8,
         ),
-        (
+        pytest.param(
             ("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1),
             False,
             False,
             "transformers",
             0.8,
+            # TODO(hmellor): figure out why memory usage is so high
+            marks=pytest.mark.skip(
+                reason="Feature is experimental and uses too much memory in CI",
+            ),
         ),
         pytest.param(
             (
diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py
index ce6a813b8da5..fc6f88b49ee1 100644
--- a/vllm/model_executor/model_loader/gguf_loader.py
+++ b/vllm/model_executor/model_loader/gguf_loader.py
@@ -265,12 +265,24 @@ def find_hf_name_in_tensor_map(hf_name: str) -> str | None:
                 GGUF tensor name with suffix (e.g., 'mm.soft_emb_norm.weight')
                 or None if no mapping found
             """
+            # In transformers v5, multimodal models (e.g. Gemma3) wrap
+            # all sub-models under an outer 'model.' attribute, producing
+            # state_dict keys like 'model.language_model.layers.0...' and
+            # 'model.vision_tower.vision_model...'.  Strip this outer
+            # prefix so the keys match what gguf-py expects.
+            if is_multimodal and hf_name.startswith("model."):
+                hf_name = hf_name[6:]  # Remove outer 'model.'
+
             # Strip 'language_model.' prefix for multimodal models - gguf-py
             # tensor mappings expect parameter names without this prefix.
             # Note: 'model.' prefix should be KEPT for text-only models as
             # gguf-py expects it.
             if hf_name.startswith("language_model."):
                 hf_name = hf_name[15:]  # Remove 'language_model.'
+                # Re-add 'model.' prefix because gguf-py text tensor maps
+                # expect 'model.layers...' format.
+                if is_multimodal:
+                    hf_name = "model." + hf_name
 
             # Parse parameter name and suffix
             if hf_name.endswith((".weight", ".bias")):
diff --git a/vllm/model_executor/models/gemma4_mm.py b/vllm/model_executor/models/gemma4_mm.py
index e22f23c5c8bc..73078e169887 100644
--- a/vllm/model_executor/models/gemma4_mm.py
+++ b/vllm/model_executor/models/gemma4_mm.py
@@ -125,8 +125,12 @@ class Gemma4AudioInputs(TensorSchema):
     """
 
     type: Literal["audio"] = "audio"
-    input_features_padded: Annotated[torch.Tensor, TensorShape("bn", "s", "f")]
-    input_features_mask: Annotated[torch.Tensor, TensorShape("bn", "s")]
+    input_features_padded: Annotated[
+        torch.Tensor, TensorShape("bn", "s", "f", dynamic_dims={"s"})
+    ]
+    input_features_mask: Annotated[
+        torch.Tensor, TensorShape("bn", "s", dynamic_dims={"s"})
+    ]
 
 
 Gemma4ImageInputs = Gemma4ImagePixelInputs
@@ -505,6 +509,8 @@ def _call_hf_processor(
             video_timestamps_per_video: list[list[float]] = []
             video_frame_counts: list[int] = []
 
+            video_replacements: list[str] = []
+
             for item in videos:
                 video_array, metadata = item
 
@@ -557,10 +563,7 @@ def _call_hf_processor(
                 video_timestamps_per_video.append(timestamps)
                 video_frame_counts.append(len(frames))
 
-                # Build expanded replacement text and replace the
-                # <|video|> placeholder in the prompt.
-                # Use split(token, 1) to avoid collision — the
-                # replacement text itself contains <|video|> tokens.
+                # Build expanded replacement text for this video.
                 ts_strs = [f"{int(s // 60):02d}:{int(s % 60):02d}" for s in timestamps]
                 replacement = " ".join(
                     f"{t} {processor.boi_token}"
@@ -568,9 +571,23 @@ def _call_hf_processor(
                     f"{processor.eoi_token}"
                     for t, n in zip(ts_strs, num_soft_per_frame)
                 )
-                parts = prompt.split(processor.video_token, 1)
-                if len(parts) == 2:
-                    prompt = parts[0] + replacement + parts[1]
+                video_replacements.append(replacement)
+
+            # Replace all <|video|> placeholders at once. We split on
+            # video_token to get N+1 parts, then interleave with the
+            # N replacement strings. This avoids the iterative
+            # split-replace bug where replacement text (which itself
+            # contains <|video|> tokens) collides with later splits.
+            vt = processor.video_token
+            parts = prompt.split(vt, len(video_replacements))
+
+            # NOTE: len(parts) <= len(video_replacements) + 1
+            parts_with_repl: list[str] = []
+            for part, repl in zip(parts, video_replacements):
+                parts_with_repl.extend([part, repl])
+            parts_with_repl.extend(parts[len(video_replacements) :])
+
+            prompt = "".join(parts_with_repl)
 
             video_outputs = {
                 "pixel_values_videos": torch.cat(all_video_pixel_values, dim=0),
@@ -633,19 +650,23 @@ def _call_hf_processor(
             )
 
         if "input_features" in processed_outputs:
-            # Keep padded features for batched audio tower execution.
-            processed_outputs["input_features_padded"] = processed_outputs[
-                "input_features"
-            ]
-            # Unpad per-item so each item's cache entry is self-contained.
+            # Unpad per-item so each item's cache entry is
+            # self-contained. The batched() field config in
+            # _get_mm_fields_config will re-pad all fields to the
+            # batch's max length at batch time, ensuring consistent
+            # padding regardless of cache history.
+            masks = processed_outputs["input_features_mask"]
             unpadded_features = [
                 f[mask]
                 for f, mask in zip(
                     processed_outputs["input_features"],
-                    processed_outputs["input_features_mask"],
+                    masks,
                 )
             ]
+            unpadded_masks = [mask[mask] for mask in masks]
             processed_outputs["input_features"] = unpadded_features
+            processed_outputs["input_features_padded"] = unpadded_features
+            processed_outputs["input_features_mask"] = unpadded_masks
 
         # Merge video outputs into the final result
         combined_outputs = dict(processed_outputs, **video_outputs)
diff --git a/vllm/model_executor/models/musicflamingo.py b/vllm/model_executor/models/musicflamingo.py
index f4e3bbe379a3..497b2e63a7e9 100644
--- a/vllm/model_executor/models/musicflamingo.py
+++ b/vllm/model_executor/models/musicflamingo.py
@@ -32,9 +32,9 @@
 
 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
+from vllm.inputs import MultiModalDataDict
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
-    MultiModalDataDict,
     MultiModalFieldConfig,
     MultiModalKwargsItems,
 )
diff --git a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py
index b7967985f222..a3e4b844b805 100644
--- a/vllm/model_executor/models/transformers/base.py
+++ b/vllm/model_executor/models/transformers/base.py
@@ -275,6 +275,11 @@ def _decorate_cls_for_torch_compile(
         )
         class SupportTorchCompileWrapper(cls): ...
 
+        # Preserve __module__ so transformers v5's source-file checks
+        # (e.g. _can_set_experts_implementation) read the original
+        # model's module instead of this file.
+        SupportTorchCompileWrapper.__module__ = cls.__module__
+
         # Patch the class in its module
         module = sys.modules[cls.__module__]
         setattr(module, cls.__name__, SupportTorchCompileWrapper)
diff --git a/vllm/tokenizers/registry.py b/vllm/tokenizers/registry.py
index 7d48e3c6ff91..8f16e6d28f43 100644
--- a/vllm/tokenizers/registry.py
+++ b/vllm/tokenizers/registry.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import contextlib
 from dataclasses import dataclass, field
 from functools import lru_cache
 from pathlib import Path
@@ -10,6 +11,7 @@
 
 import vllm.envs as envs
 from vllm.logger import init_logger
+from vllm.transformers_utils.config import get_config
 from vllm.transformers_utils.gguf_utils import (
     check_gguf_file,
     get_gguf_file_path_from_hf,
@@ -31,6 +33,13 @@
 logger = init_logger(__name__)
 
 
+# Model types whose hub tokenizer_class is incorrect and should be overridden with
+# TokenizersBackend (the generic fast tokenizer). Adding a model type here is always a
+# temporary workaround and better long term solutions are:
+# - Add model type to MODELS_WITH_INCORRECT_HUB_TOKENIZER_CLASS in transformers (better)
+# - Fix tokenizer_class on the hub for the affected models (best)
+_MODEL_TYPES_WITH_INCORRECT_TOKENIZER_CLASS: set[str] = {"step3_vl"}
+
 _VLLM_TOKENIZERS = {
     "deepseek_v32": ("deepseek_v32", "DeepseekV32Tokenizer"),
     "grok2": ("grok2", "Grok2Tokenizer"),
@@ -202,7 +211,31 @@ def get_tokenizer(
         **kwargs,
     )
 
-    if tokenizer_cls == TokenizerLike:
+    # Ensure that, if the config were to come from vllm.transformers_utils.config, it is
+    # registered with AutoConfig before the tokenizer is loaded. This is necessary since
+    # tokenizer_cls_.from_pretrained will call AutoConfig.from_pretrained internally.
+    # This may fail for paths that don't have a model config (e.g. LoRA adapters),
+    # which is fine — those don't need custom config registration.
+    config = None
+    with contextlib.suppress(ValueError, OSError):
+        config = get_config(
+            tokenizer_name,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+        )
+
+    # Some models have an incorrect tokenizer_class on the hub.
+    # For these model types, bypass AutoTokenizer and use TokenizersBackend directly.
+    model_type = getattr(config, "model_type", None) if config else None
+    if model_type in _MODEL_TYPES_WITH_INCORRECT_TOKENIZER_CLASS:
+        from transformers.tokenization_utils_tokenizers import TokenizersBackend
+
+        logger.debug(
+            "Overriding tokenizer_class to TokenizersBackend for model_type=%r",
+            model_type,
+        )
+        tokenizer_cls_ = TokenizersBackend
+    elif tokenizer_cls == TokenizerLike:
         tokenizer_cls_ = TokenizerRegistry.load_tokenizer_cls(tokenizer_mode)
     else:
         tokenizer_cls_ = tokenizer_cls