diff --git a/requirements/test.in b/requirements/test.in
index 3be580db0674..c5d2c4cd4c30 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -34,7 +34,7 @@ num2words # required for smolvlm test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.8 # required for model evaluation test
-transformers==4.51.1
+transformers==4.51.3
 tokenizers==0.21.1
 huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads.
 schemathesis>=3.39.15 # Required for openai schema test.
diff --git a/requirements/test.txt b/requirements/test.txt
index 6dcd4ff01460..9642a5bfe68d 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -737,7 +737,7 @@ tqdm==4.66.6
     #   transformers
 tqdm-multiprocess==0.0.11
     # via lm-eval
-transformers==4.51.1
+transformers==4.51.3
     # via
     #   -r requirements/test.in
     #   genai-perf
diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py
index 79fa3fa99773..85714b85e7eb 100644
--- a/tests/models/decoder_only/language/test_models.py
+++ b/tests/models/decoder_only/language/test_models.py
@@ -9,6 +9,7 @@
 
 from vllm.platforms import current_platform
 
+from ...registry import HF_EXAMPLE_MODELS
 from ...utils import check_logprobs_close
 
 # These have unsupported head_dim for FA. We do not
@@ -33,54 +34,50 @@
 # @maybe_test_rocm_aiter
 @pytest.mark.parametrize(
-    "model",
+    "model_arch",
     [
         pytest.param(
-            "bigscience/bloom-560m",  # bloom - testing alibi slopes
+            "BloomForCausalLM",  # testing alibi slopes
            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
         ),
         pytest.param(
-            "openai-community/gpt2",  # gpt2
+            "GPT2LMHeadModel",  # gpt2
             marks=[pytest.mark.core_model, pytest.mark.cpu_model],
         ),
-        pytest.param("Milos/slovak-gpt-j-405M"),  # gptj
-        pytest.param("bigcode/tiny_starcoder_py"),  # gpt_bigcode
-        pytest.param("EleutherAI/pythia-70m"),  # gpt_neox
+        pytest.param("GPTJForCausalLM"),
+        pytest.param("GPTBigCodeForCausalLM"),
+        pytest.param("GPTNeoXForCausalLM"),
         pytest.param(
-            "google/gemma-1.1-2b-it",  # gemma
+            "GemmaForCausalLM",  # gemma
             marks=[pytest.mark.core_model, pytest.mark.cpu_model],
         ),
+        pytest.param("GlmForCausalLM"),
         pytest.param(
-            "THUDM/chatglm3-6b",  # chatglm (text-only)
-        ),
-        pytest.param(
-            "meta-llama/Llama-3.2-1B-Instruct",  # llama
+            "LlamaForCausalLM",
             marks=[pytest.mark.core_model, pytest.mark.cpu_model],
         ),
         pytest.param(
-            "openbmb/MiniCPM3-4B",
+            "MiniCPM3ForCausalLM",
             # fused_moe not supported on CPU
             marks=[pytest.mark.core_model],
         ),
         pytest.param(
-            "facebook/opt-125m",  # opt
+            "OPTForCausalLM",
             marks=[pytest.mark.core_model, pytest.mark.cpu_model],
         ),
         pytest.param(
-            "microsoft/phi-2",  # phi
+            "PhiForCausalLM",
             marks=[pytest.mark.core_model],
         ),
+        pytest.param("QWenLMHeadModel", ),
         pytest.param(
-            "Qwen/Qwen-7B",  # qwen (text-only)
-        ),
-        pytest.param(
-            "Qwen/Qwen2.5-0.5B-Instruct",  # qwen2
+            "Qwen2ForCausalLM",
             marks=[pytest.mark.core_model],
         ),
-        pytest.param("stabilityai/stablelm-3b-4e1t"),  # stablelm
-        pytest.param("bigcode/starcoder2-3b"),  # starcoder2
+        pytest.param("StableLmForCausalLM"),
+        pytest.param("Starcoder2ForCausalLM"),
         pytest.param(
-            "ehristoforu/Falcon3-MoE-2x7B-Insruct",  # mixtral
+            "MixtralForCausalLM",
             marks=[pytest.mark.cpu_model],
         )
     ])
@@ -89,10 +86,12 @@
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize(
     "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
-def test_models(hf_runner, vllm_runner, example_prompts, model: str,
+def test_models(hf_runner, vllm_runner, example_prompts, model_arch: str,
                 dtype: str, max_tokens: int, num_logprobs: int,
                 use_rocm_aiter: bool, monkeypatch) -> None:
+    model = HF_EXAMPLE_MODELS.get_hf_info(model_arch).default
+
     if model in REQUIRES_V0:
         monkeypatch.setenv("VLLM_USE_V1", "0")
diff --git a/tests/models/registry.py b/tests/models/registry.py
index c15ae3619844..6b1ec64115e3 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -123,7 +123,8 @@ def check_available_online(
     "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B"),
     "BloomForCausalLM": _HfExamplesInfo("bigscience/bloomz-1b1"),
     "ChatGLMModel": _HfExamplesInfo("THUDM/chatglm3-6b",
-                                    trust_remote_code=True),
+                                    trust_remote_code=True,
+                                    max_transformers_version="4.51.1"),
     "ChatGLMForConditionalGeneration": _HfExamplesInfo("thu-coai/ShieldLM-6B-chatglm3",  # noqa: E501
                                                        trust_remote_code=True),
     "CohereForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r-v01",
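
A minimal sketch of the registry pattern this diff relies on: tests now parametrize on the architecture name and resolve the actual checkpoint through the shared registry, while a max_transformers_version cap can gate out architectures whose remote code breaks on a newer transformers release (here ChatGLMModel on 4.51.3). The field layout and the check_transformers_version helper below are assumptions inferred from the hunks above, not vLLM's actual implementation:

# Hypothetical sketch; names mirror tests/models/registry.py, bodies are assumed.
from dataclasses import dataclass
from typing import Optional

import pytest
from packaging.version import Version
from transformers import __version__ as TRANSFORMERS_VERSION


@dataclass(frozen=True)
class _HfExamplesInfo:
    default: str  # default HF checkpoint for this architecture
    trust_remote_code: bool = False
    max_transformers_version: Optional[str] = None  # last known-good version

    def check_transformers_version(self) -> None:
        # Skip (rather than fail) when the installed transformers is newer
        # than the last version this model is known to work with.
        cap = self.max_transformers_version
        if cap is not None and Version(TRANSFORMERS_VERSION) > Version(cap):
            pytest.skip(f"{self.default} requires transformers<={cap}")


class HfExampleModels:
    def __init__(self, registry: dict[str, _HfExamplesInfo]) -> None:
        self._registry = registry

    def get_hf_info(self, model_arch: str) -> _HfExamplesInfo:
        return self._registry[model_arch]


HF_EXAMPLE_MODELS = HfExampleModels({
    # With transformers==4.51.3 pinned in requirements, this entry is skipped.
    "ChatGLMModel": _HfExamplesInfo("THUDM/chatglm3-6b",
                                    trust_remote_code=True,
                                    max_transformers_version="4.51.1"),
    "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.2-1B-Instruct"),
})

# A test then resolves its checkpoint exactly as in the diff:
#     model = HF_EXAMPLE_MODELS.get_hf_info(model_arch).default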