diff --git a/requirements.txt b/requirements.txt
index d34f468a1a..d2ae236219 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,7 +13,7 @@ packaging==23.2
 huggingface_hub>=1.1.7
 peft>=0.18.0
 tokenizers>=0.22.1
-transformers @ git+https://github.com/huggingface/transformers.git@main
+transformers @ git+https://github.com/huggingface/transformers.git@v5.0.0rc2
 accelerate==1.12.0
 datasets==4.4.2
 deepspeed>=0.18.3
diff --git a/setup.py b/setup.py
index 10c9a84539..1257f35331 100644
--- a/setup.py
+++ b/setup.py
@@ -64,7 +64,7 @@ def parse_requirements(extras_require_map):
 
             if (major, minor) >= (2, 9):
                 extras_require_map.pop("fbgemm-gpu")
-                extras_require_map["fbgemm-gpu"] = ["fbgemm-gpu-genai==1.4.1"]
+                extras_require_map["fbgemm-gpu"] = ["fbgemm-gpu-genai==1.4.2"]
                 extras_require_map["vllm"] = ["vllm==0.11.1"]
             elif (major, minor) >= (2, 8):
                 extras_require_map.pop("fbgemm-gpu")
diff --git a/tests/conftest.py b/tests/conftest.py
index 4c8c80cb7f..b542d377ba 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -83,6 +83,12 @@ def download_smollm2_135m_model():
     snapshot_download_w_retry("HuggingFaceTB/SmolLM2-135M", repo_type="model")
 
 
+@pytest.fixture(scope="session", autouse=True)
+def download_smollm2_135m_instruct_model():
+    # download the model
+    snapshot_download_w_retry("HuggingFaceTB/SmolLM2-135M-Instruct", repo_type="model")
+
+
 @pytest.fixture(scope="session", autouse=True)
 def download_smollm2_135m_gptq_model():
     # download the model
@@ -143,12 +149,20 @@ def download_argilla_distilabel_intel_orca_dpo_dataset():
     )
 
 
-# @pytest.fixture(scope="session", autouse=True)
-# def download_argilla_ultrafeedback_binarized_preferences_cleaned_dataset():
-#     # download the dataset
-#     snapshot_download_w_retry(
-#         "argilla/ultrafeedback-binarized-preferences-cleaned", repo_type="dataset"
-#     )
+@pytest.fixture(scope="session", autouse=True)
+def download_argilla_ultrafeedback_binarized_preferences_cleaned_dataset():
+    # download the dataset
+    snapshot_download_w_retry(
+        "argilla/ultrafeedback-binarized-preferences-cleaned", repo_type="dataset"
+    )
+
+
+@pytest.fixture(scope="session", autouse=True)
+def download_argilla_ultrafeedback_binarized_preferences_cleaned_kto_dataset():
+    # download the dataset
+    snapshot_download_w_retry(
+        "argilla/ultrafeedback-binarized-preferences-cleaned-kto", repo_type="dataset"
+    )
 
 
 # @pytest.fixture(scope="session", autouse=True)
@@ -251,7 +265,9 @@ def download_llama_1b_model_fixture():
 def download_llama3_8b_model_fixture():
-    # download the tokenizer only
+    # download the tokenizer and config only
     snapshot_download_w_retry(
-        "NousResearch/Meta-Llama-3-8B", repo_type="model", allow_patterns=["*token*"]
+        "NousResearch/Meta-Llama-3-8B",
+        repo_type="model",
+        allow_patterns=["*token*", "config.json"],
     )
 
 
@@ -261,7 +277,7 @@ def download_llama3_8b_instruct_model_fixture():
     snapshot_download_w_retry(
         "NousResearch/Meta-Llama-3-8B-Instruct",
         repo_type="model",
-        allow_patterns=["*token*"],
+        allow_patterns=["*token*", "config.json"],
     )
 
 
@@ -269,7 +285,19 @@ def download_llama3_8b_instruct_model_fixture():
 def download_phi_35_mini_model_fixture():
-    # download the tokenizer only
+    # download the tokenizer and config only
     snapshot_download_w_retry(
-        "microsoft/Phi-3.5-mini-instruct", repo_type="model", allow_patterns=["*token*"]
+        "microsoft/Phi-3.5-mini-instruct",
+        repo_type="model",
+        allow_patterns=["*token*", "config.json"],
+    )
+
+
+@pytest.fixture(scope="session", autouse=True)
+def download_phi_4_reasoning_model_fixture():
+    # download the tokenizer and config only
+    snapshot_download_w_retry(
+        "microsoft/Phi-4-reasoning",
+        repo_type="model",
+        allow_patterns=["*token*", "config.json"],
     )
 
 
@@ -279,7 +307,7 @@ def download_phi_3_medium_model_fixture():
     snapshot_download_w_retry(
         "microsoft/Phi-3-medium-128k-instruct",
         repo_type="model",
-        allow_patterns=["*token*"],
+        allow_patterns=["*token*", "config.json"],
     )
 
 
@@ -562,6 +590,8 @@ def test_load_fixtures(
     download_mhenrichsen_alpaca_2k_dataset,
     download_mhenrichsen_alpaca_2k_w_revision_dataset,
     download_mlabonne_finetome_100k_dataset,
+    download_argilla_ultrafeedback_binarized_preferences_cleaned_dataset,
+    download_argilla_ultrafeedback_binarized_preferences_cleaned_kto_dataset,
     download_argilla_distilabel_capybara_dpo_7k_binarized_dataset,
     download_arcee_ai_distilabel_intel_orca_dpo_pairs_dataset,
     download_argilla_dpo_pairs_dataset,
@@ -573,6 +603,7 @@ def test_load_fixtures(
     download_llama3_8b_instruct_model_fixture,
     download_phi_35_mini_model_fixture,
     download_phi_3_medium_model_fixture,
+    download_phi_4_reasoning_model_fixture,
     download_mistral_7b_model_fixture,
     download_gemma_2b_model_fixture,
     download_gemma2_9b_model_fixture,
diff --git a/tests/e2e/multigpu/test_gemma3.py b/tests/e2e/multigpu/test_gemma3.py
index 3c77c7107e..4b74fcbb04 100644
--- a/tests/e2e/multigpu/test_gemma3.py
+++ b/tests/e2e/multigpu/test_gemma3.py
@@ -28,13 +28,12 @@ class TestMultiGPUGemma3:
     Test case for Gemma3 models using LoRA
     """
 
-    @pytest.mark.skip(
-        reason="broken in transformers v5 due to embeddings bug fixed in https://github.com/huggingface/transformers/pull/42558"
-    )
     def test_lora_ddp_packed(self, temp_dir):
         cfg = DictDefault(
             {
                 "base_model": "axolotl-mirrors/gemma-3-4b-pt",
+                "model_type": "Gemma3ForCausalLM",
+                "cls_model_config": "Gemma3TextConfig",
                 "sequence_len": 2048,
                 "ddp_find_unused_parameters": True,
                 "sample_packing": True,
diff --git a/tests/prompt_strategies/test_chat_templates.py b/tests/prompt_strategies/test_chat_templates.py
index 911a97922c..90e0e274b7 100644
--- a/tests/prompt_strategies/test_chat_templates.py
+++ b/tests/prompt_strategies/test_chat_templates.py
@@ -140,13 +140,13 @@ def test_phi35(self, phi35_tokenizer, assistant_dataset):
         # fmt: off
         expected_input_ids = [
             32010,  # user
-            12199, 32007,  # user eot
+            22172, 32007,  # user eot
             32001,  # assistant
-            12199, 32007,  # assistant eot
+            22172, 32007,  # assistant eot
             32010,  # user
-            16773, 26966, 32007,  # user eot
+            1781, 26966, 32007,  # user eot
             32001,  # assistant
-            16773, 26966, 32007,  # assistant eot
+            1781, 26966, 32007,  # assistant eot
         ]
         expected_labels = [
             -100,  # user
@@ -156,7 +156,7 @@ def test_phi35(self, phi35_tokenizer, assistant_dataset):
             -100,  # user
             -100, -100, -100,  # user eot
             -100,  # assistant
-            16773, 26966, 32007,  # assistant eot
+            1781, 26966, 32007,  # assistant eot
         ]
         # fmt: on
         LOG.debug(f"Expected input_ids: {expected_input_ids}")
diff --git a/tests/test_tokenizers.py b/tests/test_tokenizers.py
index f308efbef3..114c2bea2d 100644
--- a/tests/test_tokenizers.py
+++ b/tests/test_tokenizers.py
@@ -84,7 +84,7 @@ def test_add_additional_special_tokens(self):
             }
         )
         tokenizer = load_tokenizer(cfg)
-        assert tokenizer("<|im_start|>user")["input_ids"] == [1, 32000, 1792]
+        assert tokenizer("<|im_start|>user")["input_ids"] == [1, 32000, 1404]
         assert len(tokenizer) == 32001
 
         # ensure reloading the tokenizer again from cfg results in same vocab length