diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py
index b0ea3869b5..d642282787 100644
--- a/src/peft/tuners/lora/model.py
+++ b/src/peft/tuners/lora/model.py
@@ -209,6 +209,7 @@ def _create_and_replace(
             "target_name": current_key,
             "loaded_in_8bit": getattr(self.model, "is_loaded_in_8bit", False),
             "loaded_in_4bit": getattr(self.model, "is_loaded_in_4bit", False),
+            "ephemeral_gpu_offload": lora_config.runtime_config.ephemeral_gpu_offload,
             "parameter_name": parameter_name,
         }
 
diff --git a/tests/test_common_gpu.py b/tests/test_common_gpu.py
index cac1a32128..a8a4c58c4d 100644
--- a/tests/test_common_gpu.py
+++ b/tests/test_common_gpu.py
@@ -64,6 +64,7 @@
     load_cat_image,
     require_bitsandbytes,
     require_deterministic_for_xpu,
+    require_gptqmodel,
     require_non_cpu,
     require_torch_multi_accelerator,
 )
@@ -519,6 +520,7 @@ def test_ia3_bnb_quantization_from_pretrained_safetensors(self, quantization):
             assert "default" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.ia3_l
             assert "adapter2" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.ia3_l
 
+    @require_gptqmodel
     @pytest.mark.single_gpu_tests
     def test_lora_gptq_quantization_from_pretrained_safetensors(self):
         r"""
diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py
index 53a7ae5bd1..cf431f7f69 100644
--- a/tests/test_custom_models.py
+++ b/tests/test_custom_models.py
@@ -1365,6 +1365,10 @@ def __init__(self, emb_size=100):
         super().__init__()
         self.emb = nn.Embedding(emb_size, 5)
         self.conv1d = Conv1D(1, 5)
+        # make sure that we have a good signal-to-noise ratio
+        # since apparently CUDA ReLU clips the gradient at a
+        # certain point.
+        self.conv1d.weight.data += 10
         self.relu = nn.ReLU()
         self.flat = nn.Flatten()
         self.lin0 = nn.Linear(10, 2)
diff --git a/tests/test_gpu_examples.py b/tests/test_gpu_examples.py
index 909942c9e6..71ccc928a8 100644
--- a/tests/test_gpu_examples.py
+++ b/tests/test_gpu_examples.py
@@ -592,7 +592,8 @@ def test_seq2seq_lm_training_single_gpu(self):
                 device_map={"": 0},
             )
 
-            assert set(model.hf_device_map.values()) == {0}
+            # note: transformers v5 doesn't set the device map if there's only one device
+            assert not hasattr(model, "hf_device_map") or set(model.hf_device_map.values()) == {0}
 
             tokenizer = AutoTokenizer.from_pretrained(self.seq2seq_model_id)
             model = prepare_model_for_kbit_training(model)