huggingface · githubnemo · Jun 2, 2025 · May 26, 2025 · May 27, 2025
diff --git a/src/peft/mapping_func.py b/src/peft/mapping_func.py
@@ -109,6 +109,8 @@ def get_peft_model(
         # note: PeftMixedModel does not support autocast_adapter_dtype, so don't pass it
         return PeftMixedModel(model, peft_config, adapter_name=adapter_name)
 
+    # Prompt learning is explicitly excluded from this check because it is task-specific and needs special
+    # handling in the PEFT model's forward method.
     if peft_config.task_type not in MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys() and not peft_config.is_prompt_learning:
         return PeftModel(
             model,

diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py
@@ -605,9 +605,15 @@ def _setup_prompt_encoder(self, adapter_name: str):
                 # For reference refer to issue: https://github.com/huggingface/peft/issues/996
                 deepspeed_distributed_tensor_shape = getattr(value, "ds_shape", None)
 
-                if value.shape[0] == self.base_model.config.vocab_size or (
+                # VLM configs keep the language model settings in a nested `text_config`; read the vocab size there
+                if "text_config" in self.base_model.config:
+                    vocab_size = self.base_model.config.text_config.vocab_size
+                else:
+                    vocab_size = self.base_model.config.vocab_size
+
+                if value.shape[0] == vocab_size or (
                     deepspeed_distributed_tensor_shape is not None
-                    and deepspeed_distributed_tensor_shape[0] == self.base_model.config.vocab_size
+                    and deepspeed_distributed_tensor_shape[0] == vocab_size
                 ):
                     word_embeddings = transformer_backbone.get_submodule(named_param.replace(".weight", ""))
                     break

diff --git a/src/peft/utils/other.py b/src/peft/utils/other.py
@@ -891,6 +891,10 @@ def check_adapter_name(adapter_name):
 
 
 def _prepare_prompt_learning_config(peft_config, model_config):
+    # For a VLM, the settings read below come from the language model's nested `text_config`, not the top level.
+    if "text_config" in model_config:
+        model_config = model_config["text_config"]
+
     if peft_config.num_layers is None:
         if "num_hidden_layers" in model_config:
             num_layers = model_config["num_hidden_layers"]

diff --git a/tests/test_vision_models.py b/tests/test_vision_models.py
@@ -67,11 +67,11 @@ def test_past_kv(self):
         )
         processor = AutoProcessor.from_pretrained(model_id)
         raw_image = np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8)
-        inputs = processor(prompt, raw_image, return_tensors="pt")
+        inputs = processor(text=prompt, images=raw_image, return_tensors="pt")
 
         # get peft model
         peft_config = PrefixTuningConfig(task_type="CAUSAL_LM", num_virtual_tokens=20)
-        model.language_model = get_peft_model(model.language_model, peft_config)
+        model = get_peft_model(model, peft_config)
         # check that this does not raise
         model(**inputs, output_hidden_states=True)