
Commit 079e45a (1 parent be9bce3)
Address changes in transformers VLM architecture (#2554)
[transformers PR #37033](huggingface/transformers#37033) re-arranges the way visual language models are built by moving the LM head from the language model to the top-level VLM (among other things). This breaks the following test:

```python
peft_config = PrefixTuningConfig(task_type="CAUSAL_LM", num_virtual_tokens=20)
model.language_model = get_peft_model(model.language_model, peft_config)
```

The reason is that all soft-prompting methods need a task type, since each task type has specific handling of the soft prompt (e.g., padding the labels according to the number of virtual tokens for causal LM). We also can't simply use `task_type='FEATURE_EXTRACTION'`, as this would not deal with `labels` either. Luckily, the VLM behaves almost like a LM (e.g., `get_input_embeddings` refers to the underlying LM), so we can target the VLM itself. The soft-prompting methods then need to detect that we're fine-tuning a VLM and take the relevant config values from `base_model.config.text_config` instead of `base_model.config` directly.
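To make the task-type requirement concrete, here is a minimal sketch of the causal-LM label padding the message alludes to (illustrative values, not code from this commit): the soft prompt prepends virtual tokens to the input, so the labels must be padded to the same length.

```python
# Minimal sketch of the CAUSAL_LM label padding alluded to above;
# shapes and values are illustrative, not taken from this commit.
import torch

num_virtual_tokens = 20
labels = torch.tensor([[42, 7, 13, 99]])  # (batch_size=1, seq_len=4)

# Prepend -100 (ignored by cross-entropy) for each virtual token so the
# labels line up with the input sequence lengthened by the soft prompt.
prefix = torch.full((labels.size(0), num_virtual_tokens), -100)
padded_labels = torch.cat((prefix, labels), dim=1)  # shape (1, 24)
```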

File tree: 4 files changed, +16 −4 lines

src/peft/mapping_func.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -109,6 +109,8 @@ def get_peft_model(
         # note: PeftMixedModel does not support autocast_adapter_dtype, so don't pass it
         return PeftMixedModel(model, peft_config, adapter_name=adapter_name)
 
+    # We explicitly exclude prompt learning here since prompt learning is specific to the task and needs special
+    # handling in the PEFT model's forward method.
     if peft_config.task_type not in MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys() and not peft_config.is_prompt_learning:
         return PeftModel(
             model,
```
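The guard above hinges on `is_prompt_learning`. A quick sketch of what it distinguishes, assuming a recent `peft` install (the configs below are illustrative):

```python
# is_prompt_learning separates soft-prompt configs (which need task-specific
# forward handling) from adapter configs such as LoRA (generic handling suffices).
from peft import LoraConfig, PrefixTuningConfig

lora_config = LoraConfig()
prefix_config = PrefixTuningConfig(task_type="CAUSAL_LM", num_virtual_tokens=20)

assert not lora_config.is_prompt_learning    # may fall through to plain PeftModel
assert prefix_config.is_prompt_learning      # needs a task-specific PEFT model class
```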

src/peft/peft_model.py

Lines changed: 8 additions & 2 deletions

```diff
@@ -605,9 +605,15 @@ def _setup_prompt_encoder(self, adapter_name: str):
             # For reference refer to issue: https://github.com/huggingface/peft/issues/996
             deepspeed_distributed_tensor_shape = getattr(value, "ds_shape", None)
 
-            if value.shape[0] == self.base_model.config.vocab_size or (
+            # Handle VLM case with separate text and vision configs
+            if "text_config" in self.base_model.config:
+                vocab_size = self.base_model.config.text_config.vocab_size
+            else:
+                vocab_size = self.base_model.config.vocab_size
+
+            if value.shape[0] == vocab_size or (
                 deepspeed_distributed_tensor_shape is not None
-                and deepspeed_distributed_tensor_shape[0] == self.base_model.config.vocab_size
+                and deepspeed_distributed_tensor_shape[0] == vocab_size
             ):
                 word_embeddings = transformer_backbone.get_submodule(named_param.replace(".weight", ""))
                 break
```
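The same lookup, pulled out as a standalone helper for illustration; `resolve_vocab_size` is a hypothetical name, not part of PEFT:

```python
# Hypothetical helper mirroring the diff above: prefer the nested text
# config when the model is a VLM, otherwise read vocab_size directly.
def resolve_vocab_size(config):
    # `"text_config" in config` relies on PretrainedConfig's membership
    # support, exactly as the diff does.
    if "text_config" in config:
        return config.text_config.vocab_size
    return config.vocab_size
```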

src/peft/utils/other.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -891,6 +891,10 @@ def check_adapter_name(adapter_name):
 
 
 def _prepare_prompt_learning_config(peft_config, model_config):
+    # In case of VLM we focus on the language model portion of the model.
+    if "text_config" in model_config:
+        model_config = model_config["text_config"]
+
     if peft_config.num_layers is None:
         if "num_hidden_layers" in model_config:
             num_layers = model_config["num_hidden_layers"]
```
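Since the subscript access in the diff shows that `model_config` is a plain dict here, the narrowing can be illustrated without loading a model (the dict values below are made up):

```python
# Illustration of the config narrowing above; the values are made up.
vlm_config = {
    "model_type": "llava",
    "text_config": {"num_hidden_layers": 32, "vocab_size": 32064},
    "vision_config": {"num_hidden_layers": 24},
}

if "text_config" in vlm_config:
    vlm_config = vlm_config["text_config"]

# Prompt-learning defaults now come from the language-model portion.
assert vlm_config["num_hidden_layers"] == 32
```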

tests/test_vision_models.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -67,11 +67,11 @@ def test_past_kv(self):
         )
         processor = AutoProcessor.from_pretrained(model_id)
         raw_image = np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8)
-        inputs = processor(prompt, raw_image, return_tensors="pt")
+        inputs = processor(text=prompt, images=raw_image, return_tensors="pt")
 
         # get peft model
         peft_config = PrefixTuningConfig(task_type="CAUSAL_LM", num_virtual_tokens=20)
-        model.language_model = get_peft_model(model.language_model, peft_config)
+        model = get_peft_model(model, peft_config)
         # check that this does not raise
         model(**inputs, output_hidden_states=True)
```
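For reference, a self-contained sketch of the updated usage pattern; the checkpoint id and prompt are illustrative and not taken from the test file:

```python
# Sketch assuming a LLaVA-style checkpoint; id and prompt are illustrative.
import numpy as np
from transformers import AutoProcessor, LlavaForConditionalGeneration
from peft import PrefixTuningConfig, get_peft_model

model_id = "llava-hf/llava-1.5-7b-hf"  # illustrative, not the test's model
model = LlavaForConditionalGeneration.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

prompt = "USER: <image>\nWhat is shown? ASSISTANT:"
raw_image = np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8)
inputs = processor(text=prompt, images=raw_image, return_tensors="pt")

# Wrap the whole VLM rather than model.language_model.
peft_config = PrefixTuningConfig(task_type="CAUSAL_LM", num_virtual_tokens=20)
model = get_peft_model(model, peft_config)

model(**inputs, output_hidden_states=True)  # should not raise
```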
