diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 5fd8ff53b2a9..bd4feaf7c6a7 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -2246,7 +2246,7 @@ def set_encoder(self, encoder, modality: str | None = None): # NOTE: new models need to use existing names for layers if possible, so this list doesn't grow infinitely if modality in ["image", "video"]: possible_module_names = ["vision_tower", "visual", "vision_model", "vision_encoder", "image_tower"] - if modality == "audio": + elif modality == "audio": possible_module_names = ["audio_tower", "audio_encoder"] elif modality is None: possible_module_names = ["text_encoder", "encoder"]