diff --git a/src/transformers/models/pe_audio_video/modeling_pe_audio_video.py b/src/transformers/models/pe_audio_video/modeling_pe_audio_video.py index f5eda9be46a7..48634fde149a 100644 --- a/src/transformers/models/pe_audio_video/modeling_pe_audio_video.py +++ b/src/transformers/models/pe_audio_video/modeling_pe_audio_video.py @@ -796,6 +796,7 @@ def get_text_audio_video_embeds(self, input_ids, attention_mask=None): text_outputs: MaskedLMOutput = self.text_model( input_ids=input_ids, attention_mask=attention_mask, + output_hidden_states=True, return_dict=True, ) text_embeds = text_outputs.hidden_states[-1][:, 0] @@ -851,6 +852,7 @@ def get_audio_plus_text_embeds( text_outputs: MaskedLMOutput = self.text_model( input_ids=input_ids, attention_mask=attention_mask, + output_hidden_states=True, return_dict=True, ) text_embeds = text_outputs.hidden_states[-1][:, 0] @@ -873,6 +875,7 @@ def get_video_plus_text_embeds( text_outputs: MaskedLMOutput = self.text_model( input_ids=input_ids, attention_mask=attention_mask, + output_hidden_states=True, return_dict=True, ) text_embeds = text_outputs.hidden_states[-1][:, 0] @@ -893,7 +896,10 @@ def forward( **kwargs, ) -> PeAudioVideoOutput: if sum([input_ids is not None, pixel_values_videos is not None, input_values is not None]) < 2: - raise ValueError("At least two of input_ids, pixel_values_videos, or input_values must be provided") + raise ValueError( + "At least two of input_ids, pixel_values_videos, or input_values must be provided. " + "For encoding individual modalities, get_*_embeds methods are available." + ) if pixel_values_videos is None: outputs = self.audio_model( diff --git a/src/transformers/models/pe_audio_video/modular_pe_audio_video.py b/src/transformers/models/pe_audio_video/modular_pe_audio_video.py index fc0d353585fb..649a5da8811e 100644 --- a/src/transformers/models/pe_audio_video/modular_pe_audio_video.py +++ b/src/transformers/models/pe_audio_video/modular_pe_audio_video.py @@ -588,6 +588,7 @@ def get_text_audio_video_embeds(self, input_ids, attention_mask=None): text_outputs: MaskedLMOutput = self.text_model( input_ids=input_ids, attention_mask=attention_mask, + output_hidden_states=True, return_dict=True, ) text_embeds = text_outputs.hidden_states[-1][:, 0] @@ -643,6 +644,7 @@ def get_audio_plus_text_embeds( text_outputs: MaskedLMOutput = self.text_model( input_ids=input_ids, attention_mask=attention_mask, + output_hidden_states=True, return_dict=True, ) text_embeds = text_outputs.hidden_states[-1][:, 0] @@ -665,6 +667,7 @@ def get_video_plus_text_embeds( text_outputs: MaskedLMOutput = self.text_model( input_ids=input_ids, attention_mask=attention_mask, + output_hidden_states=True, return_dict=True, ) text_embeds = text_outputs.hidden_states[-1][:, 0] @@ -685,7 +688,10 @@ def forward( **kwargs, ) -> PeAudioVideoOutput: if sum([input_ids is not None, pixel_values_videos is not None, input_values is not None]) < 2: - raise ValueError("At least two of input_ids, pixel_values_videos, or input_values must be provided") + raise ValueError( + "At least two of input_ids, pixel_values_videos, or input_values must be provided. " + "For encoding individual modalities, get_*_embeds methods are available." + ) if pixel_values_videos is None: outputs = self.audio_model( diff --git a/src/transformers/models/pe_audio_video/processing_pe_audio_video.py b/src/transformers/models/pe_audio_video/processing_pe_audio_video.py index a03b213b10c5..194abec2bf03 100644 --- a/src/transformers/models/pe_audio_video/processing_pe_audio_video.py +++ b/src/transformers/models/pe_audio_video/processing_pe_audio_video.py @@ -15,10 +15,8 @@ class PeAudioVideoProcessor(ProcessorMixin): - attributes = ["feature_extractor", "video_processor", "tokenizer"] - feature_extractor_class = "PeAudioFeatureExtractor" - tokenizer_class = "AutoTokenizer" - video_processor_class = "PeVideoVideoProcessor" + def __init__(self, feature_extractor=None, video_processor=None, tokenizer=None, **kwargs): + super().__init__(feature_extractor, video_processor, tokenizer, **kwargs) __all__ = ["PeAudioVideoProcessor"]