Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -796,6 +796,7 @@ def get_text_audio_video_embeds(self, input_ids, attention_mask=None):
text_outputs: MaskedLMOutput = self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
output_hidden_states=True,
return_dict=True,
)
text_embeds = text_outputs.hidden_states[-1][:, 0]
Expand Down Expand Up @@ -851,6 +852,7 @@ def get_audio_plus_text_embeds(
text_outputs: MaskedLMOutput = self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
output_hidden_states=True,
return_dict=True,
)
text_embeds = text_outputs.hidden_states[-1][:, 0]
Expand All @@ -873,6 +875,7 @@ def get_video_plus_text_embeds(
text_outputs: MaskedLMOutput = self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
output_hidden_states=True,
return_dict=True,
)
text_embeds = text_outputs.hidden_states[-1][:, 0]
Expand All @@ -893,7 +896,10 @@ def forward(
**kwargs,
) -> PeAudioVideoOutput:
if sum([input_ids is not None, pixel_values_videos is not None, input_values is not None]) < 2:
raise ValueError("At least two of input_ids, pixel_values_videos, or input_values must be provided")
raise ValueError(
"At least two of input_ids, pixel_values_videos, or input_values must be provided. "
"For encoding individual modalities, get_*_embeds methods are available."
)

if pixel_values_videos is None:
outputs = self.audio_model(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,7 @@ def get_text_audio_video_embeds(self, input_ids, attention_mask=None):
text_outputs: MaskedLMOutput = self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
output_hidden_states=True,
return_dict=True,
)
text_embeds = text_outputs.hidden_states[-1][:, 0]
Expand Down Expand Up @@ -643,6 +644,7 @@ def get_audio_plus_text_embeds(
text_outputs: MaskedLMOutput = self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
output_hidden_states=True,
return_dict=True,
)
text_embeds = text_outputs.hidden_states[-1][:, 0]
Expand All @@ -665,6 +667,7 @@ def get_video_plus_text_embeds(
text_outputs: MaskedLMOutput = self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
output_hidden_states=True,
return_dict=True,
)
text_embeds = text_outputs.hidden_states[-1][:, 0]
Expand All @@ -685,7 +688,10 @@ def forward(
**kwargs,
) -> PeAudioVideoOutput:
if sum([input_ids is not None, pixel_values_videos is not None, input_values is not None]) < 2:
raise ValueError("At least two of input_ids, pixel_values_videos, or input_values must be provided")
raise ValueError(
"At least two of input_ids, pixel_values_videos, or input_values must be provided. "
"For encoding individual modalities, get_*_embeds methods are available."
)

if pixel_values_videos is None:
outputs = self.audio_model(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,8 @@


class PeAudioVideoProcessor(ProcessorMixin):
attributes = ["feature_extractor", "video_processor", "tokenizer"]
feature_extractor_class = "PeAudioFeatureExtractor"
tokenizer_class = "AutoTokenizer"
video_processor_class = "PeVideoVideoProcessor"
def __init__(self, feature_extractor=None, video_processor=None, tokenizer=None, **kwargs):
super().__init__(feature_extractor, video_processor, tokenizer, **kwargs)
Comment on lines +18 to +19

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

niiice!



__all__ = ["PeAudioVideoProcessor"]
Loading