huggingface · zucchini-nlp · May 12, 2026 · May 10, 2026 · May 10, 2026 · May 10, 2026
diff --git a/docs/source/en/model_doc/pe_audio_video.md b/docs/source/en/model_doc/pe_audio_video.md
@@ -13,21 +13,24 @@ specific language governing permissions and limitations under the License.
 rendered properly in your Markdown viewer.
 
 -->
-*This model was released on {release_date} and added to Hugging Face Transformers on 2025-12-16.*
+*This model was released on 2025-12-16 and added to Hugging Face Transformers on 2025-12-16.*
 
-# PE Audio Video (Perception Encoder Audio-Video)
+# PE Audio-Visual (Perception Encoder Audio-Visual)
 
 ## Overview
 
-TODO
+Perception Encoder Audio-Visual (PE-AV) was proposed in [Pushing the Frontier of Audiovisual Perception with Large-Scale Multimodal Correspondence Learning](https://huggingface.co/papers/2512.19687) by Apoorv Vyas et al. It extends the Perception Encoder framework to multiple modalities (text, audio, and video).
 
-## Usage
+PE-AV is a family of encoders trained on O(100M) audio-video pairs with synthetic captions, using ten pairwise contrastive objectives to align all three modalities in a shared embedding space.
 
-### Basic usage
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/pe-av.png"
+alt="PE-AV architecture" width="600"/>
 
-```py
-TODO
-```
+<small>PE-AV architecture. Taken from the <a href="https://ai.meta.com/blog/sam-audio/">Meta AI blog post</a>.</small>
+
+Audio and video are processed by dedicated encoders and combined through an Audio-Visual Fusion Encoder, while captions pass through a separate Text Encoder. Each branch produces CLS embeddings: `CLS-A`, `CLS-V`, and `CLS-AV` for the audio, video, and fused audio-video views, and `CLS-AT`, `CLS-VT`, and `CLS-AVT` for the text-projection variants aligned to each of those targets. The model is trained with a combination of single-modality and fused-modality alignment losses.
+
+See the [PE-AV collection](https://huggingface.co/collections/facebook/perception-encoder-audio-visual) on the Hub for the full checkpoint family and usage examples.
 
 ## PeAudioVideoProcessor
 

diff --git a/src/transformers/models/pe_audio_video/modeling_pe_audio_video.py b/src/transformers/models/pe_audio_video/modeling_pe_audio_video.py
@@ -796,6 +796,7 @@ def get_text_audio_video_embeds(self, input_ids, attention_mask=None):
         text_outputs: MaskedLMOutput = self.text_model(
             input_ids=input_ids,
             attention_mask=attention_mask,
+            output_hidden_states=True,
             return_dict=True,
         )
         text_embeds = text_outputs.hidden_states[-1][:, 0]
@@ -851,6 +852,7 @@ def get_audio_plus_text_embeds(
         text_outputs: MaskedLMOutput = self.text_model(
             input_ids=input_ids,
             attention_mask=attention_mask,
+            output_hidden_states=True,
             return_dict=True,
         )
         text_embeds = text_outputs.hidden_states[-1][:, 0]
@@ -873,6 +875,7 @@ def get_video_plus_text_embeds(
         text_outputs: MaskedLMOutput = self.text_model(
             input_ids=input_ids,
             attention_mask=attention_mask,
+            output_hidden_states=True,
             return_dict=True,
         )
         text_embeds = text_outputs.hidden_states[-1][:, 0]
@@ -893,7 +896,10 @@ def forward(
         **kwargs,
     ) -> PeAudioVideoOutput:
         if sum([input_ids is not None, pixel_values_videos is not None, input_values is not None]) < 2:
-            raise ValueError("At least two of input_ids, pixel_values_videos, or input_values must be provided")
+            raise ValueError(
+                "At least two of input_ids, pixel_values_videos, or input_values must be provided. "
+                "For encoding individual modalities, get_*_embeds methods are available."
+            )
 
         if pixel_values_videos is None:
             outputs = self.audio_model(

diff --git a/src/transformers/models/pe_audio_video/modular_pe_audio_video.py b/src/transformers/models/pe_audio_video/modular_pe_audio_video.py
@@ -588,6 +588,7 @@ def get_text_audio_video_embeds(self, input_ids, attention_mask=None):
         text_outputs: MaskedLMOutput = self.text_model(
             input_ids=input_ids,
             attention_mask=attention_mask,
+            output_hidden_states=True,
             return_dict=True,
         )
         text_embeds = text_outputs.hidden_states[-1][:, 0]
@@ -643,6 +644,7 @@ def get_audio_plus_text_embeds(
         text_outputs: MaskedLMOutput = self.text_model(
             input_ids=input_ids,
             attention_mask=attention_mask,
+            output_hidden_states=True,
             return_dict=True,
         )
         text_embeds = text_outputs.hidden_states[-1][:, 0]
@@ -665,6 +667,7 @@ def get_video_plus_text_embeds(
         text_outputs: MaskedLMOutput = self.text_model(
             input_ids=input_ids,
             attention_mask=attention_mask,
+            output_hidden_states=True,
             return_dict=True,
         )
         text_embeds = text_outputs.hidden_states[-1][:, 0]
@@ -685,7 +688,10 @@ def forward(
         **kwargs,
     ) -> PeAudioVideoOutput:
         if sum([input_ids is not None, pixel_values_videos is not None, input_values is not None]) < 2:
-            raise ValueError("At least two of input_ids, pixel_values_videos, or input_values must be provided")
+            raise ValueError(
+                "At least two of input_ids, pixel_values_videos, or input_values must be provided. "
+                "For encoding individual modalities, get_*_embeds methods are available."
+            )
 
         if pixel_values_videos is None:
             outputs = self.audio_model(

diff --git a/src/transformers/models/pe_audio_video/processing_pe_audio_video.py b/src/transformers/models/pe_audio_video/processing_pe_audio_video.py
@@ -15,10 +15,8 @@
 
 
 class PeAudioVideoProcessor(ProcessorMixin):
-    attributes = ["feature_extractor", "video_processor", "tokenizer"]
-    feature_extractor_class = "PeAudioFeatureExtractor"
-    tokenizer_class = "AutoTokenizer"
-    video_processor_class = "PeVideoVideoProcessor"
+    def __init__(self, feature_extractor=None, video_processor=None, tokenizer=None, **kwargs):
+        super().__init__(feature_extractor, video_processor, tokenizer, **kwargs)
 
 
 __all__ = ["PeAudioVideoProcessor"]