From c6b06b513665e52b27053e058d512c2997a92245 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 30 Jan 2023 14:14:25 +0000 Subject: [PATCH 001/197] add model like clip --- README.md | 1 + README_es.md | 35 +- README_hd.md | 35 +- README_ja.md | 35 +- README_ko.md | 35 +- README_zh-hans.md | 35 +- README_zh-hant.md | 35 +- docs/source/en/index.mdx | 2 + docs/source/en/model_doc/clap.mdx | 96 ++ docs/source/en/serialization.mdx | 1 + src/transformers/__init__.py | 40 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + .../models/auto/feature_extraction_auto.py | 1 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 2 + .../models/auto/processing_auto.py | 1 + .../models/auto/tokenization_auto.py | 7 + src/transformers/models/clap/__init__.py | 121 ++ .../models/clap/configuration_clap.py | 402 +++++ .../convert_clap_original_pytorch_to_hf.py | 148 ++ .../models/clap/feature_extraction_clap.py | 33 + .../models/clap/image_processing_clap.py | 338 +++++ src/transformers/models/clap/modeling_clap.py | 1335 +++++++++++++++++ .../models/clap/processing_clap.py | 146 ++ .../models/clap/tokenization_clap.py | 523 +++++++ .../models/clap/tokenization_clap_fast.py | 173 +++ src/transformers/utils/dummy_pt_objects.py | 45 + .../utils/dummy_tokenizers_objects.py | 7 + .../utils/dummy_vision_objects.py | 14 + tests/models/clap/__init__.py | 0 .../models/clap/test_image_processing_clap.py | 305 ++++ tests/models/clap/test_modeling_clap.py | 737 +++++++++ tests/models/clap/test_processor_clap.py | 202 +++ tests/models/clap/test_tokenization_clap.py | 186 +++ 35 files changed, 4979 insertions(+), 102 deletions(-) create mode 100644 docs/source/en/model_doc/clap.mdx create mode 100644 src/transformers/models/clap/__init__.py create mode 100644 src/transformers/models/clap/configuration_clap.py create mode 100644 src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py create mode 100644 src/transformers/models/clap/feature_extraction_clap.py create mode 100644 src/transformers/models/clap/image_processing_clap.py create mode 100644 src/transformers/models/clap/modeling_clap.py create mode 100644 src/transformers/models/clap/processing_clap.py create mode 100644 src/transformers/models/clap/tokenization_clap.py create mode 100644 src/transformers/models/clap/tokenization_clap_fast.py create mode 100644 tests/models/clap/__init__.py create mode 100644 tests/models/clap/test_image_processing_clap.py create mode 100644 tests/models/clap/test_modeling_clap.py create mode 100644 tests/models/clap/test_processor_clap.py create mode 100644 tests/models/clap/test_tokenization_clap.py diff --git a/README.md b/README.md index 71bdd607fbda..97864251c155 100644 --- a/README.md +++ b/README.md @@ -294,6 +294,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. 
Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. +1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. diff --git a/README_es.md b/README_es.md index 6552f843978c..5d9690fc856d 100644 --- a/README_es.md +++ b/README_es.md @@ -264,7 +264,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 🤗 Transformers actualmente proporciona las siguientes arquitecturas (ver [aquí](https://huggingface.co/docs/transformers/model_summary) para un resumen de alto nivel de cada uno de ellas.): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[AltCLIP](https://huggingface.co/docs/transformers/main/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. +1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. 1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. 1. 
**[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. @@ -275,11 +275,11 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. +1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. 
Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. **[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry. 1. **[BridgeTower](https://huggingface.co/docs/transformers/main/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. @@ -287,6 +287,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. +1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. 
**[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. @@ -310,7 +311,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. -1. **[EfficientFormer](https://huggingface.co/docs/transformers/main/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. +1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu. @@ -320,7 +321,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. 
**[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. +1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. @@ -328,8 +329,8 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori. 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. 1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. -1. 
**[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. -1. **[Graphormer](https://huggingface.co/docs/transformers/main/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. @@ -350,7 +351,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. 1. 
**[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei. -1. **[Mask2Former](https://huggingface.co/docs/transformers/main/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. +1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. 1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. @@ -368,7 +369,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. -1. **[OneFormer](https://huggingface.co/docs/transformers/main/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi. +1. 
**[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi. 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. @@ -386,8 +387,8 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/main/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. -1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. +1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. +1. 
**[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. @@ -398,28 +399,28 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. -1. **[Swin2SR](https://huggingface.co/docs/transformers/main/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. -1. **[SwitchTransformers](https://huggingface.co/docs/transformers/main/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. +1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. +1. 
**[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). -1. **[TimeSformer](https://huggingface.co/docs/transformers/main/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. +1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 1. 
**[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. -1. **[UPerNet](https://huggingface.co/docs/transformers/main/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun. +1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun. 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. 
**[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/README_hd.md b/README_hd.md index e2bdfe7cfb03..d37eab25253e 100644 --- a/README_hd.md +++ b/README_hd.md @@ -236,7 +236,7 @@ conda install -c huggingface transformers 🤗 ट्रांसफॉर्मर वर्तमान में निम्नलिखित आर्किटेक्चर का समर्थन करते हैं (मॉडल के अवलोकन के लिए [यहां] देखें (https://huggingface.co/docs/transformers/model_summary)): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (Google Research and the Toyota Technological Institute at Chicago) साथ थीसिस [ALBERT: A Lite BERT for Self-supervised भाषा प्रतिनिधित्व सीखना](https://arxiv.org/abs/1909.11942), झेंझोंग लैन, मिंगदा चेन, सेबेस्टियन गुडमैन, केविन गिम्पेल, पीयूष शर्मा, राडू सोरिकट -1. **[AltCLIP](https://huggingface.co/docs/transformers/main/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. +1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. 1. 
**[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (फेसबुक) साथ थीसिस [बार्ट: प्राकृतिक भाषा निर्माण, अनुवाद के लिए अनुक्रम-से-अनुक्रम पूर्व प्रशिक्षण , और समझ] (https://arxiv.org/pdf/1910.13461.pdf) पर निर्भर माइक लुईस, यिनहान लियू, नमन गोयल, मार्जन ग़ज़विनिनेजाद, अब्देलरहमान मोहम्मद, ओमर लेवी, वेस स्टोयानोव और ल्यूक ज़ेटलमॉयर 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (से École polytechnique) साथ थीसिस [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) पर निर्भर Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis रिहाई। @@ -247,11 +247,11 @@ conda install -c huggingface transformers 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (VinAI Research से) साथ में पेपर [BERTweet: अंग्रेजी ट्वीट्स के लिए एक पूर्व-प्रशिक्षित भाषा मॉडल] (https://aclanthology.org/2020.emnlp-demos.2/) डाट क्वोक गुयेन, थान वु और अन्ह तुआन गुयेन द्वारा प्रकाशित। 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (गूगल रिसर्च से) साथ वाला पेपर [बिग बर्ड: ट्रांसफॉर्मर्स फॉर लॉन्गर सीक्वेंस](https://arxiv .org/abs/2007.14062) मंज़िल ज़हीर, गुरु गुरुगणेश, अविनावा दुबे, जोशुआ आइंस्ली, क्रिस अल्बर्टी, सैंटियागो ओंटानोन, फिलिप फाम, अनिरुद्ध रावुला, किफ़ान वांग, ली यांग, अमर अहमद द्वारा। 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (गूगल रिसर्च से) साथ में पेपर [बिग बर्ड: ट्रांसफॉर्मर्स फॉर लॉन्गर सीक्वेंस](https://arxiv.org/abs/2007.14062) मंज़िल ज़हीर, गुरु गुरुगणेश, अविनावा दुबे, जोशुआ आइंस्ली, क्रिस अल्बर्टी, सैंटियागो ओंटानन, फिलिप फाम द्वारा , अनिरुद्ध रावुला, किफ़ान वांग, ली यांग, अमर अहमद द्वारा पोस्ट किया गया। -1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. +1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. 
**[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (फेसबुक से) साथ में कागज [एक ओपन-डोमेन चैटबॉट बनाने की विधि](https://arxiv.org /abs/2004.13637) स्टीफन रोलर, एमिली दीनन, नमन गोयल, दा जू, मैरी विलियमसन, यिनहान लियू, जिंग जू, मायल ओट, कर्ट शस्टर, एरिक एम। स्मिथ, वाई-लैन बॉरो, जेसन वेस्टन द्वारा। 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (फेसबुक से) साथ में पेपर [एक ओपन-डोमेन चैटबॉट बनाने की रेसिपी](https://arxiv .org/abs/2004.13637) स्टीफन रोलर, एमिली दीनन, नमन गोयल, दा जू, मैरी विलियमसन, यिनहान लियू, जिंग जू, मायल ओट, कर्ट शस्टर, एरिक एम स्मिथ, वाई-लैन बॉरो, जेसन वेस्टन द्वारा। -1. **[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigSicence Workshop](https://bigscience.huggingface.co/). 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (एलेक्सा से) कागज के साथ [बीईआरटी के लिए ऑप्टिमल सबआर्किटेक्चर एक्सट्रैक्शन](https://arxiv.org/abs/ 2010.10499) एड्रियन डी विंटर और डैनियल जे पेरी द्वारा। 1. **[BridgeTower](https://huggingface.co/docs/transformers/main/model_doc/bridgetower)** (हरबिन इंस्टिट्यूट ऑफ़ टेक्नोलॉजी/माइक्रोसॉफ्ट रिसर्च एशिया/इंटेल लैब्स से) कागज के साथ [ब्रिजटॉवर: विजन-लैंग्वेज रिप्रेजेंटेशन लर्निंग में एनकोडर्स के बीच ब्रिज बनाना]() by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. @@ -259,6 +259,7 @@ conda install -c huggingface transformers 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (इनरिया/फेसबुक/सोरबोन से) साथ में कागज [CamemBERT: एक टेस्टी फ्रेंच लैंग्वेज मॉडल](https:// arxiv.org/abs/1911.03894) लुई मार्टिन*, बेंजामिन मुलर*, पेड्रो जेवियर ऑर्टिज़ सुआरेज़*, योआन ड्यूपॉन्ट, लॉरेंट रोमरी, एरिक विलेमोन्टे डे ला क्लर्जरी, जैमे सेडाह और बेनोइट सगोट द्वारा। 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google रिसर्च से) साथ में दिया गया पेपर [कैनाइन: प्री-ट्रेनिंग ए एफिशिएंट टोकनाइजेशन-फ्री एनकोडर फॉर लैंग्वेज रिप्रेजेंटेशन]( https://arxiv.org/abs/2103.06874) जोनाथन एच क्लार्क, डैन गैरेट, यूलिया टर्क, जॉन विएटिंग द्वारा। 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. +1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. 
**[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI से) साथ वाला पेपर [लर्निंग ट्रांसफरेबल विजुअल मॉडल फ्रॉम नेचुरल लैंग्वेज सुपरविजन](https://arxiv.org /abs/2103.00020) एलेक रैडफोर्ड, जोंग वूक किम, क्रिस हैलासी, आदित्य रमेश, गेब्रियल गोह, संध्या अग्रवाल, गिरीश शास्त्री, अमांडा एस्केल, पामेला मिश्किन, जैक क्लार्क, ग्रेचेन क्रुएगर, इल्या सुत्स्केवर द्वारा। 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (सेल्सफोर्स से) साथ में पेपर [प्रोग्राम सिंथेसिस के लिए एक संवादात्मक प्रतिमान](https://arxiv.org/abs/2203.13474) एरिक निजकैंप, बो पैंग, हिरोआकी हयाशी, लिफू तू, हुआन वांग, यिंगबो झोउ, सिल्वियो सावरेस, कैमिंग जिओंग रिलीज। @@ -282,7 +283,7 @@ conda install -c huggingface transformers 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (NAVER से) साथ में कागज [OCR-मुक्त डॉक्यूमेंट अंडरस्टैंडिंग ट्रांसफॉर्मर](https://arxiv.org/abs /2111.15664) गीवूक किम, टीकग्यू होंग, मूनबिन यिम, जियोंग्योन नाम, जिनयॉन्ग पार्क, जिनयॉन्ग यिम, वोनसेओक ह्वांग, सांगडू यूं, डोंगयून हान, सेउंग्युन पार्क द्वारा। 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (फेसबुक से) साथ में पेपर [ओपन-डोमेन क्वेश्चन आंसरिंग के लिए डेंस पैसेज रिट्रीवल](https://arxiv. org/abs/2004.04906) व्लादिमीर करपुखिन, बरलास ओज़ुज़, सेवन मिन, पैट्रिक लुईस, लेडेल वू, सर्गेई एडुनोव, डैनकी चेन, और वेन-ताऊ यिह द्वारा। 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (इंटेल लैब्स से) साथ में कागज [विज़न ट्रांसफॉर्मर्स फॉर डेंस प्रेडिक्शन](https://arxiv.org /abs/2103.13413) रेने रैनफ्टल, एलेक्सी बोचकोवस्की, व्लादलेन कोल्टन द्वारा। -1. **[EfficientFormer](https://huggingface.co/docs/transformers/main/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. +1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (Google रिसर्च/स्टैनफोर्ड यूनिवर्सिटी से) साथ में दिया गया पेपर [इलेक्ट्रा: जेनरेटर के बजाय भेदभाव करने वाले के रूप में टेक्स्ट एन्कोडर्स का पूर्व-प्रशिक्षण] (https://arxiv.org/abs/2003.10555) केविन क्लार्क, मिन्ह-थांग लुओंग, क्वोक वी. ले, क्रिस्टोफर डी. मैनिंग द्वारा पोस्ट किया गया। 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (Google रिसर्च से) साथ में दिया गया पेपर [सीक्वेंस जेनरेशन टास्क के लिए प्री-ट्रेंड चेकपॉइंट का इस्तेमाल करना](https:/ /arxiv.org/abs/1907.12461) साशा रोठे, शशि नारायण, अलियाक्सि सेवेरिन द्वारा। 1. 
**[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)**(Baidu से) साथ देने वाला पेपर [ERNIE: एन्हांस्ड रिप्रेजेंटेशन थ्रू नॉलेज इंटीग्रेशन](https://arxiv.org/abs/1904.09223) यू सन, शुओहुआन वांग, युकुन ली, शिकुन फेंग, ज़ुई चेन, हान झांग, शिन तियान, डैनक्सियांग झू, हाओ तियान, हुआ वू द्वारा पोस्ट किया गया। @@ -292,7 +293,7 @@ conda install -c huggingface transformers 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (FLAVA: A फाउंडेशनल लैंग्वेज एंड विजन अलाइनमेंट मॉडल) (https://arxiv) साथ वाला पेपर .org/abs/2112.04482) अमनप्रीत सिंह, रोंगहांग हू, वेदानुज गोस्वामी, गुइल्यूम कुएरॉन, वोज्शिएक गालुबा, मार्कस रोहरबैक, और डौवे कीला द्वारा। 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (गूगल रिसर्च से) साथ वाला पेपर [FNet: मिक्सिंग टोकन विद फूरियर ट्रांसफॉर्म्स](https://arxiv.org /abs/2105.03824) जेम्स ली-थॉर्प, जोशुआ आइंस्ली, इल्या एकस्टीन, सैंटियागो ओंटानन द्वारा। 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (सीएमयू/गूगल ब्रेन से) साथ में कागज [फ़नल-ट्रांसफॉर्मर: कुशल भाषा प्रसंस्करण के लिए अनुक्रमिक अतिरेक को छानना](https://arxiv.org/abs/2006.03236) जिहांग दाई, गुओकुन लाई, यिमिंग यांग, क्वोक वी. ले ​​द्वारा रिहाई। -1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. +1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (KAIST से) साथ वाला पेपर [वर्टिकल कटडेप्थ के साथ मोनोकुलर डेप्थ एस्टीमेशन के लिए ग्लोबल-लोकल पाथ नेटवर्क्स](https:/ /arxiv.org/abs/2201.07436) डोयोन किम, वूंगह्युन गा, प्युंगवान आह, डोंगग्यू जू, सेहवान चुन, जुनमो किम द्वारा। 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (OpenAI से) साथ में दिया गया पेपर [जेनरेटिव प्री-ट्रेनिंग द्वारा भाषा की समझ में सुधार](https://blog .openai.com/language-unsupervised/) एलेक रैडफोर्ड, कार्तिक नरसिम्हन, टिम सालिमन्स और इल्या सुत्स्केवर द्वारा। 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (EleutherAI से) रिपॉजिटरी के साथ [EleutherAI/gpt-neo](https://github.com/ EleutherAI /gpt-neo) रिलीज। सिड ब्लैक, स्टेला बिडरमैन, लियो गाओ, फिल वांग और कॉनर लेही द्वारा पोस्ट किया गया। @@ -300,8 +301,8 @@ conda install -c huggingface transformers 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (अबेजा के जरिए) शिन्या ओटानी, ताकायोशी मकाबे, अनुज अरोड़ा, क्यो हटोरी द्वारा। 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (ओपनएआई से) साथ में पेपर [लैंग्वेज मॉडल्स अनसुपरवाइज्ड मल्टीटास्क लर्नर्स हैं](https://blog.openai.com/better-language-models/) एलेक रैडफोर्ड*, जेफरी वू*, रेवन चाइल्ड, डेविड लुआन, डारियो एमोडी* द्वारा * और इल्या सुत्सकेवर** ने पोस्ट किया। 1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (EleutherAI से) साथ वाला पेपर [kingoflolz/mesh-transformer-jax](https://github. com/kingoflolz/mesh-transformer-jax/) बेन वांग और अरन कोमात्सुजाकी द्वारा। -1. 
-1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
-1. **[Graphormer](https://huggingface.co/docs/transformers/main/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
+1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
+1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA से) साथ में कागज [GroupViT: टेक्स्ट सुपरविजन से सिमेंटिक सेगमेंटेशन इमर्जेस](https://arxiv.org/abs/2202.11094) जियारुई जू, शालिनी डी मेलो, सिफ़ी लियू, वोनमिन बायन, थॉमस ब्रेउएल, जान कौट्ज़, ज़ियाओलोंग वांग द्वारा।
1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (फेसबुक से) साथ में पेपर [ह्यूबर्ट: सेल्फ सुपरवाइज्ड स्पीच रिप्रेजेंटेशन लर्निंग बाय मास्क्ड प्रेडिक्शन ऑफ हिडन यूनिट्स](https://arxiv.org/abs/2106.07447) वेई-निंग सू, बेंजामिन बोल्टे, याओ-हंग ह्यूबर्ट त्साई, कुशाल लखोटिया, रुस्लान सालाखुतदीनोव, अब्देलरहमान मोहम्मद द्वारा।
1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (बर्कले से) साथ में कागज [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) सेहून किम, अमीर घोलमी, ज़ेवेई याओ, माइकल डब्ल्यू महोनी, कर्ट केटज़र द्वारा।
@@ -322,7 +323,7 @@ conda install -c huggingface transformers
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (फेसबुक से) साथ देने वाला पेपर [बियॉन्ड इंग्लिश-सेंट्रिक मल्टीलिंगुअल मशीन ट्रांसलेशन](https://arxiv.org/abs/2010.11125) एंजेला फैन, श्रुति भोसले, होल्गर श्वेन्क, झी मा, अहमद अल-किश्की, सिद्धार्थ गोयल, मनदीप बैनेस, ओनूर सेलेबी, गुइल्लाम वेन्जेक, विश्रव चौधरी, नमन गोयल, टॉम बर्च, विटाली लिपचिंस्की, सर्गेई एडुनोव, एडौर्ड ग्रेव, माइकल औली, आर्मंड जौलिन द्वारा पोस्ट किया गया।
1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Jörg टाइडेमैन द्वारा [OPUS](http://opus.nlpl.eu/) डेटा से प्रशिक्षित मशीनी अनुवाद मॉडल पोस्ट किया गया। [मैरियन फ्रेमवर्क](https://marian-nmt.github.io/) माइक्रोसॉफ्ट ट्रांसलेटर टीम द्वारा विकसित।
1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (माइक्रोसॉफ्ट रिसर्च एशिया से) साथ में पेपर [मार्कअपएलएम: विजुअली-रिच डॉक्यूमेंट अंडरस्टैंडिंग के लिए टेक्स्ट और मार्कअप लैंग्वेज का प्री-ट्रेनिंग](https://arxiv.org/abs/2110.08518) जुनलॉन्ग ली, यिहेंग जू, लेई कुई, फुरु वी द्वारा पोस्ट किया गया।
-1. **[Mask2Former](https://huggingface.co/docs/transformers/main/model_doc/mask2former)** (FAIR and UIUC से) Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. द्वारा अनुसंधान पत्र [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) के साथ जारी किया गया
+1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (FAIR and UIUC से) Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. द्वारा अनुसंधान पत्र [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) के साथ जारी किया गया
1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (मेटा और UIUC से) पेपर के साथ जारी किया गया [प्रति-पिक्सेल वर्गीकरण वह सब नहीं है जिसकी आपको सिमेंटिक सेगमेंटेशन की आवश्यकता है](https://arxiv.org/abs/2107.06278) बोवेन चेंग, अलेक्जेंडर जी. श्विंग, अलेक्जेंडर किरिलोव द्वारा
1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (फेसबुक से) साथ में पेपर [न्यूरल मशीन ट्रांसलेशन के लिए मल्टीलिंगुअल डीनोइजिंग प्री-ट्रेनिंग](https://arxiv.org/abs/2001.08210) यिनहान लियू, जियाताओ गु, नमन गोयल, जियान ली, सर्गेई एडुनोव, मार्जन ग़ज़विनिनेजाद, माइक लुईस, ल्यूक ज़ेटलमॉयर द्वारा।
1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (फेसबुक से) साथ में पेपर [एक्स्टेंसिबल बहुभाषी प्रीट्रेनिंग और फाइनट्यूनिंग के साथ बहुभाषी अनुवाद](https://arxiv.org/abs/2008.00401) युकिंग टैंग, चाउ ट्रान, जियान ली, पेंग-जेन चेन, नमन गोयल, विश्रव चौधरी, जियाताओ गु, एंजेला फैन द्वारा।
@@ -340,7 +341,7 @@ conda install -c huggingface transformers
1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (हुआवेई नूह के आर्क लैब से) साथ में कागज़ [NEZHA: चीनी भाषा समझ के लिए तंत्रिका प्रासंगिक प्रतिनिधित्व](https://arxiv.org/abs/1909.00204) जुन्किउ वेई, ज़ियाओज़े रेन, ज़िआओगुआंग ली, वेनयोंग हुआंग, यी लियाओ, याशेंग वांग, जियाशू लिन, शिन जियांग, जिओ चेन और कुन लियू द्वारा।
1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (फ्रॉम मेटा) साथ में पेपर [नो लैंग्वेज लेफ्ट बिहाइंड: स्केलिंग ह्यूमन-सेंटेड मशीन ट्रांसलेशन](https://arxiv.org/abs/2207.04672) एनएलएलबी टीम द्वारा प्रकाशित।
1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (विस्कॉन्सिन विश्वविद्यालय - मैडिसन से) साथ में कागज [Nyströmformer: A Nyström-आधारित एल्गोरिथम आत्म-ध्यान का अनुमान लगाने के लिए](https://arxiv.org/abs/2102.03902) युनयांग ज़िओंग, झानपेंग ज़ेंग, रुद्रसिस चक्रवर्ती, मिंगक्सिंग टैन, ग्लेन फंग, यिन ली, विकास सिंह द्वारा पोस्ट किया गया।
-1. **[OneFormer](https://huggingface.co/docs/transformers/main/model_doc/oneformer)** (SHI Labs से) पेपर [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) जितेश जैन, जिआचेन ली, मांगटिक चिउ, अली हसनी, निकिता ओरलोव, हम्फ्री शि के द्वारा जारी किया गया है।
+1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (SHI Labs से) पेपर [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) जितेश जैन, जिआचेन ली, मांगटिक चिउ, अली हसनी, निकिता ओरलोव, हम्फ्री शि के द्वारा जारी किया गया है।
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI से) साथ में कागज [विज़न ट्रांसफॉर्मर्स के साथ सिंपल ओपन-वोकैबुलरी ऑब्जेक्ट डिटेक्शन](https://arxiv.org/abs/2205.06230) मैथियास मिंडरर, एलेक्सी ग्रिट्सेंको, ऑस्टिन स्टोन, मैक्सिम न्यूमैन, डिर्क वीसेनबोर्न, एलेक्सी डोसोवित्स्की, अरविंद महेंद्रन, अनुराग अर्नब, मुस्तफा देहघानी, ज़ुओरन शेन, जिओ वांग, ज़ियाओहुआ झाई, थॉमस किफ़, और नील हॉल्सबी द्वारा पोस्ट किया गया।
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
@@ -358,8 +359,8 @@ conda install -c huggingface transformers
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (गूगल रिसर्च से) साथ वाला पेपर [पूर्व-प्रशिक्षित भाषा मॉडल में एम्बेडिंग कपलिंग पर पुनर्विचार](https://arxiv.org/pdf/2010.12821.pdf) ह्युंग वोन चुंग, थिबॉल्ट फ़ेवरी, हेनरी त्साई, एम. जॉनसन, सेबेस्टियन रुडर द्वारा।
1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (माइक्रोसॉफ्ट रिसर्च से) [डीप रेसिडुअल लर्निंग फॉर इमेज रिकग्निशन](https://arxiv.org/abs/1512.03385) कैमिंग हे, जियांग्यु झांग, शाओकिंग रेन, जियान सन द्वारा।
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (फेसबुक से), साथ में कागज [मजबूत रूप से अनुकूलित BERT प्रीट्रेनिंग दृष्टिकोण](https://arxiv.org/abs/1907.11692) यिनहान लियू, मायल ओट, नमन गोयल, जिंगफेई डू, मंदार जोशी, डैनकी चेन, ओमर लेवी, माइक लुईस, ल्यूक ज़ेटलमॉयर, वेसेलिन स्टोयानोव द्वारा।
-1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/main/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
-1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou.
+1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
+1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou.
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (झुईई टेक्नोलॉजी से), साथ में पेपर [रोफॉर्मर: रोटरी पोजिशन एंबेडिंग के साथ एन्हांस्ड ट्रांसफॉर्मर](https://arxiv.org/pdf/2104.09864v1.pdf) जियानलिन सु और यू लू और शेंगफेंग पैन और बो वेन और युनफेंग लियू द्वारा प्रकाशित।
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP से) साथ देने वाला पेपर [भाषण पहचान के लिए अनसुपरवाइज्ड प्री-ट्रेनिंग में परफॉर्मेंस-एफिशिएंसी ट्रेड-ऑफ्स](https://arxiv.org/abs/2109.06870) फेलिक्स वू, क्वांगयुन किम, जिंग पैन, क्यू हान, किलियन क्यू. वेनबर्गर, योव आर्टज़ी द्वारा।
@@ -370,28 +371,28 @@ conda install -c huggingface transformers
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (बर्कले से) कागज के साथ [SqueezeBERT: कुशल तंत्रिका नेटवर्क के बारे में NLP को कंप्यूटर विज़न क्या सिखा सकता है?](https://arxiv.org/abs/2006.11316) फॉरेस्ट एन. इनडोला, अल्बर्ट ई. शॉ, रवि कृष्णा, और कर्ट डब्ल्यू. केटज़र द्वारा।
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (माइक्रोसॉफ्ट से) साथ में कागज [स्वाइन ट्रांसफॉर्मर: शिफ्टेड विंडोज का उपयोग कर पदानुक्रमित विजन ट्रांसफॉर्मर](https://arxiv.org/abs/2103.14030) ज़ी लियू, युटोंग लिन, यू काओ, हान हू, यिक्सुआन वेई, झेंग झांग, स्टीफन लिन, बैनिंग गुओ द्वारा।
1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (Microsoft से) साथ वाला पेपर [Swin Transformer V2: स्केलिंग अप कैपेसिटी एंड रेजोल्यूशन](https://arxiv.org/abs/2111.09883) ज़ी लियू, हान हू, युटोंग लिन, ज़ुलिआंग याओ, ज़ेंडा ज़ी, यिक्सुआन वेई, जिया निंग, यू काओ, झेंग झांग, ली डोंग, फुरु वेई, बैनिंग गुओ द्वारा।
-1. **[Swin2SR](https://huggingface.co/docs/transformers/main/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
-1. **[SwitchTransformers](https://huggingface.co/docs/transformers/main/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
+1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
+1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (Google AI से) कॉलिन रैफेल और नोम शज़ीर और एडम रॉबर्ट्स और कैथरीन ली और शरण नारंग और माइकल मटेना द्वारा साथ में पेपर [एक एकीकृत टेक्स्ट-टू-टेक्स्ट ट्रांसफॉर्मर के साथ स्थानांतरण सीखने की सीमा की खोज](https://arxiv.org/abs/1910.10683) और यांकी झोउ और वेई ली और पीटर जे लियू।
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (Google AI से) साथ वाला पेपर [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) कॉलिन रैफेल और नोम शज़ीर और एडम रॉबर्ट्स और कैथरीन ली और शरण नारंग द्वारा और माइकल मटेना और यांकी झोउ और वेई ली और पीटर जे लियू।
1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (माइक्रोसॉफ्ट रिसर्च से) साथ में पेपर [पबटेबल्स-1एम: टूवर्ड्स कॉम्प्रिहेंसिव टेबल एक्सट्रैक्शन फ्रॉम अनस्ट्रक्चर्ड डॉक्यूमेंट्स](https://arxiv.org/abs/2110.00061) ब्रैंडन स्मॉक, रोहित पेसाला, रॉबिन अब्राहम द्वारा पोस्ट किया गया।
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (Google AI से) साथ में कागज [TAPAS: पूर्व-प्रशिक्षण के माध्यम से कमजोर पर्यवेक्षण तालिका पार्सिंग](https://arxiv.org/abs/2004.02349) जोनाथन हर्ज़िग, पावेल क्रिज़िस्तोफ़ नोवाक, थॉमस मुलर, फ्रांसेस्को पिकिन्नो और जूलियन मार्टिन ईसेन्च्लोस द्वारा।
1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (माइक्रोसॉफ्ट रिसर्च से) साथ में पेपर [TAPEX: टेबल प्री-ट्रेनिंग थ्रू लर्निंग अ न्यूरल SQL एक्ज़ीक्यूटर](https://arxiv.org/abs/2107.07653) कियान लियू, बेई चेन, जियाकी गुओ, मोर्टेज़ा ज़ियादी, ज़ेकी लिन, वीज़ू चेन, जियान-गुआंग लू द्वारा पोस्ट किया गया।
1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace).
-1. **[TimeSformer](https://huggingface.co/docs/transformers/main/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
+1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (Google/CMU की ओर से) कागज के साथ [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) जिहांग दाई*, ज़िलिन यांग*, यिमिंग यांग, जैमे कार्बोनेल, क्वोक वी. ले, रुस्लान सालाखुतदीनोव द्वारा।
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (माइक्रोसॉफ्ट रिसर्च से) साथ में दिया गया पेपर [UniSpeech: यूनिफाइड स्पीच रिप्रेजेंटेशन लर्निंग विद लेबलेड एंड अनलेबल्ड डेटा](https://arxiv.org/abs/2101.07597) चेंगई वांग, यू वू, याओ कियान, केनिची कुमातानी, शुजी लियू, फुरु वेई, माइकल ज़ेंग, ज़ुएदोंग हुआंग द्वारा।
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (माइक्रोसॉफ्ट रिसर्च से) कागज के साथ [UNISPEECH-SAT: यूनिवर्सल स्पीच रिप्रेजेंटेशन लर्निंग विद स्पीकर अवेयर प्री-ट्रेनिंग](https://arxiv.org/abs/2110.05752) सानयुआन चेन, यू वू, चेंग्यी वांग, झेंगयांग चेन, झूओ चेन, शुजी लियू, जियान वू, याओ कियान, फुरु वेई, जिन्यु ली, जियांगज़ान यू द्वारा पोस्ट किया गया।
-1. **[UPerNet](https://huggingface.co/docs/transformers/main/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
+1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (सिंघुआ यूनिवर्सिटी और ननकाई यूनिवर्सिटी से) साथ में पेपर [विजुअल अटेंशन नेटवर्क](https://arxiv.org/pdf/2202.09741.pdf) मेंग-हाओ गुओ, चेंग-ज़े लू, झेंग-निंग लियू, मिंग-मिंग चेंग, शि-मिन हू द्वारा।
1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (मल्टीमीडिया कम्प्यूटिंग ग्रुप, नानजिंग यूनिवर्सिटी से) साथ में पेपर [वीडियोएमएई: मास्क्ड ऑटोएन्कोडर स्व-पर्यवेक्षित वीडियो प्री-ट्रेनिंग के लिए डेटा-कुशल सीखने वाले हैं](https://arxiv.org/abs/2203.12602) ज़ान टोंग, यिबिंग सॉन्ग, जुए वांग, लिमिन वांग द्वारा पोस्ट किया गया।
1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (NAVER AI Lab/Kakao Enterprise/Kakao Brain से) साथ में कागज [ViLT: Vision-and-Language Transformer बिना कनवल्शन या रीजन सुपरविजन](https://arxiv.org/abs/2102.03334) वोनजे किम, बोक्यूंग सोन, इल्डू किम द्वारा पोस्ट किया गया।
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (गूगल एआई से) कागज के साथ [एक इमेज इज़ वर्थ 16x16 वर्ड्स: ट्रांसफॉर्मर्स फॉर इमेज रिकॉग्निशन एट स्केल](https://arxiv.org/abs/2010.11929) एलेक्सी डोसोवित्स्की, लुकास बेयर, अलेक्जेंडर कोलेसनिकोव, डिर्क वीसेनबोर्न, शियाओहुआ झाई, थॉमस अनटरथिनर, मुस्तफा देहघानी, मैथियास मिंडरर, जॉर्ज हेगोल्ड, सिल्वेन गेली, जैकब उस्ज़कोरेइट, नील हॉल्सबी द्वारा पोस्ट किया गया।
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (UCLA NLP से) साथ वाला पेपर [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) लियुनियन हेरोल्ड ली, मार्क यात्स्कर, दा यिन, चो-जुई हसीह, काई-वेई चांग द्वारा।
-1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
+1.
**[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (मेटा एआई से) साथ में कागज [मास्कड ऑटोएन्कोडर स्केलेबल विजन लर्नर्स हैं](https://arxiv.org/ एब्स/2111.06377) कैमिंग हे, ज़िनेली चेन, सेनिंग ज़ी, यांगहो ली, पिओट्र डॉलर, रॉस गिर्शिक द्वारा। 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (मेटा एआई से) साथ में कागज [लेबल-कुशल सीखने के लिए मास्क्ड स्याम देश के नेटवर्क](https://arxiv. org/abs/2204.07141) महमूद असरान, मथिल्डे कैरन, ईशान मिश्रा, पियोट्र बोजानोवस्की, फ्लोरियन बोर्डेस, पास्कल विंसेंट, आर्मंड जौलिन, माइकल रब्बत, निकोलस बल्लास द्वारा। 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (फेसबुक एआई से) साथ में पेपर [wav2vec 2.0: ए फ्रेमवर्क फॉर सेल्फ-सुपरवाइज्ड लर्निंग ऑफ स्पीच रिप्रेजेंटेशन] (https://arxiv.org/abs/2006.11477) एलेक्सी बेवस्की, हेनरी झोउ, अब्देलरहमान मोहम्मद, माइकल औली द्वारा। diff --git a/README_ja.md b/README_ja.md index 15a56e7c701a..5008afbb843d 100644 --- a/README_ja.md +++ b/README_ja.md @@ -298,7 +298,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 🤗Transformersは現在、以下のアーキテクチャを提供しています(それぞれのハイレベルな要約は[こちら](https://huggingface.co/docs/transformers/model_summary)を参照してください): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (Google Research and the Toyota Technological Institute at Chicago から) Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut から公開された研究論文: [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942) -1. **[AltCLIP](https://huggingface.co/docs/transformers/main/model_doc/altclip)** (BAAI から) Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell から公開された研究論文: [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) +1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (BAAI から) Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell から公開された研究論文: [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) 1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (MIT から) Yuan Gong, Yu-An Chung, James Glass から公開された研究論文: [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (Facebook から) Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer から公開された研究論文: [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (École polytechnique から) Moussa Kamal Eddine, Antoine J.-P. 
Tixier, Michalis Vazirgiannis から公開された研究論文: [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) @@ -309,11 +309,11 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (VinAI Research から) Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen から公開された研究論文: [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (Google Research から) Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed から公開された研究論文: [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (Google Research から) Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed から公開された研究論文: [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) -1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (Microsoft Research AI4Science から) Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu から公開された研究論文: [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (Google AI から) Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil から公開された研究論文: [Big Transfer (BiT)](https://arxiv.org/abs/1912.11370)Houlsby. +1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (Microsoft Research AI4Science から) Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu から公開された研究論文: [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) +1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (Google AI から) Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil から公開された研究論文: [Big Transfer (BiT)](https://arxiv.org/abs/1912.11370)Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (Facebook から) Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston から公開された研究論文: [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (Facebook から) Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston から公開された研究論文: [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) -1. 
**[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (Salesforce から) Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi から公開された研究論文: [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) +1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (Salesforce から) Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi から公開された研究論文: [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (BigScience workshop から) [BigScience Workshop](https://bigscience.huggingface.co/) から公開されました. 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (Alexa から) Adrian de Wynter and Daniel J. Perry から公開された研究論文: [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) 1. **[BridgeTower](https://huggingface.co/docs/transformers/main/model_doc/bridgetower)** (Harbin Institute of Technology/Microsoft Research Asia/Intel Labs から) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. @@ -321,6 +321,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne から) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot から公開された研究論文: [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research から) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting から公開された研究論文: [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys から) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou から公開された研究論文: [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) +1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI から) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever から公開された研究論文: [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen から) Timo Lüddecke and Alexander Ecker から公開された研究論文: [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (Salesforce から) Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong から公開された研究論文: [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) @@ -344,7 +345,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. 
**[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (NAVER から), Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park から公開された研究論文: [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (Facebook から) Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih から公開された研究論文: [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (Intel Labs から) René Ranftl, Alexey Bochkovskiy, Vladlen Koltun から公開された研究論文: [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) -1. **[EfficientFormer](https://huggingface.co/docs/transformers/main/model_doc/efficientformer)** (Snap Research から) Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. から公開された研究論文 [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) +1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (Snap Research から) Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. から公開された研究論文 [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (Google Research/Stanford University から) Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning から公開された研究論文: [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (Google Research から) Sascha Rothe, Shashi Narayan, Aliaksei Severyn から公開された研究論文: [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (Baidu から) Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu から公開された研究論文: [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) @@ -354,7 +355,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (Facebook AI から) Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela から公開された研究論文: [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (Google Research から) James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon から公開された研究論文: [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (CMU/Google Brain から) Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le から公開された研究論文: [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) -1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (Microsoft Research から) Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. 
から公開された研究論文 [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) +1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (Microsoft Research から) Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. から公開された研究論文 [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (KAIST から) Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim から公開された研究論文: [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (OpenAI から) Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever から公開された研究論文: [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (EleutherAI から) Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy から公開されたレポジトリー : [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) @@ -362,8 +363,8 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (ABEJA から) Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori からリリース. 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (OpenAI から) Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** から公開された研究論文: [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) 1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (EleutherAI から) Ben Wang and Aran Komatsuzaki から公開されたレポジトリー [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) -1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (AI-Sweden から) Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren から公開された研究論文: [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) -1. **[Graphormer](https://huggingface.co/docs/transformers/main/model_doc/graphormer)** (Microsoft から) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu から公開された研究論文: [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234). +1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (AI-Sweden から) Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren から公開された研究論文: [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) +1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (Microsoft から) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu から公開された研究論文: [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234). 1. 
**[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA から) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang から公開された研究論文: [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook から) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed から公開された研究論文: [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley から) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer から公開された研究論文: [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) @@ -384,7 +385,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (Facebook から) Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin から公開された研究論文: [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Jörg Tiedemann から. [OPUS](http://opus.nlpl.eu/) を使いながら学習された "Machine translation" (マシントランスレーション) モデル. [Marian Framework](https://marian-nmt.github.io/) はMicrosoft Translator Team が現在開発中です. 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (Microsoft Research Asia から) Junlong Li, Yiheng Xu, Lei Cui, Furu Wei から公開された研究論文: [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) -1. **[Mask2Former](https://huggingface.co/docs/transformers/main/model_doc/mask2former)** (FAIR and UIUC から) Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. から公開された研究論文 [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) +1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (FAIR and UIUC から) Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. から公開された研究論文 [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) 1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (Meta and UIUC から) Bowen Cheng, Alexander G. Schwing, Alexander Kirillov から公開された研究論文: [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (Facebook から) Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer から公開された研究論文: [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (Facebook から) Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan から公開された研究論文: [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) @@ -402,7 +403,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. 
**[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (Huawei Noah’s Ark Lab から) Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu から公開された研究論文: [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (Meta から) the NLLB team から公開された研究論文: [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (the University of Wisconsin - Madison から) Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh から公開された研究論文: [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) -1. **[OneFormer](https://huggingface.co/docs/transformers/main/model_doc/oneformer)** (SHI Labs から) Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi から公開された研究論文: [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) +1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (SHI Labs から) Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi から公開された研究論文: [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI から) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al から公開された研究論文: [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI から) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby から公開された研究論文: [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google から) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu から公開された研究論文: [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) @@ -420,8 +421,8 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (Google Research から) Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder から公開された研究論文: [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (Microsoft Research から) Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun から公開された研究論文: [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (Facebook から), Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov から公開された研究論文: [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) -1. 
**[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/main/model_doc/roberta-prelayernorm)** (Facebook から) Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli から公開された研究論文: [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) -1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (WeChatAI から) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou から公開された研究論文: [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) +1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (Facebook から) Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli から公開された研究論文: [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) +1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (WeChatAI から) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou から公開された研究論文: [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (ZhuiyiTechnology から), Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu から公開された研究論文: [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA から) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo から公開された研究論文: [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP から) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi から公開された研究論文: [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) @@ -432,28 +433,28 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (Berkeley から) Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer から公開された研究論文: [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (Microsoft から) Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo から公開された研究論文: [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (Microsoft から) Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo から公開された研究論文: [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) -1. **[Swin2SR](https://huggingface.co/docs/transformers/main/model_doc/swin2sr)** (University of Würzburg から) Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte から公開された研究論文: [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) -1. 
**[SwitchTransformers](https://huggingface.co/docs/transformers/main/model_doc/switch_transformers)** (Google から) William Fedus, Barret Zoph, Noam Shazeer から公開された研究論文: [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) +1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (University of Würzburg から) Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte から公開された研究論文: [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) +1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (Google から) William Fedus, Barret Zoph, Noam Shazeer から公開された研究論文: [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (Google AI から) Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu から公開された研究論文: [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (Google AI から) Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu から公開されたレポジトリー [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (Microsoft Research から) Brandon Smock, Rohith Pesala, Robin Abraham から公開された研究論文: [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (Google AI から) Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos から公開された研究論文: [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (Microsoft Research から) Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou から公開された研究論文: [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (HuggingFace から). -1. **[TimeSformer](https://huggingface.co/docs/transformers/main/model_doc/timesformer)** (Facebook から) Gedas Bertasius, Heng Wang, Lorenzo Torresani から公開された研究論文: [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) +1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (Facebook から) Gedas Bertasius, Heng Wang, Lorenzo Torresani から公開された研究論文: [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (the University of California at Berkeley から) Michael Janner, Qiyang Li, Sergey Levine から公開された研究論文: [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) 1. 
**[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (Google/CMU から) Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov から公開された研究論文: [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (Microsoft から), Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei から公開された研究論文: [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (Google Research から) Yi Tay, Mostafa Dehghani, Vinh Q から公開された研究論文: [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (Microsoft Research から) Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang から公開された研究論文: [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (Microsoft Research から) Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu から公開された研究論文: [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) -1. **[UPerNet](https://huggingface.co/docs/transformers/main/model_doc/upernet)** (Peking University から) Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun. から公開された研究論文 [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) +1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (Peking University から) Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun. から公開された研究論文 [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (Tsinghua University and Nankai University から) Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu から公開された研究論文: [Visual Attention Network](https://arxiv.org/abs/2202.09741) 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (Multimedia Computing Group, Nanjing University から) Zhan Tong, Yibing Song, Jue Wang, Limin Wang から公開された研究論文: [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (NAVER AI Lab/Kakao Enterprise/Kakao Brain から) Wonjae Kim, Bokyung Son, Ildoo Kim から公開された研究論文: [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (Google AI から) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby から公開された研究論文: [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 1. 
**[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (UCLA NLP から) Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang から公開された研究論文: [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) -1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (Google AI から) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby から公開された研究論文: [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) +1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (Google AI から) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby から公開された研究論文: [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (Meta AI から) Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick から公開された研究論文: [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (Meta AI から) Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas から公開された研究論文: [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (Facebook AI から) Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli から公開された研究論文: [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) diff --git a/README_ko.md b/README_ko.md index be6b566969b0..ffeef5670a17 100644 --- a/README_ko.md +++ b/README_ko.md @@ -213,7 +213,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 🤗 Transformers는 다음 모델들을 제공합니다 (각 모델의 요약은 [여기](https://huggingface.co/docs/transformers/model_summary)서 확인하세요): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[AltCLIP](https://huggingface.co/docs/transformers/main/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. +1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. 1. 
**[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. @@ -224,11 +224,11 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. +1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. 
**[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. **[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (Alexa 에서) Adrian de Wynter and Daniel J. Perry 의 [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) 논문과 함께 발표했습니다. 1. **[BridgeTower](https://huggingface.co/docs/transformers/main/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. @@ -236,6 +236,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne 에서) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 의 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 논문과 함께 발표했습니다. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research 에서) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 의 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 논문과 함께 발표했습니다. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys 에서) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 의 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 논문과 함께 발표했습니다. +1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. 
**[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI 에서) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 의 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 논문과 함께 발표했습니다. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen 에서) Timo Lüddecke and Alexander Ecker 의 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 논문과 함께 발표했습니다. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (Salesforce 에서) Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 의 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 논문과 함께 발표했습니다. @@ -259,7 +260,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (NAVER 에서) Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park 의 [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) 논문과 함께 발표했습니다. 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (Facebook 에서) Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih 의 [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 논문과 함께 발표했습니다. 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (Intel Labs 에서) René Ranftl, Alexey Bochkovskiy, Vladlen Koltun 의 [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) 논문과 함께 발표했습니다. -1. **[EfficientFormer](https://huggingface.co/docs/transformers/main/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. +1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (Google Research/Stanford University 에서) Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning 의 [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) 논문과 함께 발표했습니다. 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (Google Research 에서) Sascha Rothe, Shashi Narayan, Aliaksei Severyn 의 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 논문과 함께 발표했습니다. 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (Baidu 에서) Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu 의 [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) 논문과 함께 발표했습니다. @@ -269,7 +270,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. 
**[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. +1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. @@ -277,8 +278,8 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori. 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (OpenAI 에서) Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** 의 [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) 논문과 함께 발표했습니다. 1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. -1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (AI-Sweden 에서) Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. 
의 [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) 논문과 함께 발표했습니다. -1. **[Graphormer](https://huggingface.co/docs/transformers/main/model_doc/graphormer)** (from Microsoft) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu 의 [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) 논문과 함께 발표했습니다. +1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (AI-Sweden 에서) Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. 의 [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) 논문과 함께 발표했습니다. +1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu 의 [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) 논문과 함께 발표했습니다. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA 에서) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 의 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 논문과 함께 발표했습니다. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook 에서) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 의 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 논문과 함께 발표했습니다. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley 에서) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 의 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 논문과 함께 발표했습니다. @@ -299,7 +300,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (Facebook 에서) Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin 의 [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) 논문과 함께 발표했습니다. 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (Microsoft Research Asia 에서) Junlong Li, Yiheng Xu, Lei Cui, Furu Wei 의 [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) 논문과 함께 발표했습니다. -1. **[Mask2Former](https://huggingface.co/docs/transformers/main/model_doc/mask2former)** (FAIR and UIUC 에서 제공)은 Bowen Cheng, Ishan Misra, Alexander G. 
Schwing, Alexander Kirillov, Rohit Girdhar.의 [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527)논문과 함께 발표했습니다. +1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (FAIR and UIUC 에서 제공)은 Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.의 [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527)논문과 함께 발표했습니다. 1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (Meta and UIUC 에서) Bowen Cheng, Alexander G. Schwing, Alexander Kirillov 의 [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) 논문과 함께 발표했습니다. 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (Facebook 에서) Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer 의 [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) 논문과 함께 발표했습니다. 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (Facebook 에서) Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan 의 [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) 논문과 함께 발표했습니다. @@ -317,7 +318,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (Huawei Noah’s Ark Lab 에서) Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu 의 [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) 논문과 함께 발표했습니다. 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (Meta 에서) the NLLB team 의 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) 논문과 함께 발표했습니다. 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (the University of Wisconsin - Madison 에서) Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh 의 [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) 논문과 함께 발표했습니다. -1. **[OneFormer](https://huggingface.co/docs/transformers/main/model_doc/oneformer)** (SHI Labs 에서) Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi 의 [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) 논문과 함께 발표했습니다. +1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (SHI Labs 에서) Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi 의 [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) 논문과 함께 발표했습니다. 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI 에서) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 의 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 논문과 함께 발표했습니다. 1. 
**[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI 에서) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 의 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 논문과 함께 발표했습니다. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google 에서) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 의 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 논문과 함께 발표했습니다. @@ -335,8 +336,8 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (Google Research 에서) Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 의 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) 논문과 함께 발표했습니다. 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (Microsoft Research 에서) Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 의 [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) 논문과 함께 발표했습니다. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (Facebook 에서) Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov 의 a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) 논문과 함께 발표했습니다. -1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/main/model_doc/roberta-prelayernorm)** (Facebook 에서) Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli 의 [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) 논문과 함께 발표했습니다. -1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (WeChatAI 에서) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 의 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 논문과 함께 발표했습니다. +1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (Facebook 에서) Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli 의 [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) 논문과 함께 발표했습니다. +1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (WeChatAI 에서) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 의 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 논문과 함께 발표했습니다. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (ZhuiyiTechnology 에서) Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 의 a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 논문과 함께 발표했습니다. 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA 에서) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo 의 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 논문과 함께 발표했습니다. 1. 
**[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP 에서) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 의 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 논문과 함께 발표했습니다. @@ -347,28 +348,28 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (Berkeley 에서) Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 의 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 논문과 함께 발표했습니다. 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (Microsoft 에서) Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo 의 [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) 논문과 함께 발표했습니다. 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (Microsoft 에서) Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo 의 [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) 논문과 함께 발표했습니다. -1. **[Swin2SR](https://huggingface.co/docs/transformers/main/model_doc/swin2sr)** (University of Würzburg 에서) Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte 의 [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) 논문과 함께 발표했습니다. -1. **[SwitchTransformers](https://huggingface.co/docs/transformers/main/model_doc/switch_transformers)** (Google 에서) William Fedus, Barret Zoph, Noam Shazeer. 의 [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) 논문과 함께 발표했습니다. +1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (University of Würzburg 에서) Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte 의 [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) 논문과 함께 발표했습니다. +1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (Google 에서) William Fedus, Barret Zoph, Noam Shazeer. 의 [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) 논문과 함께 발표했습니다. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (Google AI 에서) Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 의 [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 논문과 함께 발표했습니다. 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. 
**[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (Microsoft Research 에서) Brandon Smock, Rohith Pesala, Robin Abraham 의 [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) 논문과 함께 발표했습니다. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (Google AI 에서) Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 의 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 논문과 함께 발표했습니다. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (Microsoft Research 에서) Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou 의 [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 논문과 함께 발표했습니다. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). -1. **[TimeSformer](https://huggingface.co/docs/transformers/main/model_doc/timesformer)** (Facebook 에서) Gedas Bertasius, Heng Wang, Lorenzo Torresani 의 [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) 논문과 함께 발표했습니다. +1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (Facebook 에서) Gedas Bertasius, Heng Wang, Lorenzo Torresani 의 [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) 논문과 함께 발표했습니다. 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (the University of California at Berkeley 에서) Michael Janner, Qiyang Li, Sergey Levin 의 [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) 논문과 함께 발표했습니다. 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (Google/CMU 에서) Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 의 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 논문과 함께 발표했습니다. 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (Microsoft 에서) Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei 의 [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 논문과 함께 발표했습니다. 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (Google Research 에서) Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzle 의 [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) 논문과 함께 발표했습니다. 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (Microsoft Research 에서) Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang 의 [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 논문과 함께 발표했습니다. 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (Microsoft Research 에서) Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu 의 [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) 논문과 함께 발표했습니다. -1. 
**[UPerNet](https://huggingface.co/docs/transformers/main/model_doc/upernet)** (Peking University 에서 제공)은 Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.의 [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221)논문과 함께 발표했습니다. +1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (Peking University 에서 제공)은 Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.의 [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221)논문과 함께 발표했습니다. 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (Tsinghua University and Nankai University 에서) Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu 의 [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) 논문과 함께 발표했습니다. 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (Multimedia Computing Group, Nanjing University 에서) Zhan Tong, Yibing Song, Jue Wang, Limin Wang 의 [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) 논문과 함께 발표했습니다. 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (NAVER AI Lab/Kakao Enterprise/Kakao Brain 에서) Wonjae Kim, Bokyung Son, Ildoo Kim 의 [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) 논문과 함께 발표했습니다. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (Google AI 에서) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 의 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 논문과 함께 발표했습니다. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (UCLA NLP 에서) Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang 의 [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) 논문과 함께 발표했습니다. -1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (Google AI 에서) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 의 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 논문과 함께 발표했습니다. +1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (Google AI 에서) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 의 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 논문과 함께 발표했습니다. 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (Meta AI 에서) Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick 의 [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) 논문과 함께 발표했습니다. 1. 
**[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (Meta AI 에서) Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas 의 [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) 논문과 함께 발표했습니다. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (Facebook AI 에서) Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli 의 [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index 927b59890f27..41fa785a1807 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -237,7 +237,7 @@ conda install -c huggingface transformers 🤗 Transformers 目前支持如下的架构(模型概述请阅[这里](https://huggingface.co/docs/transformers/model_summary)): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (来自 Google Research and the Toyota Technological Institute at Chicago) 伴随论文 [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), 由 Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut 发布。 -1. **[AltCLIP](https://huggingface.co/docs/transformers/main/model_doc/altclip)** (来自 BAAI) 伴随论文 [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) 由 Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell 发布。 +1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (来自 BAAI) 伴随论文 [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) 由 Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell 发布。 1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (来自 MIT) 伴随论文 [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) 由 Yuan Gong, Yu-An Chung, James Glass 发布。 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (来自 Facebook) 伴随论文 [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) 由 Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer 发布。 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (来自 École polytechnique) 伴随论文 [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) 由 Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis 发布。 @@ -248,11 +248,11 @@ conda install -c huggingface transformers 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (来自 VinAI Research) 伴随论文 [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) 由 Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen 发布。 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。 1. 
**[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。 -1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (来自 Microsoft Research AI4Science) 伴随论文 [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) 由 Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu 发布。 -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (来自 Google AI) 伴随论文 [Big Transfer (BiT) 由 Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby 发布。 +1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (来自 Microsoft Research AI4Science) 伴随论文 [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) 由 Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu 发布。 +1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (来自 Google AI) 伴随论文 [Big Transfer (BiT) 由 Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby 发布。 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。 -1. **[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (来自 Salesforce) 伴随论文 [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) 由 Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi 发布。 +1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (来自 Salesforce) 伴随论文 [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) 由 Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi 发布。 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (来自 Alexa) 伴随论文 [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) 由 Adrian de Wynter and Daniel J. Perry 发布。 1. 
**[BridgeTower](https://huggingface.co/docs/transformers/main/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. @@ -260,6 +260,7 @@ conda install -c huggingface transformers 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (来自 Inria/Facebook/Sorbonne) 伴随论文 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 由 Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 发布。 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (来自 OFA-Sys) 伴随论文 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 由 An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 发布。 +1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (来自 University of Göttingen) 伴随论文 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 由 Timo Lüddecke and Alexander Ecker 发布。 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。 @@ -283,7 +284,7 @@ conda install -c huggingface transformers 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (来自 NAVER) 伴随论文 [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) 由 Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park 发布。 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (来自 Facebook) 伴随论文 [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 由 Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih 发布。 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (来自 Intel Labs) 伴随论文 [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) 由 René Ranftl, Alexey Bochkovskiy, Vladlen Koltun 发布。 -1. 
**[EfficientFormer](https://huggingface.co/docs/transformers/main/model_doc/efficientformer)** (来自 Snap Research) 伴随论文 [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) 由 Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren 发布。 +1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (来自 Snap Research) 伴随论文 [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) 由 Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren 发布。 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (来自 Google Research/Stanford University) 伴随论文 [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) 由 Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning 发布。 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (来自 Google Research) 伴随论文 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 由 Sascha Rothe, Shashi Narayan, Aliaksei Severyn 发布。 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (来自 Baidu) 伴随论文 [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu 发布。 @@ -293,7 +294,7 @@ conda install -c huggingface transformers 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (来自 Facebook AI) 伴随论文 [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) 由 Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela 发布。 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (来自 Google Research) 伴随论文 [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) 由 James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon 发布。 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (来自 CMU/Google Brain) 伴随论文 [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) 由 Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le 发布。 -1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (来自 Microsoft Research) 伴随论文 [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) 由 Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang 发布。 +1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (来自 Microsoft Research) 伴随论文 [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) 由 Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang 发布。 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (来自 KAIST) 伴随论文 [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) 由 Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim 发布。 1. 
**[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (来自 OpenAI) 伴随论文 [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) 由 Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever 发布。 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (来自 EleutherAI) 随仓库 [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) 发布。作者为 Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy 发布。 @@ -301,8 +302,8 @@ conda install -c huggingface transformers 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (来自 ABEJA) 由 Shinya Otani, Takayoshi Makabe, Anuj Arora, Kyo Hattori。 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (来自 OpenAI) 伴随论文 [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) 由 Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** 发布。 1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (来自 EleutherAI) 伴随论文 [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) 由 Ben Wang and Aran Komatsuzaki 发布。 -1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. -1. **[Graphormer](https://huggingface.co/docs/transformers/main/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (来自 UCSD, NVIDIA) 伴随论文 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 由 Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 发布。 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。 1. 
**[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 发布。 @@ -323,7 +324,7 @@ conda install -c huggingface transformers 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (来自 Facebook) 伴随论文 [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) 由 Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin 发布。 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** 用 [OPUS](http://opus.nlpl.eu/) 数据训练的机器翻译模型由 Jörg Tiedemann 发布。[Marian Framework](https://marian-nmt.github.io/) 由微软翻译团队开发。 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (来自 Microsoft Research Asia) 伴随论文 [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) 由 Junlong Li, Yiheng Xu, Lei Cui, Furu Wei 发布。 -1. **[Mask2Former](https://huggingface.co/docs/transformers/main/model_doc/mask2former)** (来自 FAIR and UIUC) 伴随论文 [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) 由 Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar 发布。 +1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (来自 FAIR and UIUC) 伴随论文 [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) 由 Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar 发布。 1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (来自 Facebook) 伴随论文 [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) 由 Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer 发布。 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (来自 Facebook) 伴随论文 [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) 由 Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan 发布。 @@ -341,7 +342,7 @@ conda install -c huggingface transformers 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (来自华为诺亚方舟实验室) 伴随论文 [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) 由 Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu 发布。 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (来自 Meta) 伴随论文 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) 由 the NLLB team 发布。 1. 
**[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (来自 the University of Wisconsin - Madison) 伴随论文 [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) 由 Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh 发布。 -1. **[OneFormer](https://huggingface.co/docs/transformers/main/model_doc/oneformer)** (来自 SHI Labs) 伴随论文 [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) 由 Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi 发布。 +1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (来自 SHI Labs) 伴随论文 [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) 由 Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi 发布。 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (来自 Meta AI) 伴随论文 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 由 Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 发布。 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (来自 Google AI) 伴随论文 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 由 Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 发布。 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。 @@ -359,8 +360,8 @@ conda install -c huggingface transformers 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (来自 Google Research) 伴随论文 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) 由 Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 发布。 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (来自 Facebook), 伴随论文 [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) 由 Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov 发布。 -1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/main/model_doc/roberta-prelayernorm)** (来自 Facebook) 伴随论文 [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) 由 Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli 发布。 -1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (来自 WeChatAI), 伴随论文 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 由 HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 发布。 +1. 
**[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (来自 Facebook) 伴随论文 [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) 由 Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli 发布。 +1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (来自 WeChatAI), 伴随论文 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 由 HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 发布。 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (来自 ZhuiyiTechnology), 伴随论文 [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 由 Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 发布。 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (来自 NVIDIA) 伴随论文 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 由 Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo 发布。 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (来自 ASAPP) 伴随论文 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 由 Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 发布。 @@ -371,28 +372,28 @@ conda install -c huggingface transformers 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (来自 Berkeley) 伴随论文 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 由 Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 发布。 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (来自 Microsoft) 伴随论文 [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) 由 Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo 发布。 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (来自 Microsoft) 伴随论文 [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) 由 Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo 发布。 -1. **[Swin2SR](https://huggingface.co/docs/transformers/main/model_doc/swin2sr)** (来自 University of Würzburg) 伴随论文 [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) 由 Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte 发布。 -1. **[SwitchTransformers](https://huggingface.co/docs/transformers/main/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. +1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (来自 University of Würzburg) 伴随论文 [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) 由 Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte 发布。 +1. 
**[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (来自 Google AI) 伴随论文 [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (来自 Google AI) 伴随论文 [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (来自 Microsoft Research) 伴随论文 [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) 由 Brandon Smock, Rohith Pesala, Robin Abraham 发布。 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (来自 Google AI) 伴随论文 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 由 Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 发布。 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (来自 Microsoft Research) 伴随论文 [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 由 Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou 发布。 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). -1. **[TimeSformer](https://huggingface.co/docs/transformers/main/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. +1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (来自 Google/CMU) 伴随论文 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 由 Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 发布。 1. 
**[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (来自 Microsoft) 伴随论文 [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 由 Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei 发布。 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (来自 Microsoft Research) 伴随论文 [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 由 Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang 发布。 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (来自 Microsoft Research) 伴随论文 [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) 由 Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu 发布。 -1. **[UPerNet](https://huggingface.co/docs/transformers/main/model_doc/upernet)** (来自 Peking University) 伴随论文 [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) 由 Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun 发布。 +1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (来自 Peking University) 伴随论文 [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) 由 Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun 发布。 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (来自 Tsinghua University and Nankai University) 伴随论文 [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) 由 Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu 发布。 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (来自 Multimedia Computing Group, Nanjing University) 伴随论文 [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) 由 Zhan Tong, Yibing Song, Jue Wang, Limin Wang 发布。 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (来自 NAVER AI Lab/Kakao Enterprise/Kakao Brain) 伴随论文 [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) 由 Wonjae Kim, Bokyung Son, Ildoo Kim 发布。 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (来自 UCLA NLP) 伴随论文 [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) 由 Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang 发布。 -1. 
**[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。 +1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (来自 Meta AI) 伴随论文 [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) 由 Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick 发布。 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (来自 Meta AI) 伴随论文 [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas 发布. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (来自 Facebook AI) 伴随论文 [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) 由 Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index f3b19fb209b6..a21d0db8cddd 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -249,7 +249,7 @@ conda install -c huggingface transformers 🤗 Transformers 目前支援以下的架構(模型概覽請參閱[這裡](https://huggingface.co/docs/transformers/model_summary)): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[AltCLIP](https://huggingface.co/docs/transformers/main/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. +1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. 1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. 1. 
**[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. @@ -260,11 +260,11 @@ conda install -c huggingface transformers 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. +1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. 
**[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. **[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry. 1. **[BridgeTower](https://huggingface.co/docs/transformers/main/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. @@ -272,6 +272,7 @@ conda install -c huggingface transformers 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. +1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. 
**[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. @@ -295,7 +296,7 @@ conda install -c huggingface transformers 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER) released with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. -1. **[EfficientFormer](https://huggingface.co/docs/transformers/main/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. +1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu. @@ -305,7 +306,7 @@ conda install -c huggingface transformers 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. 
**[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. +1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. @@ -313,8 +314,8 @@ conda install -c huggingface transformers 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori. 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. 1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released with the paper [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. -1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. -1. 
**[Graphormer](https://huggingface.co/docs/transformers/main/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. @@ -335,7 +336,7 @@ conda install -c huggingface transformers 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei. -1. **[Mask2Former](https://huggingface.co/docs/transformers/main/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. +1. 
**[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. 1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. @@ -353,7 +354,7 @@ conda install -c huggingface transformers 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. -1. **[OneFormer](https://huggingface.co/docs/transformers/main/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi. +1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi. 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. 
**[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. @@ -371,8 +372,8 @@ conda install -c huggingface transformers 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/main/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. -1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. +1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. +1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. 1. 
**[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. @@ -383,28 +384,28 @@ conda install -c huggingface transformers 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. -1. **[Swin2SR](https://huggingface.co/docs/transformers/main/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. -1. **[SwitchTransformers](https://huggingface.co/docs/transformers/main/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. +1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. +1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. 
**[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released with the paper [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). -1. **[TimeSformer](https://huggingface.co/docs/transformers/main/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. +1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler 1. 
**[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. -1. **[UPerNet](https://huggingface.co/docs/transformers/main/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun. +1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun. 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. 
**[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index e9140b7329b9..576157c1c42f 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -73,6 +73,7 @@ The documentation is organized into five sections: 1. **[CamemBERT](model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. +1. **[CLAP](model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. @@ -255,6 +256,7 @@ Flax), PyTorch, and/or TensorFlow. 
| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | | Chinese-CLIP | ❌ | ❌ | ✅ | ❌ | ❌ | +| clap | ✅ | ✅ | ✅ | ❌ | ❌ | | CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | | CLIPSeg | ❌ | ❌ | ✅ | ❌ | ❌ | | CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/clap.mdx b/docs/source/en/model_doc/clap.mdx new file mode 100644 index 000000000000..e321c98b8674 --- /dev/null +++ b/docs/source/en/model_doc/clap.mdx @@ -0,0 +1,96 @@ + + +# clap + +## Overview + +The clap model was proposed in []() by . + + +The abstract from the paper is the following: + +** + +Tips: + + + +This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +The original code can be found [here](). + + +## CLAPConfig + +[[autodoc]] CLAPConfig + - from_text_vision_configs + +## CLAPTextConfig + +[[autodoc]] CLAPTextConfig + +## CLAPVisionConfig + +[[autodoc]] CLAPVisionConfig + +## CLAPTokenizer + +[[autodoc]] CLAPTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## CLAPTokenizerFast + +[[autodoc]] CLAPTokenizerFast + +## CLAPImageProcessor + +[[autodoc]] CLAPImageProcessor + - preprocess + +## CLAPFeatureExtractor + +[[autodoc]] CLAPFeatureExtractor + +## CLAPProcessor + +[[autodoc]] CLAPProcessor + +## CLAPModel + +[[autodoc]] CLAPModel + - forward + - get_text_features + - get_image_features + +## CLAPTextModel + +[[autodoc]] CLAPTextModel + - forward + +## CLAPTextModelWithProjection + +[[autodoc]] CLAPTextModelWithProjection + - forward + +## CLAPVisionModelWithProjection + +[[autodoc]] CLAPVisionModelWithProjection + - forward + + +## CLAPVisionModel + +[[autodoc]] CLAPVisionModel + - forward diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index 7079a91f40c3..85d3f6bafa21 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -64,6 +64,7 @@ Ready-made configurations include the following architectures: - BLOOM - CamemBERT - Chinese-CLIP +- CLAP - CLIP - CodeGen - Conditional DETR diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index b65781ab75d5..68180ccadf08 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -202,6 +202,14 @@ "ChineseCLIPTextConfig", "ChineseCLIPVisionConfig", ], + "models.clap": [ + "CLAP_PRETRAINED_CONFIG_ARCHIVE_MAP", + "CLAPConfig", + "CLAPProcessor", + "CLAPTextConfig", + "CLAPTokenizer", + "CLAPVisionConfig", + ], "models.clip": [ "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", "CLIPConfig", @@ -656,6 +664,7 @@ _import_structure["models.blenderbot_small"].append("BlenderbotSmallTokenizerFast") _import_structure["models.bloom"].append("BloomTokenizerFast") _import_structure["models.camembert"].append("CamembertTokenizerFast") + _import_structure["models.clap"].append("CLAPTokenizerFast") _import_structure["models.clip"].append("CLIPTokenizerFast") _import_structure["models.codegen"].append("CodeGenTokenizerFast") _import_structure["models.convbert"].append("ConvBertTokenizerFast") @@ -788,6 +797,7 @@ _import_structure["models.blip"].extend(["BlipImageProcessor"]) _import_structure["models.bridgetower"].append("BridgeTowerImageProcessor") _import_structure["models.chinese_clip"].extend(["ChineseCLIPFeatureExtractor", "ChineseCLIPImageProcessor"]) + _import_structure["models.clap"].extend(["CLAPFeatureExtractor", "CLAPImageProcessor"]) _import_structure["models.clip"].extend(["CLIPFeatureExtractor", "CLIPImageProcessor"]) 
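The `clap.mdx` page and the top-level `__init__` registrations above expose a CLIP-style surface (`CLAPConfig`, `CLAPProcessor`, `CLAPModel` with `get_text_features` / `get_image_features`), since this scaffold was generated from CLIP. A minimal usage sketch under that assumption: the checkpoint id `laion/clap-base` is a placeholder, and the processor call and contrastive outputs are assumed to mirror CLIP until the audio-specific pieces replace the vision ones.

```python
# Hypothetical usage sketch for the CLIP-style scaffold documented above.
# "laion/clap-base" is a placeholder checkpoint id, not a real repository.
import requests
import torch
from PIL import Image

from transformers import CLAPModel, CLAPProcessor

model = CLAPModel.from_pretrained("laion/clap-base")
processor = CLAPProcessor.from_pretrained("laion/clap-base")  # wraps CLAPImageProcessor + CLAPTokenizerFast

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(
    text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
)

with torch.no_grad():
    outputs = model(**inputs)

# Assumed to mirror CLIP's contrastive output: image-text similarity logits.
probs = outputs.logits_per_image.softmax(dim=-1)

# Standalone projected embeddings, as listed in the autodoc stubs above.
text_embeds = model.get_text_features(
    input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]
)
image_embeds = model.get_image_features(pixel_values=inputs["pixel_values"])
```

If the final model swaps the vision tower for an audio encoder, the `images=` / `pixel_values` inputs in this sketch would presumably give way to an audio feature-extractor input.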
_import_structure["models.conditional_detr"].extend( ["ConditionalDetrFeatureExtractor", "ConditionalDetrImageProcessor"] @@ -1207,6 +1217,17 @@ "ChineseCLIPVisionModel", ] ) + _import_structure["models.clap"].extend( + [ + "CLAP_PRETRAINED_MODEL_ARCHIVE_LIST", + "CLAPModel", + "CLAPPreTrainedModel", + "CLAPTextModel", + "CLAPTextModelWithProjection", + "CLAPVisionModel", + "CLAPVisionModelWithProjection", + ] + ) _import_structure["models.clip"].extend( [ "CLIP_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -3645,6 +3666,14 @@ ChineseCLIPTextConfig, ChineseCLIPVisionConfig, ) + from .models.clap import ( + CLAP_PRETRAINED_CONFIG_ARCHIVE_MAP, + CLAPConfig, + CLAPProcessor, + CLAPTextConfig, + CLAPTokenizer, + CLAPVisionConfig, + ) from .models.clip import ( CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, CLIPConfig, @@ -4062,6 +4091,7 @@ from .models.blenderbot_small import BlenderbotSmallTokenizerFast from .models.bloom import BloomTokenizerFast from .models.camembert import CamembertTokenizerFast + from .models.clap import CLAPTokenizerFast from .models.clip import CLIPTokenizerFast from .models.codegen import CodeGenTokenizerFast from .models.convbert import ConvBertTokenizerFast @@ -4163,6 +4193,7 @@ from .models.blip import BlipImageProcessor from .models.bridgetower import BridgeTowerImageProcessor from .models.chinese_clip import ChineseCLIPFeatureExtractor, ChineseCLIPImageProcessor + from .models.clap import CLAPFeatureExtractor, CLAPImageProcessor from .models.clip import CLIPFeatureExtractor, CLIPImageProcessor from .models.conditional_detr import ConditionalDetrFeatureExtractor, ConditionalDetrImageProcessor from .models.convnext import ConvNextFeatureExtractor, ConvNextImageProcessor @@ -4517,6 +4548,15 @@ ChineseCLIPTextModel, ChineseCLIPVisionModel, ) + from .models.clap import ( + CLAP_PRETRAINED_MODEL_ARCHIVE_LIST, + CLAPModel, + CLAPPreTrainedModel, + CLAPTextModel, + CLAPTextModelWithProjection, + CLAPVisionModel, + CLAPVisionModelWithProjection, + ) from .models.clip import ( CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, CLIPModel, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 9eade475fa40..d83841e0660d 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -43,6 +43,7 @@ camembert, canine, chinese_clip, + clap, clip, clipseg, codegen, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 1a77eb015378..a66015d9a56e 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -48,6 +48,7 @@ ("camembert", "CamembertConfig"), ("canine", "CanineConfig"), ("chinese_clip", "ChineseCLIPConfig"), + ("clap", "CLAPConfig"), ("clip", "CLIPConfig"), ("clipseg", "CLIPSegConfig"), ("codegen", "CodeGenConfig"), @@ -215,6 +216,7 @@ ("camembert", "CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("canine", "CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("chinese_clip", "CHINESE_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("clap", "CLAP_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("clip", "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("clipseg", "CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("codegen", "CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -372,6 +374,7 @@ ("camembert", "CamemBERT"), ("canine", "CANINE"), ("chinese_clip", "Chinese-CLIP"), + ("clap", "clap"), ("clip", "CLIP"), ("clipseg", "CLIPSeg"), ("codegen", "CodeGen"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py 
b/src/transformers/models/auto/feature_extraction_auto.py
index 3726f9f238cc..9f619c63bf67 100644
--- a/src/transformers/models/auto/feature_extraction_auto.py
+++ b/src/transformers/models/auto/feature_extraction_auto.py
@@ -40,6 +40,7 @@
         ("audio-spectrogram-transformer", "ASTFeatureExtractor"),
         ("beit", "BeitFeatureExtractor"),
         ("chinese_clip", "ChineseCLIPFeatureExtractor"),
+        ("clap", "CLAPFeatureExtractor"),
         ("clip", "CLIPFeatureExtractor"),
         ("clipseg", "ViTFeatureExtractor"),
         ("conditional_detr", "ConditionalDetrFeatureExtractor"),
diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
index 8f61c7c6eed5..d1245eedff20 100644
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -42,6 +42,7 @@
         ("blip", "BlipImageProcessor"),
         ("bridgetower", "BridgeTowerImageProcessor"),
         ("chinese_clip", "ChineseCLIPImageProcessor"),
+        ("clap", "CLAPImageProcessor"),
         ("clip", "CLIPImageProcessor"),
         ("clipseg", "ViTImageProcessor"),
         ("conditional_detr", "ConditionalDetrImageProcessor"),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index 2fe18122aaa4..e92b8231d63b 100755
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -47,6 +47,7 @@
         ("camembert", "CamembertModel"),
         ("canine", "CanineModel"),
         ("chinese_clip", "ChineseCLIPModel"),
+        ("clap", "CLAPModel"),
         ("clip", "CLIPModel"),
         ("clipseg", "CLIPSegModel"),
         ("codegen", "CodeGenModel"),
@@ -897,6 +898,7 @@
         ("altclip", "AltCLIPModel"),
         ("blip", "BlipModel"),
         ("chinese_clip", "ChineseCLIPModel"),
+        ("clap", "CLAPModel"),
         ("clip", "CLIPModel"),
         ("clipseg", "CLIPSegModel"),
     ]
diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
index e55e76aff6e9..4250aa758396 100644
--- a/src/transformers/models/auto/processing_auto.py
+++ b/src/transformers/models/auto/processing_auto.py
@@ -45,6 +45,7 @@
         ("blip", "BLIPProcessor"),
         ("bridgetower", "BridgeTowerProcessor"),
         ("chinese_clip", "ChineseCLIPProcessor"),
+        ("clap", "CLAPProcessor"),
         ("clip", "CLIPProcessor"),
         ("clipseg", "CLIPSegProcessor"),
         ("flava", "FlavaProcessor"),
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index 7073221d744c..16bad5e3d26e 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -90,6 +90,13 @@
         ),
         ("canine", ("CanineTokenizer", None)),
         ("chinese_clip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
+        (
+            "clap",
+            (
+                "CLAPTokenizer",
+                "CLAPTokenizerFast" if is_tokenizers_available() else None,
+            ),
+        ),
         (
             "clip",
             (
diff --git a/src/transformers/models/clap/__init__.py b/src/transformers/models/clap/__init__.py
new file mode 100644
index 000000000000..a3a0c6882ccf
--- /dev/null
+++ b/src/transformers/models/clap/__init__.py
@@ -0,0 +1,121 @@
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_tokenizers_available, + is_torch_available, + is_vision_available, +) + + +_import_structure = { + "configuration_clap": [ + "CLAP_PRETRAINED_CONFIG_ARCHIVE_MAP", + "CLAPConfig", + "CLAPOnnxConfig", + "CLAPTextConfig", + "CLAPVisionConfig", + ], + "processing_clap": ["CLAPProcessor"], + "tokenization_clap": ["CLAPTokenizer"], +} + +try: + if not is_tokenizers_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["tokenization_clap_fast"] = ["CLAPTokenizerFast"] + +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["feature_extraction_clap"] = ["CLAPFeatureExtractor"] + _import_structure["image_processing_clap"] = ["CLAPImageProcessor"] + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_clap"] = [ + "CLAP_PRETRAINED_MODEL_ARCHIVE_LIST", + "CLAPModel", + "CLAPPreTrainedModel", + "CLAPTextModel", + "CLAPTextModelWithProjection", + "CLAPVisionModel", + "CLAPVisionModelWithProjection", + ] + +if TYPE_CHECKING: + from .configuration_clap import ( + CLAP_PRETRAINED_CONFIG_ARCHIVE_MAP, + CLAPConfig, + CLAPOnnxConfig, + CLAPTextConfig, + CLAPVisionConfig, + ) + from .processing_clap import CLAPProcessor + from .tokenization_clap import CLAPTokenizer + + try: + if not is_tokenizers_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .tokenization_clap_fast import CLAPTokenizerFast + + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .feature_extraction_clap import CLAPFeatureExtractor + from .image_processing_clap import CLAPImageProcessor + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_clap import ( + CLAP_PRETRAINED_MODEL_ARCHIVE_LIST, + CLAPModel, + CLAPPreTrainedModel, + CLAPTextModel, + CLAPTextModelWithProjection, + CLAPVisionModel, + CLAPVisionModelWithProjection, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py new file mode 100644 index 000000000000..38b8db91d00e --- /dev/null +++ b/src/transformers/models/clap/configuration_clap.py @@ -0,0 +1,402 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" CLAP model configuration""" + +import copy +import os +from collections import OrderedDict +from typing import TYPE_CHECKING, Any, Mapping, Optional, Union + + +if TYPE_CHECKING: + from ...processing_utils import ProcessorMixin + from ...utils import TensorType + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +CLAP_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "laion-ai/base": "https://huggingface.co/laion-ai/base/resolve/main/config.json", +} + + +class CLAPTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`CLAPTextModel`]. It is used to instantiate a CLAP + text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the text encoder of the CLAP + [laion-ai/base](https://huggingface.co/laion-ai/base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 49408): + Vocabulary size of the CLAP text model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`CLAPModel`]. + hidden_size (`int`, *optional*, defaults to 512): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + max_position_embeddings (`int`, *optional*, defaults to 77): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*, + defaults to 1e-5): The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float`, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). 
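+        projection_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of the text projection layer.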
+ + Example: + + ```python + >>> from transformers import CLAPTextConfig, CLAPTextModel + + >>> # Initializing a CLAPTextConfig with laion-ai/base style configuration + >>> configuration = CLAPTextConfig() + + >>> # Initializing a CLAPTextModel (with random weights) from the laion-ai/base style configuration + >>> model = CLAPTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "clap_text_model" + + def __init__( + self, + vocab_size=49408, + hidden_size=512, + intermediate_size=2048, + projection_dim=512, + num_hidden_layers=12, + num_attention_heads=8, + max_position_embeddings=77, + hidden_act="quick_gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the text config dict if we are loading from CLAPConfig + if config_dict.get("model_type") == "clap": + config_dict = config_dict["text_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class CLAPVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`CLAPVisionModel`]. It is used to instantiate a + CLAP vision encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLAP + [laion-ai/base](https://huggingface.co/laion-ai/base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. 
+ patch_size (`int`, *optional*, defaults to 32): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*, + defaults to 1e-5): The epsilon used by the layer normalization layers. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float`, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + + Example: + + ```python + >>> from transformers import CLAPVisionConfig, CLAPVisionModel + + >>> # Initializing a CLAPVisionConfig with laion-ai/base style configuration + >>> configuration = CLAPVisionConfig() + + >>> # Initializing a CLAPVisionModel (with random weights) from the laion-ai/base style configuration + >>> model = CLAPVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "clap_vision_model" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + projection_dim=512, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=32, + hidden_act="quick_gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + **kwargs + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from CLAPConfig + if config_dict.get("model_type") == "clap": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class CLAPConfig(PretrainedConfig): + r""" + [`CLAPConfig`] is the configuration class to store the configuration of a [`CLAPModel`]. It is used to instantiate + a CLAP model according to the specified arguments, defining the text model and vision model configs. 
Instantiating
+    a configuration with the defaults will yield a similar configuration to that of the CLAP
+    [laion-ai/base](https://huggingface.co/laion-ai/base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`CLAPTextConfig`].
+        vision_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`CLAPVisionConfig`].
+        projection_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of text and vision projection layers.
+        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
+            The initial value of the *logit_scale* parameter. Default is used as per the original CLAP implementation.
+        kwargs (*optional*):
+            Dictionary of keyword arguments.
+
+    Example:
+
+    ```python
+    >>> from transformers import CLAPConfig, CLAPModel
+
+    >>> # Initializing a CLAPConfig with laion-ai/base style configuration
+    >>> configuration = CLAPConfig()
+
+    >>> # Initializing a CLAPModel (with random weights) from the laion-ai/base style configuration
+    >>> model = CLAPModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+
+    >>> # We can also initialize a CLAPConfig from a CLAPTextConfig and a CLAPVisionConfig
+    >>> from transformers import CLAPTextConfig, CLAPVisionConfig
+
+    >>> # Initializing a CLAPText and CLAPVision configuration
+    >>> config_text = CLAPTextConfig()
+    >>> config_vision = CLAPVisionConfig()
+
+    >>> config = CLAPConfig.from_text_vision_configs(config_text, config_vision)
+    ```"""
+
+    model_type = "clap"
+    is_composition = True
+
+    def __init__(
+        self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
+    ):
+        super().__init__(**kwargs)
+
+        # If `_config_dict` exist, we use them for the backward compatibility.
+        text_config_dict = kwargs.pop("text_config_dict", None)
+        vision_config_dict = kwargs.pop("vision_config_dict", None)
+        if text_config_dict is not None:
+            text_config = text_config_dict
+        if vision_config_dict is not None:
+            vision_config = vision_config_dict
+
+        if text_config is None:
+            text_config = {}
+            logger.info("text_config is None. Initializing the CLAPTextConfig with default values.")
+
+        if vision_config is None:
+            vision_config = {}
+            logger.info("vision_config is None. Initializing the CLAPVisionConfig with default values.")
+
+        self.text_config = CLAPTextConfig(**text_config)
+        self.vision_config = CLAPVisionConfig(**vision_config)
+
+        self.projection_dim = projection_dim
+        self.logit_scale_init_value = logit_scale_init_value
+        self.initializer_factor = 1.0
+
+    @classmethod
+    def from_text_vision_configs(cls, text_config: CLAPTextConfig, vision_config: CLAPVisionConfig, **kwargs):
+        r"""
+        Instantiate a [`CLAPConfig`] (or a derived class) from clap text model configuration and clap vision model
+        configuration.
+
+        Returns:
+            [`CLAPConfig`]: An instance of a configuration object
+        """
+
+        return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
+
+    def to_dict(self):
+        """
+        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+ + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["text_config"] = self.text_config.to_dict() + output["vision_config"] = self.vision_config.to_dict() + output["model_type"] = self.__class__.model_type + return output + + +class CLAPOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("input_ids", {0: "batch", 1: "sequence"}), + ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), + ("attention_mask", {0: "batch", 1: "sequence"}), + ] + ) + + @property + def outputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("logits_per_image", {0: "batch"}), + ("logits_per_text", {0: "batch"}), + ("text_embeds", {0: "batch"}), + ("image_embeds", {0: "batch"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-4 + + def generate_dummy_inputs( + self, + processor: "ProcessorMixin", + batch_size: int = -1, + seq_length: int = -1, + framework: Optional["TensorType"] = None, + ) -> Mapping[str, Any]: + + text_input_dict = super().generate_dummy_inputs( + processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework + ) + image_input_dict = super().generate_dummy_inputs( + processor.feature_extractor, batch_size=batch_size, framework=framework + ) + return {**text_input_dict, **image_input_dict} + + @property + def default_onnx_opset(self) -> int: + return 14 diff --git a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py b/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py new file mode 100644 index 000000000000..09af4ebc0c7a --- /dev/null +++ b/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py @@ -0,0 +1,148 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
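+# Example usage (a sketch -- the checkpoint path below is hypothetical):
+#
+#   python convert_clap_original_pytorch_to_hf.py \
+#       --checkpoint_path /path/to/original_clap_checkpoint.pt \
+#       --pytorch_dump_folder_path ./clap-hf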
+ +import argparse + +import torch + +from clap import load +from transformers import CLAPConfig, CLAPModel + + +def copy_attn_layer(hf_attn_layer, pt_attn_layer): + q_proj, k_proj, v_proj = pt_attn_layer.in_proj_weight.chunk(3, dim=0) + q_proj_bias, k_proj_bias, v_proj_bias = pt_attn_layer.in_proj_bias.chunk(3, dim=0) + + out_proj_weights = pt_attn_layer.out_proj.weight + out_proj_bias = pt_attn_layer.out_proj.bias + + hf_attn_layer.q_proj.weight.data = q_proj + hf_attn_layer.q_proj.bias.data = q_proj_bias + + hf_attn_layer.k_proj.weight.data = k_proj + hf_attn_layer.k_proj.bias.data = k_proj_bias + + hf_attn_layer.v_proj.weight.data = v_proj + hf_attn_layer.v_proj.bias.data = v_proj_bias + + hf_attn_layer.out_proj.weight = out_proj_weights + hf_attn_layer.out_proj.bias = out_proj_bias + + +def copy_mlp(hf_mlp, pt_mlp): + copy_linear(hf_mlp.fc1, pt_mlp.c_fc) + copy_linear(hf_mlp.fc2, pt_mlp.c_proj) + + +def copy_linear(hf_linear, pt_linear): + hf_linear.weight = pt_linear.weight + hf_linear.bias = pt_linear.bias + + +def copy_layer(hf_layer, pt_layer): + # copy layer norms + copy_linear(hf_layer.layer_norm1, pt_layer.ln_1) + copy_linear(hf_layer.layer_norm2, pt_layer.ln_2) + + # copy MLP + copy_mlp(hf_layer.mlp, pt_layer.mlp) + + # copy attn + copy_attn_layer(hf_layer.self_attn, pt_layer.attn) + + +def copy_layers(hf_layers, pt_layers): + for hf_layer, pt_layer in zip(hf_layers, pt_layers): + copy_layer(hf_layer, pt_layer) + + +def copy_encoder(hf_encoder, pt_model): + # copy embeds + hf_encoder.embeddings.token_embedding.weight = pt_model.token_embedding.weight + hf_encoder.embeddings.position_embedding.weight.data = pt_model.positional_embedding + + # copy layer norm + copy_linear(hf_encoder.final_layer_norm, pt_model.ln_final) + + # copy hidden layers + copy_layers(hf_encoder.encoder.layers, pt_model.transformer.resblocks) + + +def copy_text_model_and_projection(hf_model, pt_model): + # copy projection + hf_model.text_projection.weight.data = pt_model.text_projection.data.T + + # copy text encoder + copy_encoder(hf_model.text_model, pt_model) + + +def copy_vison_model_and_projection(hf_model, pt_model): + # copy projection + hf_model.visual_projection.weight.data = pt_model.visual.proj.data.T + + # copy layer norms + copy_linear(hf_model.vision_model.pre_layrnorm, pt_model.visual.ln_pre) + copy_linear(hf_model.vision_model.post_layernorm, pt_model.visual.ln_post) + + # copy embeds + hf_model.vision_model.embeddings.patch_embedding.weight.data = pt_model.visual.conv1.weight.data + hf_model.vision_model.embeddings.class_embedding = pt_model.visual.class_embedding + hf_model.vision_model.embeddings.position_embedding.weight.data = pt_model.visual.positional_embedding.data + + # copy encoder + copy_layers(hf_model.vision_model.encoder.layers, pt_model.visual.transformer.resblocks) + + +@torch.no_grad() +def convert_clap_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None): + """ + Copy/paste/tweak model's weights to transformers design. 
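+
+    Loads the original checkpoint with `clap.load`, copies its weights into a freshly initialized `CLAPModel`
+    (built either from `config_path` or from a default `CLAPConfig`), checks that the two models produce matching
+    logits on dummy inputs (within an absolute tolerance of 1e-3), and saves the converted model to
+    `pytorch_dump_folder_path`.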
+ """ + if config_path is not None: + config = CLAPConfig.from_pretrained(config_path) + else: + config = CLAPConfig(projection_dim=512, text_config={}, vision_config={}) + + hf_model = CLAPModel(config).eval() + + pt_model, _ = load(checkpoint_path, device="cpu", jit=False) + pt_model = pt_model.eval() + + copy_text_model_and_projection(hf_model, pt_model) + copy_vison_model_and_projection(hf_model, pt_model) + hf_model.logit_scale = pt_model.logit_scale + + input_ids = torch.arange(0, 77).unsqueeze(0) + pixel_values = torch.randn(1, 3, 224, 224) + + hf_logits_per_image, hf_logits_per_text = hf_model( + input_ids=input_ids, pixel_values=pixel_values, return_dict=True + )[1:3] + pt_logits_per_image, pt_logits_per_text = pt_model(pixel_values, input_ids) + + assert torch.allclose(hf_logits_per_image, pt_logits_per_image, atol=1e-3) + assert torch.allclose(hf_logits_per_text, pt_logits_per_text, atol=1e-3) + + hf_model.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") + parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint") + parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") + args = parser.parse_args() + + convert_clap_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py new file mode 100644 index 000000000000..2e5ca3be88ff --- /dev/null +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -0,0 +1,33 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for CLAP.""" + +import warnings + +from ...utils import logging +from .image_processing_clap import CLAPImageProcessor + + +logger = logging.get_logger(__name__) + + +class CLAPFeatureExtractor(CLAPImageProcessor): + def __init__(self, *args, **kwargs) -> None: + warnings.warn( + "The class CLAPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please" + " use CLAPImageProcessor instead.", + FutureWarning, + ) + super().__init__(*args, **kwargs) diff --git a/src/transformers/models/clap/image_processing_clap.py b/src/transformers/models/clap/image_processing_clap.py new file mode 100644 index 000000000000..24444d540d12 --- /dev/null +++ b/src/transformers/models/clap/image_processing_clap.py @@ -0,0 +1,338 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for CLAP."""
+
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+
+from transformers.utils.generic import TensorType
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import (
+    center_crop,
+    convert_to_rgb,
+    get_resize_output_image_size,
+    normalize,
+    rescale,
+    resize,
+    to_channel_dimension_format,
+)
+from ...image_utils import (
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+)
+from ...utils import logging
+from ...utils.import_utils import is_vision_available
+
+
+logger = logging.get_logger(__name__)
+
+
+if is_vision_available():
+    import PIL
+
+
+class CLAPImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a CLAP image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
+            `do_resize` in the `preprocess` method.
+        size (`Dict[str, int]`, *optional*, defaults to `{"shortest_edge": 224}`):
+            Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
+            the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
+            method.
+        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
+        do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
+            `preprocess` method.
+        crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
+            Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
+            method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
+            the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
+            method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
+        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
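+
+    Example (a minimal sketch relying only on the default values documented above):
+
+    ```python
+    >>> from PIL import Image
+    >>> from transformers import CLAPImageProcessor
+
+    >>> image_processor = CLAPImageProcessor()
+    >>> image = Image.new("RGB", (640, 480))
+    >>> pixel_values = image_processor(images=image, return_tensors="np")["pixel_values"]
+    >>> pixel_values.shape  # resized so the shortest edge is 224, then center cropped to 224 x 224
+    (1, 3, 224, 224)
+    ```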
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = True, + crop_size: Dict[str, int] = None, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"shortest_edge": 224} + size = get_size_dict(size, default_to_square=False) + crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} + crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else [0.48145466, 0.4578275, 0.40821073] + self.image_std = image_std if image_std is not None else [0.26862954, 0.26130258, 0.27577711] + self.do_convert_rgb = do_convert_rgb + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge + resized to keep the input aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + size = get_size_dict(size, default_to_square=False) + if "shortest_edge" not in size: + raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}") + output_size = get_resize_output_image_size(image, size=size["shortest_edge"], default_to_square=False) + return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs) + + def center_crop( + self, + image: np.ndarray, + size: Dict[str, int], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Center crop an image. If the image is too small to be cropped to the size given, it will be padded (so the + returned result will always be of size `size`). + + Args: + image (`np.ndarray`): + Image to center crop. + size (`Dict[str, int]`): + Size of the output image in the form of a dictionary with keys `height` and `width`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` parameter must contain the keys (height, width). 
Got {size.keys()}") + return center_crop(image, size=(size["height"], size["width"]), data_format=data_format, **kwargs) + + def rescale( + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ): + """ + Rescale an image by a scale factor. image = image * scale. + + Args: + image (`np.ndarray`): + Image to rescale. + scale (`int` or `float`): + Scale to apply to the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return rescale(image, scale=scale, data_format=data_format, **kwargs) + + def normalize( + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Normalize an image. image = (image - image_mean) / image_std. + + Args: + image (`np.ndarray`): + Image to normalize. + image_mean (`float` or `List[float]`): + Image mean. + image_std (`float` or `List[float]`): + Image standard deviation. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) + + def preprocess( + self, + images: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: int = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + **kwargs + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. 
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: defaults to the channel dimension format of the input image. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, param_name="size", default_to_square=False) + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + if do_resize and size is None: + raise ValueError("Size must be specified if do_resize is True.") + + if do_center_crop and crop_size is None: + raise ValueError("Crop size must be specified if do_center_crop is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. 
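+        # The steps below are each gated by their `do_*` flag and mirror the CLIP-style pipeline:
+        # resize (shortest edge), center crop, rescale by `rescale_factor` (1/255 by default), normalize,
+        # then move the channels to the requested `data_format`.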
+ images = [to_numpy_array(image) for image in images] + + if do_resize: + images = [self.resize(image=image, size=size, resample=resample) for image in images] + + if do_center_crop: + images = [self.center_crop(image=image, size=crop_size) for image in images] + + if do_rescale: + images = [self.rescale(image=image, scale=rescale_factor) for image in images] + + if do_normalize: + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py new file mode 100644 index 000000000000..31ebab8f0f9d --- /dev/null +++ b/src/transformers/models/clap/modeling_clap.py @@ -0,0 +1,1335 @@ +# coding=utf-8 +# Copyright 2023 The OpenAI Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch CLAP model.""" + + +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from ...modeling_utils import PreTrainedModel +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_clap import CLAPConfig, CLAPTextConfig, CLAPVisionConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "laion-ai/base" + +CLAP_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "laion-ai/base", + # See all clap models at https://huggingface.co/models?filter=clap +] + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
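+
+    Positions where the mask is 1 are mapped to 0.0, while masked (0) positions are filled with the most negative
+    value representable in `dtype`, so that they contribute essentially nothing after the attention softmax.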
+ """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) + + +# contrastive loss function, adapted from +# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/Clip.html +def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: + return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) + + +# Copied from transformers.models.clip.modeling_clip.clip_loss with clip->clap +def clap_loss(similarity: torch.Tensor) -> torch.Tensor: + caption_loss = contrastive_loss(similarity) + image_loss = contrastive_loss(similarity.t()) + return (caption_loss + image_loss) / 2.0 + + +@dataclass +# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->CLAP +class CLAPVisionModelOutput(ModelOutput): + """ + Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. + + Args: + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + image_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->CLAP +class CLAPTextModelOutput(ModelOutput): + """ + Base class for text model's outputs that also contains a pooling of the last hidden states. + + Args: + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The text embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + text_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->CLAP +class CLAPOutput(ModelOutput): + """ + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`CLAPTextModel`]. + image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of [`CLAPVisionModel`]. + text_model_output(`BaseModelOutputWithPooling`): + The output of the [`CLAPTextModel`]. + vision_model_output(`BaseModelOutputWithPooling`): + The output of the [`CLAPVisionModel`]. 
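+
+    Calling `to_tuple()` on this output also converts the nested `text_model_output` and `vision_model_output` to
+    tuples instead of returning them as `BaseModelOutputWithPooling` instances.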
+ """ + + loss: Optional[torch.FloatTensor] = None + logits_per_image: torch.FloatTensor = None + logits_per_text: torch.FloatTensor = None + text_embeds: torch.FloatTensor = None + image_embeds: torch.FloatTensor = None + text_model_output: BaseModelOutputWithPooling = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +# Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->CLAP +class CLAPVisionEmbeddings(nn.Module): + def __init__(self, config: CLAPVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + bias=False, + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + batch_size = pixel_values.shape[0] + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +# Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->CLAP +class CLAPTextEmbeddings(nn.Module): + def __init__(self, config: CLAPTextConfig): + super().__init__() + embed_dim = config.hidden_size + + self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) + self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if inputs_embeds is None: + inputs_embeds = self.token_embedding(input_ids) + + position_embeddings = self.position_embedding(position_ids) + embeddings = inputs_embeds + position_embeddings + + return embeddings + + +# Copied from transformers.models.clip.modeling_clip.CLIPAttention with CLIP->CLAP +class CLAPAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: 
{self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scale + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + # apply the causal_attention_mask first + if causal_attention_mask is not None: + if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" + f" {causal_attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit akward, but it's required to + # make sure that attn_weights keeps its gradient. 
+ # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->CLAP +class CLAPMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->CLAP +class CLAPEncoderLayer(nn.Module): + def __init__(self, config: CLAPConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = CLAPAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim) + self.mlp = CLAPMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + causal_attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +# Copied from transformers.models.clip.modeling_clip.CLIPPreTrainedModel with CLIP->CLAP,clip->clap +class CLAPPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = CLAPConfig + base_model_prefix = "clap" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_factor + if isinstance(module, CLAPTextEmbeddings): + module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) + module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) + elif isinstance(module, CLAPVisionEmbeddings): + factor = self.config.initializer_factor + nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) + nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) + nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor) + elif isinstance(module, CLAPAttention): + factor = self.config.initializer_factor + in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + out_proj_std = (module.embed_dim**-0.5) * factor + nn.init.normal_(module.q_proj.weight, std=in_proj_std) + nn.init.normal_(module.k_proj.weight, std=in_proj_std) + nn.init.normal_(module.v_proj.weight, std=in_proj_std) + nn.init.normal_(module.out_proj.weight, std=out_proj_std) + elif isinstance(module, CLAPMLP): + factor = self.config.initializer_factor + in_proj_std = ( + (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + ) + fc_std = (2 * module.config.hidden_size) ** -0.5 * factor + nn.init.normal_(module.fc1.weight, std=fc_std) + nn.init.normal_(module.fc2.weight, std=in_proj_std) + elif isinstance(module, CLAPModel): + nn.init.normal_( + module.text_projection.weight, + std=module.text_embed_dim**-0.5 * self.config.initializer_factor, + ) + nn.init.normal_( + module.visual_projection.weight, + std=module.vision_embed_dim**-0.5 * self.config.initializer_factor, + ) + elif isinstance(module, CLAPVisionModelWithProjection): + nn.init.normal_( + module.visual_projection.weight, + std=self.config.hidden_size**-0.5 * self.config.initializer_factor, + ) + elif isinstance(module, CLAPTextModelWithProjection): + nn.init.normal_( + module.text_projection.weight, + std=self.config.hidden_size**-0.5 * self.config.initializer_factor, + ) + + if isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, CLAPEncoder): + module.gradient_checkpointing = value + + +CLAP_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`CLAPConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + +CLAP_TEXT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +CLAP_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`CLAPImageProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +CLAP_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. 
Pixel values can be obtained using + [`AutoImageProcessor`]. See [`CLAPImageProcessor.__call__`] for details. + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->CLAP +class CLAPEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`CLAPEncoderLayer`]. + + Args: + config: CLAPConfig + """ + + def __init__(self, config: CLAPConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([CLAPEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Causal mask for the text model. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + causal_attention_mask, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + causal_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class CLAPTextTransformer(nn.Module): + def __init__(self, config: CLAPTextConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + self.embeddings = CLAPTextEmbeddings(config) + self.encoder = CLAPEncoder(config) + self.final_layer_norm = nn.LayerNorm(embed_dim) + + @add_start_docstrings_to_model_forward(CLAP_TEXT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLAPTextConfig) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is None: + raise ValueError("You have to specify input_ids") + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + + hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) + + bsz, seq_len = input_shape + # CLAP's text model uses causal mask, prepare it here. 
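+        # The mask built below by `_build_causal_attention_mask` is additive, with shape
+        # (bsz, 1, seq_len, seq_len): zeros on and below the diagonal and the dtype's minimum value above it,
+        # so once it is added to the attention logits each token can only attend to itself and earlier tokens.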
+ # https://github.com/openai/CLAP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clap/model.py#L324 + causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to( + hidden_states.device + ) + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, hidden_states.dtype) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.final_layer_norm(last_hidden_state) + + # text_embeds.shape = [batch_size, sequence_length, transformer.width] + # take features from the eot embedding (eot_token is the highest number in each sequence) + # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14 + pooled_output = last_hidden_state[ + torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device), + input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1), + ] + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def _build_causal_attention_mask(self, bsz, seq_len, dtype): + # lazily create causal attention mask, with full attention between the vision tokens + # pytorch uses additive attention mask; fill with -inf + mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype) + mask.fill_(torch.tensor(torch.finfo(dtype).min)) + mask.triu_(1) # zero out the lower diagonal + mask = mask.unsqueeze(1) # expand mask + return mask + + +@add_start_docstrings( + """The text model from CLAP without any head or projection on top.""", + CLAP_START_DOCSTRING, +) +class CLAPTextModel(CLAPPreTrainedModel): + config_class = CLAPTextConfig + + _no_split_modules = ["CLAPEncoderLayer"] + + def __init__(self, config: CLAPTextConfig): + super().__init__(config) + self.text_model = CLAPTextTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.text_model.embeddings.token_embedding + + def set_input_embeddings(self, value): + self.text_model.embeddings.token_embedding = value + + @add_start_docstrings_to_model_forward(CLAP_TEXT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLAPTextConfig) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoTokenizer, CLAPTextModel + + >>> model = CLAPTextModel.from_pretrained("laion-ai/base") + >>> tokenizer = AutoTokenizer.from_pretrained("laion-ai/base") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = 
outputs.pooler_output # pooled (EOS token) states + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + return self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class CLAPVisionTransformer(nn.Module): + def __init__(self, config: CLAPVisionConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = CLAPVisionEmbeddings(config) + self.pre_layrnorm = nn.LayerNorm(embed_dim) + self.encoder = CLAPEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim) + + @add_start_docstrings_to_model_forward(CLAP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLAPVisionConfig) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + hidden_states = self.pre_layrnorm(hidden_states) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + """The vision model from CLAP without any head or projection on top.""", + CLAP_START_DOCSTRING, +) +class CLAPVisionModel(CLAPPreTrainedModel): + config_class = CLAPVisionConfig + main_input_name = "pixel_values" + + def __init__(self, config: CLAPVisionConfig): + super().__init__(config) + self.vision_model = CLAPVisionTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + @add_start_docstrings_to_model_forward(CLAP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLAPVisionConfig) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, CLAPVisionModel + + >>> model = CLAPVisionModel.from_pretrained("laion-ai/base") + >>> processor = 
AutoProcessor.from_pretrained("laion-ai/base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled CLS states + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + return self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +@add_start_docstrings(CLAP_START_DOCSTRING) +class CLAPModel(CLAPPreTrainedModel): + config_class = CLAPConfig + + def __init__(self, config: CLAPConfig): + super().__init__(config) + + if not isinstance(config.text_config, CLAPTextConfig): + raise ValueError( + "config.text_config is expected to be of type CLAPTextConfig but is of type" + f" {type(config.text_config)}." + ) + + if not isinstance(config.vision_config, CLAPVisionConfig): + raise ValueError( + "config.vision_config is expected to be of type CLAPVisionConfig but is of type" + f" {type(config.vision_config)}." + ) + + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = vision_config.hidden_size + + self.text_model = CLAPTextTransformer(text_config) + self.vision_model = CLAPVisionTransformer(vision_config) + + self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) + self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) + self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(CLAP_TEXT_INPUTS_DOCSTRING) + def get_text_features( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + r""" + Returns: + text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by + applying the projection layer to the pooled output of [`CLAPTextModel`]. + + Examples: + + ```python + >>> from transformers import AutoTokenizer, CLAPModel + + >>> model = CLAPModel.from_pretrained("laion-ai/base") + >>> tokenizer = AutoTokenizer.from_pretrained("laion-ai/base") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + >>> text_features = model.get_text_features(**inputs) + ```""" + # Use CLAP model's config for some fields (if specified) instead of those of vision & text components. 
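+        # This method runs the text tower, takes its pooled (EOT-token) output and maps it into the shared
+        # multimodal space with `text_projection`; unlike `forward`, it does not L2-normalize the result.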
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = text_outputs[1] + text_features = self.text_projection(pooled_output) + + return text_features + + @add_start_docstrings_to_model_forward(CLAP_VISION_INPUTS_DOCSTRING) + def get_image_features( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + r""" + Returns: + image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by + applying the projection layer to the pooled output of [`CLAPVisionModel`]. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, CLAPModel + + >>> model = CLAPModel.from_pretrained("laion-ai/base") + >>> processor = AutoProcessor.from_pretrained("laion-ai/base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> image_features = model.get_image_features(**inputs) + ```""" + # Use CLAP model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = vision_outputs[1] # pooled_output + image_features = self.visual_projection(pooled_output) + + return image_features + + @add_start_docstrings_to_model_forward(CLAP_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CLAPOutput, config_class=CLAPConfig) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CLAPOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, CLAPModel + + >>> model = CLAPModel.from_pretrained("laion-ai/base") + >>> processor = AutoProcessor.from_pretrained("laion-ai/base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( + ... 
text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True + ... ) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + # Use CLAP model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + image_embeds = vision_outputs[1] + image_embeds = self.visual_projection(image_embeds) + + text_embeds = text_outputs[1] + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) + text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale + logits_per_image = logits_per_text.t() + + loss = None + if return_loss: + loss = clap_loss(logits_per_text) + + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + return ((loss,) + output) if loss is not None else output + + return CLAPOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +@add_start_docstrings( + """ + CLAP Text Model with a projection layer on top (a linear layer on top of the pooled output). 
+ """, + CLAP_START_DOCSTRING, +) +class CLAPTextModelWithProjection(CLAPPreTrainedModel): + config_class = CLAPTextConfig + + _no_split_modules = ["CLAPEncoderLayer"] + + def __init__(self, config: CLAPTextConfig): + super().__init__(config) + + self.text_model = CLAPTextTransformer(config) + + self.text_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.text_model.embeddings.token_embedding + + def set_input_embeddings(self, value): + self.text_model.embeddings.token_embedding = value + + @add_start_docstrings_to_model_forward(CLAP_TEXT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CLAPTextModelOutput, config_class=CLAPTextConfig) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CLAPTextModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoTokenizer, CLAPTextModelWithProjection + + >>> model = CLAPTextModelWithProjection.from_pretrained("laion-ai/base") + >>> tokenizer = AutoTokenizer.from_pretrained("laion-ai/base") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> text_embeds = outputs.text_embeds + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = text_outputs[1] + + text_embeds = self.text_projection(pooled_output) + + if not return_dict: + outputs = (text_embeds, text_outputs[0]) + text_outputs[2:] + return tuple(output for output in outputs if output is not None) + + return CLAPTextModelOutput( + text_embeds=text_embeds, + last_hidden_state=text_outputs.last_hidden_state, + hidden_states=text_outputs.hidden_states, + attentions=text_outputs.attentions, + ) + + +@add_start_docstrings( + """ + CLAP Vision Model with a projection layer on top (a linear layer on top of the pooled output). 
+ """, + CLAP_START_DOCSTRING, +) +class CLAPVisionModelWithProjection(CLAPPreTrainedModel): + config_class = CLAPVisionConfig + main_input_name = "pixel_values" + + def __init__(self, config: CLAPVisionConfig): + super().__init__(config) + + self.vision_model = CLAPVisionTransformer(config) + + self.visual_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + @add_start_docstrings_to_model_forward(CLAP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CLAPVisionModelOutput, config_class=CLAPVisionConfig) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CLAPVisionModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, CLAPVisionModelWithProjection + + >>> model = CLAPVisionModelWithProjection.from_pretrained("laion-ai/base") + >>> processor = AutoProcessor.from_pretrained("laion-ai/base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> image_embeds = outputs.image_embeds + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = vision_outputs[1] # pooled_output + + image_embeds = self.visual_projection(pooled_output) + + if not return_dict: + outputs = (image_embeds, vision_outputs[0]) + vision_outputs[2:] + return tuple(output for output in outputs if output is not None) + + return CLAPVisionModelOutput( + image_embeds=image_embeds, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + ) diff --git a/src/transformers/models/clap/processing_clap.py b/src/transformers/models/clap/processing_clap.py new file mode 100644 index 000000000000..bdee25e87bf9 --- /dev/null +++ b/src/transformers/models/clap/processing_clap.py @@ -0,0 +1,146 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image/Text processor class for CLAP +""" + +import warnings + +from ...processing_utils import ProcessorMixin +from ...tokenization_utils_base import BatchEncoding + + +class CLAPProcessor(ProcessorMixin): + r""" + Constructs a CLAP processor which wraps a CLAP image processor and a CLAP tokenizer into a single processor. 
+
+    [`CLAPProcessor`] offers all the functionalities of [`CLAPImageProcessor`] and [`CLAPTokenizerFast`]. See the
+    [`~CLAPProcessor.__call__`] and [`~CLAPProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`CLAPImageProcessor`]):
+            The image processor is a required input.
+        tokenizer ([`CLAPTokenizerFast`]):
+            The tokenizer is a required input.
+    """
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "CLAPImageProcessor"
+    tokenizer_class = ("CLAPTokenizer", "CLAPTokenizerFast")
+
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")
+
+        image_processor = image_processor if image_processor is not None else feature_extractor
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(image_processor, tokenizer)
+
+    def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
+        """
+        Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to CLAPTokenizerFast's [`~CLAPTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+        CLAPImageProcessor's [`~CLAPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
+        of the above two methods for more information.
+
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
+                number of channels, H and W are image height and width.
+
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+        """
+
+        if text is None and images is None:
+            raise ValueError("You have to specify either text or images. 
Both cannot be none.") + + if text is not None: + encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) + + if images is not None: + image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) + + if text is not None and images is not None: + encoding["pixel_values"] = image_features.pixel_values + return encoding + elif text is not None: + return encoding + else: + return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to CLAPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to CLAPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + @property + def feature_extractor_class(self): + warnings.warn( + "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.", + FutureWarning, + ) + return self.image_processor_class + + @property + def feature_extractor(self): + warnings.warn( + "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.", + FutureWarning, + ) + return self.image_processor diff --git a/src/transformers/models/clap/tokenization_clap.py b/src/transformers/models/clap/tokenization_clap.py new file mode 100644 index 000000000000..623fdf60c713 --- /dev/null +++ b/src/transformers/models/clap/tokenization_clap.py @@ -0,0 +1,523 @@ +# coding=utf-8 +# Copyright 2023 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
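Before the tokenizer files, a short usage sketch of the `CLAPProcessor` defined above may help; it mirrors the doctests elsewhere in this patch, and the `laion-ai/base` checkpoint name is the placeholder those doctests use (it may not exist on the Hub):

```python
# Minimal usage sketch for CLAPProcessor, assuming the placeholder checkpoint "laion-ai/base".
import requests
from PIL import Image
from transformers import CLAPProcessor  # class added by this patch

processor = CLAPProcessor.from_pretrained("laion-ai/base")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Text only -> input_ids / attention_mask; images only -> pixel_values; both -> one merged BatchEncoding.
inputs = processor(text=["a photo of a cat"], images=image, return_tensors="pt", padding=True)
print(sorted(inputs.keys()))  # expected: ['attention_mask', 'input_ids', 'pixel_values']
```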
+"""Tokenization classes for CLAP.""" + +import json +import os +import unicodedata +from functools import lru_cache +from typing import List, Optional, Tuple + +import regex as re + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", +} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "laion-ai/base": "https://huggingface.co/laion-ai/base/resolve/main/vocab.json", + }, + "merges_file": { + "laion-ai/base": "https://huggingface.co/laion-ai/base/resolve/main/merges.txt", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "laion-ai/base": 77, +} + + +PRETRAINED_INIT_CONFIGURATION = { + "laion-ai/base": {}, +} + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control + characters the bpe code barfs on. + + The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab + if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for + decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup + tables between utf-8 bytes and unicode strings. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +def whitespace_clean(text): + text = re.sub(r"\s+", " ", text) + text = text.strip() + return text + + +# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer +class BasicTokenizer(object): + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + + Args: + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). 
+ """ + + def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see + WordPieceTokenizer. + + Args: + never_split (`List[str]`, *optional*) + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union(set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if never_split is not None and text in never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
+ if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) # + or (cp >= 0x20000 and cp <= 0x2A6DF) # + or (cp >= 0x2A700 and cp <= 0x2B73F) # + or (cp >= 0x2B740 and cp <= 0x2B81F) # + or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) # + ): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class CLAPTokenizer(PreTrainedTokenizer): + """ + Construct a CLAP tokenizer. Based on byte-level Byte-Pair-Encoding. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. + errors (`str`, *optional*, defaults to `"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See + [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + unk_token (`str`, *optional*, defaults to `<|endoftext|>`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str`, *optional*, defaults to `<|startoftext|>`): + The beginning of sequence token. + eos_token (`str`, *optional*, defaults to `<|endoftext|>`): + The end of sequence token. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + unk_token="<|endoftext|>", + bos_token="<|startoftext|>", + eos_token="<|endoftext|>", + pad_token="<|endoftext|>", # hack to enable padding + **kwargs + ): + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + + super().__init__( + errors=errors, + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + pad_token=pad_token, + **kwargs, + ) + + try: + import ftfy + + self.fix_text = ftfy.fix_text + except ImportError: + logger.info("ftfy or spacy is not installed using custom BasicTokenizer instead of ftfy.") + self.nlp = BasicTokenizer(do_lower_case=True) + self.fix_text = None + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + bpe_merges = merges_handle.read().strip().split("\n")[1 : 49152 - 256 - 2 + 1] + bpe_merges = [tuple(merge.split()) for merge in bpe_merges] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {"<|startoftext|>": "<|startoftext|>", "<|endoftext|>": "<|endoftext|>"} + 
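+        # The pattern compiled below drives pre-tokenization in `_tokenize`: it captures the two special tokens,
+        # common English contractions ('s, 't, 're, 've, 'm, 'll, 'd), runs of letters, single number characters,
+        # and runs of other non-space symbols; each captured piece is then byte-mapped and merged with `bpe`.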
+ self.pat = re.compile( + r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", + re.IGNORECASE, + ) + + @property + def vocab_size(self): + return len(self.encoder) + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A CLAP sequence has the following format: + + - single sequence: `<|startoftext|> X <|endoftext|>` + + Pairs of sequences are not the expected use case, but they will be handled without a separator. + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + bos_token = [self.bos_token_id] + eos_token = [self.eos_token_id] + + if token_ids_1 is None: + return bos_token + token_ids_0 + eos_token + return bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + [1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed. CLAP does not make use of token type ids, therefore a list of + zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. 
+        """
+        bos_token = [self.bos_token_id]
+        eos_token = [self.eos_token_id]
+
+        if token_ids_1 is None:
+            return len(bos_token + token_ids_0 + eos_token) * [0]
+        return len(bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token) * [0]
+
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token[:-1]) + (token[-1] + "</w>",)
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token + "</w>"
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+                else:
+                    new_word.extend(word[i:j])
+                    i = j
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = " ".join(word)
+        self.cache[token] = word
+        return word
+
+    def _tokenize(self, text):
+        """Tokenize a string."""
+        bpe_tokens = []
+        if self.fix_text is None:
+            text = " ".join(self.nlp.tokenize(text))
+        else:
+            text = whitespace_clean(self.fix_text(text)).lower()
+
+        for token in re.findall(self.pat, text):
+            token = "".join(
+                self.byte_encoder[b] for b in token.encode("utf-8")
+            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
+            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
+        return bpe_tokens
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.decoder.get(index)
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        text = "".join(tokens)
+        byte_array = bytearray([self.byte_decoder[c] for c in text])
+        text = byte_array.decode("utf-8", errors=self.errors).replace("</w>", " ").strip()
+        return text
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            return
+        vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+        merge_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
+        )
+
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+        index = 0
+        with open(merge_file, "w", encoding="utf-8") as writer:
+            writer.write("#version: 0.2\n")
+            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning(
+                        "Saving vocabulary to {}: BPE merge indices are not consecutive."
+ " Please check that the tokenizer is not corrupted!".format(merge_file) + ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file diff --git a/src/transformers/models/clap/tokenization_clap_fast.py b/src/transformers/models/clap/tokenization_clap_fast.py new file mode 100644 index 000000000000..8a50fe86ffde --- /dev/null +++ b/src/transformers/models/clap/tokenization_clap_fast.py @@ -0,0 +1,173 @@ +# coding=utf-8 +# Copyright 2023 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for OpenAI GPT.""" + + +from typing import List, Optional, Tuple + +from tokenizers import pre_tokenizers + +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import logging +from .tokenization_clap import CLAPTokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "laion-ai/base": "https://huggingface.co/laion-ai/base/resolve/main/vocab.json", + }, + "merges_file": { + "laion-ai/base": "https://huggingface.co/laion-ai/base/resolve/main/merges.txt", + }, + "tokenizer_file": { + "laion-ai/base": "https://huggingface.co/laion-ai/base/resolve/main/tokenizer.json", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "laion-ai/base": 77, +} + + +class CLAPTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" CLAP tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level + Byte-Pair-Encoding. + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. + unk_token (`str`, *optional*, defaults to `<|endoftext|>`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str`, *optional*, defaults to `<|startoftext|>`): + The beginning of sequence token. + eos_token (`str`, *optional*, defaults to `<|endoftext|>`): + The end of sequence token. 
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = CLAPTokenizer + + def __init__( + self, + vocab_file=None, + merges_file=None, + tokenizer_file=None, + unk_token="<|endoftext|>", + bos_token="<|startoftext|>", + eos_token="<|endoftext|>", + pad_token="<|endoftext|>", # hack to enable padding + **kwargs + ): + super().__init__( + vocab_file, + merges_file, + tokenizer_file=tokenizer_file, + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + pad_token=pad_token, + **kwargs, + ) + + if not isinstance(self.backend_tokenizer.pre_tokenizer, pre_tokenizers.Sequence): + raise ValueError( + "The `backend_tokenizer` provided does not match the expected format. The CLAP tokenizer has been" + " heavily modified from transformers version 4.17.0. You need to convert the tokenizer you are using" + " to be compatible with this version.The easiest way to do so is" + ' `CLAPTokenizerFast.from_pretrained("path_to_local_folder_or_hub_repo, from_slow=True)`. If you want' + " to use your existing tokenizer, you will have to revert to a version prior to 4.17.0 of" + " transformers." + ) + + self._wrap_decode_method_backend_tokenizer() + + # Very ugly hack to enable padding to have a correct decoding see https://github.com/huggingface/tokenizers/issues/872 + def _wrap_decode_method_backend_tokenizer(self): + orig_decode_method = self.backend_tokenizer.decode + + def new_decode_method(*args, **kwargs): + text = orig_decode_method(*args, **kwargs) + text = text.replace(self.backend_tokenizer.model.end_of_word_suffix, " ").strip() + return text + + self.backend_tokenizer.decode = new_decode_method + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A CLAP sequence has the following format: + + - single sequence: `<|startoftext|> X <|endoftext|>` + + Pairs of sequences are not the expected use case, but they will be handled without a separator. + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + bos_token = [self.bos_token_id] + eos_token = [self.eos_token_id] + + if token_ids_1 is None: + return bos_token + token_ids_0 + eos_token + return bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed. CLAP does not make use of token type ids, therefore a list of + zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. 
+ """ + bos_token = [self.bos_token_id] + eos_token = [self.eos_token_id] + + if token_ids_1 is None: + return len(bos_token + token_ids_0 + eos_token) * [0] + return len(bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token) * [0] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index c5c14b1a5fa0..93ccc0ca451c 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1434,6 +1434,51 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +CLAP_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class CLAPModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class CLAPPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class CLAPTextModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class CLAPTextModelWithProjection(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class CLAPVisionModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class CLAPVisionModelWithProjection(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + CLIP_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/src/transformers/utils/dummy_tokenizers_objects.py b/src/transformers/utils/dummy_tokenizers_objects.py index 8a24d9bea6b2..218d6be718d3 100644 --- a/src/transformers/utils/dummy_tokenizers_objects.py +++ b/src/transformers/utils/dummy_tokenizers_objects.py @@ -66,6 +66,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["tokenizers"]) +class CLAPTokenizerFast(metaclass=DummyObject): + _backends = ["tokenizers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tokenizers"]) + + class CLIPTokenizerFast(metaclass=DummyObject): _backends = ["tokenizers"] diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 6982f69b1468..55969e00e901 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -66,6 +66,20 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class CLAPFeatureExtractor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + +class CLAPImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class CLIPFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/clap/__init__.py b/tests/models/clap/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/clap/test_image_processing_clap.py b/tests/models/clap/test_image_processing_clap.py new file mode 100644 index 000000000000..93a43e5096e2 --- /dev/null +++ b/tests/models/clap/test_image_processing_clap.py @@ -0,0 +1,305 @@ +# 
coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +import numpy as np + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingSavingTestMixin + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import CLAPImageProcessor + + +class CLAPImageProcessingTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_center_crop=True, + crop_size=None, + do_normalize=True, + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], + do_convert_rgb=True, + ): + size = size if size is not None else {"shortest_edge": 20} + crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_convert_rgb = do_convert_rgb + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_center_crop": self.do_center_crop, + "crop_size": self.crop_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_convert_rgb": self.do_convert_rgb, + } + + def prepare_inputs(self, equal_resolution=False, numpify=False, torchify=False): + """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, + or a list of PyTorch tensors if one specifies torchify=True. 
+ """ + + assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time" + + if equal_resolution: + image_inputs = [] + for i in range(self.batch_size): + image_inputs.append( + np.random.randint( + 255, size=(self.num_channels, self.max_resolution, self.max_resolution), dtype=np.uint8 + ) + ) + else: + image_inputs = [] + for i in range(self.batch_size): + width, height = np.random.choice(np.arange(self.min_resolution, self.max_resolution), 2) + image_inputs.append(np.random.randint(255, size=(self.num_channels, width, height), dtype=np.uint8)) + + if not numpify and not torchify: + # PIL expects the channel dimension as last dimension + image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] + + if torchify: + image_inputs = [torch.from_numpy(x) for x in image_inputs] + + return image_inputs + + +@require_torch +@require_vision +class CLAPImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): + + image_processing_class = CLAPImageProcessor if is_vision_available() else None + + def setUp(self): + self.image_processor_tester = CLAPImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_center_crop")) + self.assertTrue(hasattr(image_processing, "center_crop")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_convert_rgb")) + + def test_image_processor_from_dict_with_kwargs(self): + image_processor = self.image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"shortest_edge": 20}) + self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) + + image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84) + self.assertEqual(image_processor.size, {"shortest_edge": 42}) + self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) + + def test_batch_feature(self): + pass + + def test_call_pil(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PIL images + image_inputs = self.image_processor_tester.prepare_inputs(equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.image_processor_tester.num_channels, + self.image_processor_tester.crop_size["height"], + self.image_processor_tester.crop_size["width"], + ), + ) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.image_processor_tester.batch_size, + self.image_processor_tester.num_channels, + self.image_processor_tester.crop_size["height"], + self.image_processor_tester.crop_size["width"], + ), + ) + + def test_call_numpy(self): + # Initialize image_processing + image_processing = 
self.image_processing_class(**self.image_processor_dict) + # create random numpy tensors + image_inputs = self.image_processor_tester.prepare_inputs(equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.image_processor_tester.num_channels, + self.image_processor_tester.crop_size["height"], + self.image_processor_tester.crop_size["width"], + ), + ) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.image_processor_tester.batch_size, + self.image_processor_tester.num_channels, + self.image_processor_tester.crop_size["height"], + self.image_processor_tester.crop_size["width"], + ), + ) + + def test_call_pytorch(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + image_inputs = self.image_processor_tester.prepare_inputs(equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.image_processor_tester.num_channels, + self.image_processor_tester.crop_size["height"], + self.image_processor_tester.crop_size["width"], + ), + ) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.image_processor_tester.batch_size, + self.image_processor_tester.num_channels, + self.image_processor_tester.crop_size["height"], + self.image_processor_tester.crop_size["width"], + ), + ) + + +@require_torch +@require_vision +class CLAPImageProcessingTestFourChannels(ImageProcessingSavingTestMixin, unittest.TestCase): + + image_processing_class = CLAPImageProcessor if is_vision_available() else None + + def setUp(self): + self.image_processor_tester = CLAPImageProcessingTester(self, num_channels=4) + self.expected_encoded_image_num_channels = 3 + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_center_crop")) + self.assertTrue(hasattr(image_processing, "center_crop")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_convert_rgb")) + + def test_batch_feature(self): + pass + + def test_call_pil_four_channels(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PIL images + image_inputs = self.image_processor_tester.prepare_inputs(equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + 
encoded_images.shape, + ( + 1, + self.expected_encoded_image_num_channels, + self.image_processor_tester.crop_size["height"], + self.image_processor_tester.crop_size["width"], + ), + ) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.image_processor_tester.batch_size, + self.expected_encoded_image_num_channels, + self.image_processor_tester.crop_size["height"], + self.image_processor_tester.crop_size["width"], + ), + ) diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py new file mode 100644 index 000000000000..d7f8fb0858a5 --- /dev/null +++ b/tests/models/clap/test_modeling_clap.py @@ -0,0 +1,737 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch CLAP model. """ + + +import inspect +import os +import tempfile +import unittest + +import numpy as np + +import requests +import transformers +from transformers import CLAPConfig, CLAPTextConfig, CLAPVisionConfig +from transformers.testing_utils import ( + is_flax_available, + is_pt_flax_cross_test, + require_torch, + require_vision, + slow, + torch_device, +) +from transformers.utils import is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ( + ModelTesterMixin, + _config_zero_init, + floats_tensor, + ids_tensor, + random_attention_mask, +) + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import ( + CLAPModel, + CLAPTextModel, + CLAPTextModelWithProjection, + CLAPVisionModel, + CLAPVisionModelWithProjection, + ) + from transformers.models.clap.modeling_clap import CLAP_PRETRAINED_MODEL_ARCHIVE_LIST + + +if is_vision_available(): + from PIL import Image + + from transformers import CLAPProcessor + + +if is_flax_available(): + import jax.numpy as jnp + from transformers.modeling_flax_pytorch_utils import ( + convert_pytorch_state_dict_to_flax, + load_flax_weights_in_pytorch_model, + ) + + +class CLAPVisionModelTester: + def __init__( + self, + parent, + batch_size=12, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + hidden_size=32, + projection_dim=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.hidden_size = hidden_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.scope = scope + + # in ViT, 
the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + config = self.get_config() + + return config, pixel_values + + def get_config(self): + return CLAPVisionConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values): + model = CLAPVisionModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = (self.image_size, self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_model_with_projection(self, config, pixel_values): + model = CLAPVisionModelWithProjection(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = (self.image_size, self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + self.parent.assertEqual(result.image_embeds.shape, (self.batch_size, self.projection_dim)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class CLAPVisionModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as CLAP does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ + + all_model_classes = (CLAPVisionModel, CLAPVisionModelWithProjection) if is_torch_available() else () + fx_compatible = False + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = CLAPVisionModelTester(self) + self.config_tester = ConfigTester(self, config_class=CLAPVisionConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="CLAP does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_with_projection(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_projection(*config_and_inputs) + + def test_training(self): + pass + + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip(reason="CLAPVisionModel has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_from_base(self): + pass + + @unittest.skip(reason="CLAPVisionModel has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_to_base(self): + pass + + @slow + def test_model_from_pretrained(self): + for model_name in CLAP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = CLAPVisionModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @slow + def test_model_with_projection_from_pretrained(self): + for model_name in CLAP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = CLAPVisionModelWithProjection.from_pretrained(model_name) + self.assertIsNotNone(model) + self.assertTrue(hasattr(model, "visual_projection")) + + +class CLAPTextModelTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + projection_dim=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + max_position_embeddings=512, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.max_position_embeddings = 
max_position_embeddings + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + if input_mask is not None: + batch_size, seq_length = input_mask.shape + rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) + for batch_idx, start_index in enumerate(rnd_start_indices): + input_mask[batch_idx, :start_index] = 1 + input_mask[batch_idx, start_index:] = 0 + + config = self.get_config() + + return config, input_ids, input_mask + + def get_config(self): + return CLAPTextConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, input_ids, input_mask): + model = CLAPTextModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_model_with_projection(self, config, input_ids, input_mask): + model = CLAPTextModelWithProjection(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.text_embeds.shape, (self.batch_size, self.projection_dim)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, input_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class CLAPTextModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = (CLAPTextModel, CLAPTextModelWithProjection) if is_torch_available() else () + fx_compatible = False + test_pruning = False + test_head_masking = False + + def setUp(self): + self.model_tester = CLAPTextModelTester(self) + self.config_tester = ConfigTester(self, config_class=CLAPTextConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_with_projection(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_projection(*config_and_inputs) + + def test_training(self): + pass + + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip(reason="CLAP does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="CLAPTextModel has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_from_base(self): + pass + + 
@unittest.skip(reason="CLAPTextModel has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_to_base(self): + pass + + @slow + def test_model_from_pretrained(self): + for model_name in CLAP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = CLAPTextModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @slow + def test_model_with_projection_from_pretrained(self): + for model_name in CLAP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = CLAPTextModelWithProjection.from_pretrained(model_name) + self.assertIsNotNone(model) + self.assertTrue(hasattr(model, "text_projection")) + + +class CLAPModelTester: + def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): + + if text_kwargs is None: + text_kwargs = {} + if vision_kwargs is None: + vision_kwargs = {} + + self.parent = parent + self.text_model_tester = CLAPTextModelTester(parent, **text_kwargs) + self.vision_model_tester = CLAPVisionModelTester(parent, **vision_kwargs) + self.is_training = is_training + + def prepare_config_and_inputs(self): + text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + + config = self.get_config() + + return config, input_ids, attention_mask, pixel_values + + def get_config(self): + return CLAPConfig.from_text_vision_configs( + self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 + ) + + def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): + model = CLAPModel(config).to(torch_device).eval() + with torch.no_grad(): + result = model(input_ids, pixel_values, attention_mask) + self.parent.assertEqual( + result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) + ) + self.parent.assertEqual( + result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, pixel_values = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "return_loss": True, + } + return config, inputs_dict + + +@require_torch +class CLAPModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (CLAPModel,) if is_torch_available() else () + fx_compatible = False + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + test_attention_outputs = False + + def setUp(self): + self.model_tester = CLAPModelTester(self) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip(reason="Hidden_states is tested in individual model tests") + def test_hidden_states_output(self): + pass + + @unittest.skip(reason="Inputs_embeds is tested in individual model tests") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Retain_grad is tested in individual model tests") + def test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip(reason="CLAPModel does not have input/output embeddings") + def test_model_common_attributes(self): + pass + + # override as the `logit_scale` parameter initilization is different for CLAP + def test_initialization(self): + config, inputs_dict = 
self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + # check if `logit_scale` is initilized as per the original implementation + if name == "logit_scale": + self.assertAlmostEqual( + param.data.item(), + np.log(1 / 0.07), + delta=1e-3, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def _create_and_check_torchscript(self, config, inputs_dict): + if not self.test_torchscript: + return + + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + configs_no_init.torchscript = True + configs_no_init.return_dict = False + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + + try: + input_ids = inputs_dict["input_ids"] + pixel_values = inputs_dict["pixel_values"] # CLAP needs pixel_values + traced_model = torch.jit.trace(model, (input_ids, pixel_values)) + except RuntimeError: + self.fail("Couldn't trace module.") + + with tempfile.TemporaryDirectory() as tmp_dir_name: + pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") + + try: + torch.jit.save(traced_model, pt_file_name) + except Exception: + self.fail("Couldn't save module.") + + try: + loaded_model = torch.jit.load(pt_file_name) + except Exception: + self.fail("Couldn't load module.") + + model.to(torch_device) + model.eval() + + loaded_model.to(torch_device) + loaded_model.eval() + + model_state_dict = model.state_dict() + loaded_model_state_dict = loaded_model.state_dict() + + self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) + + models_equal = True + for layer_name, p1 in model_state_dict.items(): + p2 = loaded_model_state_dict[layer_name] + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + def test_load_vision_text_config(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # Save CLAPConfig and check if we can load CLAPVisionConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + vision_config = CLAPVisionConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) + + # Save CLAPConfig and check if we can load CLAPTextConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + text_config = CLAPTextConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) + + # overwrite from common since FlaxCLAPModel returns nested output + # which is not supported in the common test + @is_pt_flax_cross_test + def test_equivalence_pt_to_flax(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + + # load PyTorch class + pt_model = model_class(config).eval() + # Flax models don't use the `use_cache` option and cache is not returned as a default. + # So we disable `use_cache` here for PyTorch model. 
+ pt_model.config.use_cache = False + + fx_model_class_name = "Flax" + model_class.__name__ + + if not hasattr(transformers, fx_model_class_name): + return + + fx_model_class = getattr(transformers, fx_model_class_name) + + # load Flax class + fx_model = fx_model_class(config, dtype=jnp.float32) + # make sure only flax inputs are forward that actually exist in function args + fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys() + + # prepare inputs + pt_inputs = self._prepare_for_class(inputs_dict, model_class) + + # remove function args that don't exist in Flax + pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys} + + fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model) + fx_model.params = fx_state + + with torch.no_grad(): + pt_outputs = pt_model(**pt_inputs).to_tuple() + + # convert inputs to Flax + fx_inputs = {k: np.array(v) for k, v in pt_inputs.items() if torch.is_tensor(v)} + fx_outputs = fx_model(**fx_inputs).to_tuple() + self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch") + for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs[:4]): + self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) + + with tempfile.TemporaryDirectory() as tmpdirname: + pt_model.save_pretrained(tmpdirname) + fx_model_loaded = fx_model_class.from_pretrained(tmpdirname, from_pt=True) + + fx_outputs_loaded = fx_model_loaded(**fx_inputs).to_tuple() + self.assertEqual( + len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch" + ) + for fx_output_loaded, pt_output in zip(fx_outputs_loaded[:4], pt_outputs[:4]): + self.assert_almost_equals(fx_output_loaded, pt_output.numpy(), 4e-2) + + # overwrite from common since FlaxCLAPModel returns nested output + # which is not supported in the common test + @is_pt_flax_cross_test + def test_equivalence_flax_to_pt(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + # load corresponding PyTorch class + pt_model = model_class(config).eval() + + # So we disable `use_cache` here for PyTorch model. 
+ pt_model.config.use_cache = False + + fx_model_class_name = "Flax" + model_class.__name__ + + if not hasattr(transformers, fx_model_class_name): + # no flax model exists for this class + return + + fx_model_class = getattr(transformers, fx_model_class_name) + + # load Flax class + fx_model = fx_model_class(config, dtype=jnp.float32) + # make sure only flax inputs are forward that actually exist in function args + fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys() + + pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params) + + # make sure weights are tied in PyTorch + pt_model.tie_weights() + + # prepare inputs + pt_inputs = self._prepare_for_class(inputs_dict, model_class) + + # remove function args that don't exist in Flax + pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys} + + with torch.no_grad(): + pt_outputs = pt_model(**pt_inputs).to_tuple() + + fx_inputs = {k: np.array(v) for k, v in pt_inputs.items() if torch.is_tensor(v)} + + fx_outputs = fx_model(**fx_inputs).to_tuple() + self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch") + + for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs[:4]): + self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) + + with tempfile.TemporaryDirectory() as tmpdirname: + fx_model.save_pretrained(tmpdirname) + pt_model_loaded = model_class.from_pretrained(tmpdirname, from_flax=True) + + with torch.no_grad(): + pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple() + + self.assertEqual( + len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch" + ) + for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs_loaded[:4]): + self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) + + @slow + def test_model_from_pretrained(self): + for model_name in CLAP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = CLAPModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@require_vision +@require_torch +class CLAPModelIntegrationTest(unittest.TestCase): + @slow + def test_inference(self): + model_name = "laion-ai/base" + model = CLAPModel.from_pretrained(model_name).to(torch_device) + processor = CLAPProcessor.from_pretrained(model_name) + + image = prepare_img() + inputs = processor( + text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt" + ).to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + self.assertEqual( + outputs.logits_per_image.shape, + torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])), + ) + self.assertEqual( + outputs.logits_per_text.shape, + torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])), + ) + + expected_logits = torch.tensor([[24.5701, 19.3049]], device=torch_device) + + self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)) diff --git a/tests/models/clap/test_processor_clap.py b/tests/models/clap/test_processor_clap.py new file mode 100644 index 000000000000..2aa90ea58175 --- /dev/null +++ b/tests/models/clap/test_processor_clap.py @@ -0,0 +1,202 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import shutil
+import tempfile
+import unittest
+
+import numpy as np
+import pytest
+
+from transformers import CLAPTokenizer, CLAPTokenizerFast
+from transformers.models.clap.tokenization_clap import VOCAB_FILES_NAMES
+from transformers.testing_utils import require_vision
+from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
+
+
+if is_vision_available():
+    from PIL import Image
+
+    from transformers import CLAPImageProcessor, CLAPProcessor
+
+
+@require_vision
+class CLAPProcessorTest(unittest.TestCase):
+    def setUp(self):
+        self.tmpdirname = tempfile.mkdtemp()
+
+        # fmt: off
+        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l</w>", "w</w>", "r</w>", "t</w>", "low</w>", "er</w>", "lowest</w>", "newer</w>", "wider", "<unk>", "<|startoftext|>", "<|endoftext|>"]
+        # fmt: on
+        vocab_tokens = dict(zip(vocab, range(len(vocab))))
+        merges = ["#version: 0.2", "l o", "lo w</w>", "e r</w>", ""]
+        self.special_tokens_map = {"unk_token": "<unk>"}
+
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+            fp.write(json.dumps(vocab_tokens) + "\n")
+        with open(self.merges_file, "w", encoding="utf-8") as fp:
+            fp.write("\n".join(merges))
+
+        image_processor_map = {
+            "do_resize": True,
+            "size": 20,
+            "do_center_crop": True,
+            "crop_size": 18,
+            "do_normalize": True,
+            "image_mean": [0.48145466, 0.4578275, 0.40821073],
+            "image_std": [0.26862954, 0.26130258, 0.27577711],
+        }
+        self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME)
+        with open(self.image_processor_file, "w", encoding="utf-8") as fp:
+            json.dump(image_processor_map, fp)
+
+    def get_tokenizer(self, **kwargs):
+        return CLAPTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+
+    def get_rust_tokenizer(self, **kwargs):
+        return CLAPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
+
+    def get_image_processor(self, **kwargs):
+        return CLAPImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
+
+    def tearDown(self):
+        shutil.rmtree(self.tmpdirname)
+
+    def prepare_image_inputs(self):
+        """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
+        or a list of PyTorch tensors if one specifies torchify=True.
+ """ + + image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] + + image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] + + return image_inputs + + def test_save_load_pretrained_default(self): + tokenizer_slow = self.get_tokenizer() + tokenizer_fast = self.get_rust_tokenizer() + image_processor = self.get_image_processor() + + processor_slow = CLAPProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) + processor_slow.save_pretrained(self.tmpdirname) + processor_slow = CLAPProcessor.from_pretrained(self.tmpdirname, use_fast=False) + + processor_fast = CLAPProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) + processor_fast.save_pretrained(self.tmpdirname) + processor_fast = CLAPProcessor.from_pretrained(self.tmpdirname) + + self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab()) + self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab()) + self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab()) + self.assertIsInstance(processor_slow.tokenizer, CLAPTokenizer) + self.assertIsInstance(processor_fast.tokenizer, CLAPTokenizerFast) + + self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertIsInstance(processor_slow.image_processor, CLAPImageProcessor) + self.assertIsInstance(processor_fast.image_processor, CLAPImageProcessor) + + def test_save_load_pretrained_additional_features(self): + processor = CLAPProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) + processor.save_pretrained(self.tmpdirname) + + tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") + image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) + + processor = CLAPProcessor.from_pretrained( + self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, CLAPTokenizerFast) + + self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.image_processor, CLAPImageProcessor) + + def test_image_processor(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = CLAPProcessor(tokenizer=tokenizer, image_processor=image_processor) + + image_input = self.prepare_image_inputs() + + input_image_proc = image_processor(image_input, return_tensors="np") + input_processor = processor(images=image_input, return_tensors="np") + + for key in input_image_proc.keys(): + self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2) + + def test_tokenizer(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = CLAPProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + + encoded_processor = processor(text=input_str) + + encoded_tok = tokenizer(input_str) + + for key in encoded_tok.keys(): + self.assertListEqual(encoded_tok[key], encoded_processor[key]) + + def test_processor(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = CLAPProcessor(tokenizer=tokenizer, image_processor=image_processor) + 
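+        # The processor should merge tokenizer and image processor outputs into a single
+        # BatchEncoding exposing input_ids, attention_mask and pixel_values.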
+ input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask", "pixel_values"]) + + # test if it raises when no input is passed + with pytest.raises(ValueError): + processor() + + def test_tokenizer_decode(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = CLAPProcessor(tokenizer=tokenizer, image_processor=image_processor) + + predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] + + decoded_processor = processor.batch_decode(predicted_ids) + decoded_tok = tokenizer.batch_decode(predicted_ids) + + self.assertListEqual(decoded_tok, decoded_processor) + + def test_model_input_names(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = CLAPProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertListEqual(list(inputs.keys()), processor.model_input_names) diff --git a/tests/models/clap/test_tokenization_clap.py b/tests/models/clap/test_tokenization_clap.py new file mode 100644 index 000000000000..640b74b329dc --- /dev/null +++ b/tests/models/clap/test_tokenization_clap.py @@ -0,0 +1,186 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import json +import os +import unittest + +from transformers import CLAPTokenizer, CLAPTokenizerFast +from transformers.models.clap.tokenization_clap import VOCAB_FILES_NAMES +from transformers.testing_utils import require_ftfy, require_tokenizers + +from ...test_tokenization_common import TokenizerTesterMixin + + +@require_tokenizers +class CLAPTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = CLAPTokenizer + rust_tokenizer_class = CLAPTokenizerFast + test_rust_tokenizer = True + from_pretrained_kwargs = {} + test_seq2seq = False + + def setUp(self): + super().setUp() + + # fmt: off + vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l", "w", "r", "t", "low", "er", "lowest", "newer", "wider", "", "<|startoftext|>", "<|endoftext|>"] + # fmt: on + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "l o", "lo w", "e r"] + self.special_tokens_map = {"unk_token": ""} + + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(self.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + def get_tokenizer(self, **kwargs): + kwargs.update(self.special_tokens_map) + return CLAPTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_rust_tokenizer(self, **kwargs): + kwargs.update(self.special_tokens_map) + return CLAPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self, tokenizer): + input_text = "lower newer" + output_text = "lower newer" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = CLAPTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) + text = "lower newer" + bpe_tokens = ["lo", "w", "er", "n", "e", "w", "er"] + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [tokenizer.unk_token] + input_bpe_tokens = [10, 2, 16, 9, 3, 2, 16, 20] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + @require_ftfy + def test_check_encoding_slow_fast(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_s = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat" + text_tokenized_s = tokenizer_s.tokenize(text) + text_tokenized_r = tokenizer_r.tokenize(text) + + self.assertListEqual(text_tokenized_s, text_tokenized_r) + + # Test that the tokenization is identical on an example containing a character (Latin Small Letter A + # with Tilde) encoded in 2 different ways + text = "xa\u0303y" + " " + "x\xe3y" + text_tokenized_s = tokenizer_s.tokenize(text) + text_tokenized_r = tokenizer_r.tokenize(text) + + self.assertListEqual(text_tokenized_s, text_tokenized_r) + + # Test that the tokenization is identical on unicode of space type + spaces_unicodes = [ + "\u0009", # (horizontal tab, '\t') + "\u000B", # (vertical tab) + "\u000C", # (form feed) + "\u0020", # (space, ' ') + "\u200E", # (left-to-right mark):w + "\u200F", # (right-to-left mark) + ] + for unicode_seq in spaces_unicodes: + text_tokenized_s = tokenizer_s.tokenize(unicode_seq) + text_tokenized_r = 
tokenizer_r.tokenize(unicode_seq) + + self.assertListEqual(text_tokenized_s, text_tokenized_r) + + # Test that the tokenization is identical on unicode of line break type + line_break_unicodes = [ + "\u000A", # (line feed, '\n') + "\r\n", # (carriage return and line feed, '\r\n') + "\u000D", # (carriage return, '\r') + "\r", # (carriage return, '\r') + "\u000D", # (carriage return, '\r') + "\u2028", # (line separator) + "\u2029", # (paragraph separator) + # "\u0085", # (next line) + ] + + # The tokenization is not identical for the character "\u0085" (next line). The slow version transforms + # it into the Horizontal Ellipsis character "…" ("\u2026") while the fast version transforms it into a + # space (and thus into an empty list). + + for unicode_seq in line_break_unicodes: + text_tokenized_s = tokenizer_s.tokenize(unicode_seq) + text_tokenized_r = tokenizer_r.tokenize(unicode_seq) + + self.assertListEqual(text_tokenized_s, text_tokenized_r) + + def test_offsets_mapping_with_different_add_prefix_space_argument(self): + # Test which aims to verify that the offsets are well adapted to the argument `add_prefix_space` + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + text_of_1_token = "hello" # `hello` is a token in the vocabulary of `pretrained_name` + text = f"{text_of_1_token} {text_of_1_token}" + + tokenizer_r = self.rust_tokenizer_class.from_pretrained( + pretrained_name, + use_fast=True, + ) + encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) + self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token))) + self.assertEqual( + encoding.offset_mapping[1], + (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)), + ) + + text = f" {text}" + + tokenizer_r = self.rust_tokenizer_class.from_pretrained( + pretrained_name, + use_fast=True, + ) + encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) + self.assertEqual(encoding.offset_mapping[0], (1, 1 + len(text_of_1_token))) + self.assertEqual( + encoding.offset_mapping[1], + (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)), + ) + + def test_log_warning(self): + # Test related to the breaking change introduced in transformers v4.17.0 + # We need to check that an error in raised when the user try to load a previous version of the tokenizer. + with self.assertRaises(ValueError) as context: + self.rust_tokenizer_class.from_pretrained("robot-test/old-clap-tokenizer") + + self.assertTrue( + context.exception.args[0].startswith( + "The `backend_tokenizer` provided does not match the expected format." 
+ ) + ) + + @require_ftfy + def test_tokenization_python_rust_equals(self): + super().test_tokenization_python_rust_equals() + + # overwrite common test + def test_added_tokens_do_lower_case(self): + # CLAP always lower cases letters + pass From 7547c82c2f0768cc7e7034e445ce05dc743e6c0c Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 30 Jan 2023 14:50:10 +0000 Subject: [PATCH 002/197] update --- README.md | 1 + README_es.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/index.mdx | 1 + docs/source/en/serialization.mdx | 2 +- .../models/clap/feature_extraction_clap.py | 308 +++++++++++++++- .../models/clap/image_processing_clap.py | 338 ------------------ .../models/clap/processing_clap.py | 70 ++-- 12 files changed, 342 insertions(+), 384 deletions(-) delete mode 100644 src/transformers/models/clap/image_processing_clap.py diff --git a/README.md b/README.md index 97864251c155..51dff8b53687 100644 --- a/README.md +++ b/README.md @@ -295,6 +295,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . +1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. diff --git a/README_es.md b/README_es.md index 5d9690fc856d..b83403ce394e 100644 --- a/README_es.md +++ b/README_es.md @@ -287,6 +287,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. 
**[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. +1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. diff --git a/README_hd.md b/README_hd.md index d37eab25253e..7d2313622bb8 100644 --- a/README_hd.md +++ b/README_hd.md @@ -259,6 +259,7 @@ conda install -c huggingface transformers 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (इनरिया/फेसबुक/सोरबोन से) साथ में कागज [CamemBERT: एक टेस्टी फ्रेंच लैंग्वेज मॉडल](https:// arxiv.org/abs/1911.03894) लुई मार्टिन*, बेंजामिन मुलर*, पेड्रो जेवियर ऑर्टिज़ सुआरेज़*, योआन ड्यूपॉन्ट, लॉरेंट रोमरी, एरिक विलेमोन्टे डे ला क्लर्जरी, जैमे सेडाह और बेनोइट सगोट द्वारा। 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google रिसर्च से) साथ में दिया गया पेपर [कैनाइन: प्री-ट्रेनिंग ए एफिशिएंट टोकनाइजेशन-फ्री एनकोडर फॉर लैंग्वेज रिप्रेजेंटेशन]( https://arxiv.org/abs/2103.06874) जोनाथन एच क्लार्क, डैन गैरेट, यूलिया टर्क, जॉन विएटिंग द्वारा। 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. +1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI से) साथ वाला पेपर [लर्निंग ट्रांसफरेबल विजुअल मॉडल फ्रॉम नेचुरल लैंग्वेज सुपरविजन](https://arxiv.org /abs/2103.00020) एलेक रैडफोर्ड, जोंग वूक किम, क्रिस हैलासी, आदित्य रमेश, गेब्रियल गोह, संध्या अग्रवाल, गिरीश शास्त्री, अमांडा एस्केल, पामेला मिश्किन, जैक क्लार्क, ग्रेचेन क्रुएगर, इल्या सुत्स्केवर द्वारा। 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 
diff --git a/README_ja.md b/README_ja.md index 5008afbb843d..f53d3b2447c6 100644 --- a/README_ja.md +++ b/README_ja.md @@ -321,6 +321,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne から) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot から公開された研究論文: [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research から) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting から公開された研究論文: [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys から) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou から公開された研究論文: [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) +1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI から) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever から公開された研究論文: [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen から) Timo Lüddecke and Alexander Ecker から公開された研究論文: [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) diff --git a/README_ko.md b/README_ko.md index ffeef5670a17..40e7f121652f 100644 --- a/README_ko.md +++ b/README_ko.md @@ -236,6 +236,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne 에서) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 의 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 논문과 함께 발표했습니다. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research 에서) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 의 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 논문과 함께 발표했습니다. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys 에서) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 의 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 논문과 함께 발표했습니다. +1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. 
**[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI 에서) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 의 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 논문과 함께 발표했습니다. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen 에서) Timo Lüddecke and Alexander Ecker 의 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index 41fa785a1807..6c75a0f8b20e 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -260,6 +260,7 @@ conda install -c huggingface transformers 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (来自 Inria/Facebook/Sorbonne) 伴随论文 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 由 Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 发布。 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (来自 OFA-Sys) 伴随论文 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 由 An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 发布。 +1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (来自 University of Göttingen) 伴随论文 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 由 Timo Lüddecke and Alexander Ecker 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index a21d0db8cddd..225b51fcdd69 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -272,6 +272,7 @@ conda install -c huggingface transformers 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. 
**[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. +1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 576157c1c42f..f36534d13829 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -74,6 +74,7 @@ The documentation is organized into five sections: 1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. 1. **[CLAP](model_doc/clap)** (from ) released with the paper []() by . +1. **[clap](model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 
diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx
index 85d3f6bafa21..2790a87ea6e0 100644
--- a/docs/source/en/serialization.mdx
+++ b/docs/source/en/serialization.mdx
@@ -64,7 +64,7 @@ Ready-made configurations include the following architectures:
 - BLOOM
 - CamemBERT
 - Chinese-CLIP
-- CLAP
+- clap
 - CLIP
 - CodeGen
 - Conditional DETR
diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py
index 2e5ca3be88ff..cf31ef053127 100644
--- a/src/transformers/models/clap/feature_extraction_clap.py
+++ b/src/transformers/models/clap/feature_extraction_clap.py
@@ -14,20 +14,308 @@
 # limitations under the License.
 """Feature extractor class for CLAP."""
 
-import warnings
+from typing import List, Optional, Union
 
-from ...utils import logging
-from .image_processing_clap import CLAPImageProcessor
+import numpy as np
+from numpy.fft import fft
+
+from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
+from ...feature_extraction_utils import BatchFeature
+from ...utils import TensorType, logging
 
 
 logger = logging.get_logger(__name__)
 
 
+# Copied from transformers.models.whisper.feature_extraction_whisper.WhisperFeatureExtractor with Whisper->CLAP
+class CLAPFeatureExtractor(SequenceFeatureExtractor):
+    r"""
+    Constructs a CLAP feature extractor.
+
+    This feature extractor inherits from [`SequenceFeatureExtractor`] which contains most of the main methods. Users
+    should refer to this superclass for more information regarding those methods.
+
+    This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time
+    Fourier Transform`, which should match PyTorch's `torch.stft`.
+
+    Args:
+        feature_size (`int`, defaults to 80):
+            The feature dimension of the extracted features.
+        sampling_rate (`int`, defaults to 16000):
+            The sampling rate at which the audio files should be digitized, expressed in hertz (Hz).
+        hop_length (`int`, defaults to 160):
+            Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
+        chunk_length (`int`, defaults to 30):
+            The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
+            sequences.
+        n_fft (`int`, defaults to 400):
+            Size of the Fourier transform.
+        padding_value (`float`, *optional*, defaults to 0.0):
+            Padding value used to pad the audio. Should correspond to silences.
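+
+    Example (a minimal illustrative sketch, assuming the Whisper-derived defaults listed above; the exact output
+    shape follows from those defaults):
+
+    ```python
+    >>> import numpy as np
+    >>> from transformers import CLAPFeatureExtractor
+
+    >>> feature_extractor = CLAPFeatureExtractor()
+    >>> audio = np.zeros(16000, dtype=np.float32)  # 1 second of silence at the default 16 kHz
+    >>> inputs = feature_extractor(audio, sampling_rate=16000, return_tensors="np")
+    >>> # the audio is padded to `chunk_length` seconds, then converted to a log-mel spectrogram of shape
+    >>> # (batch_size, feature_size, chunk_length * sampling_rate // hop_length)
+    >>> inputs["input_features"].shape
+    (1, 80, 3000)
+    ```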
+ """ + + model_input_names = ["input_features"] + + def __init__( + self, + feature_size=80, + sampling_rate=16000, + hop_length=160, + chunk_length=30, + n_fft=400, + padding_value=0.0, + return_attention_mask=False, # pad inputs to max length with silence token (zero) and no attention mask + **kwargs + ): + super().__init__( + feature_size=feature_size, + sampling_rate=sampling_rate, + padding_value=padding_value, + return_attention_mask=return_attention_mask, + **kwargs, + ) + self.n_fft = n_fft + self.hop_length = hop_length + self.chunk_length = chunk_length + self.n_samples = chunk_length * sampling_rate + self.nb_max_frames = self.n_samples // hop_length + self.sampling_rate = sampling_rate + self.mel_filters = self.get_mel_filters(sampling_rate, n_fft, n_mels=feature_size) + + def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32): + # Initialize the weights + n_mels = int(n_mels) + weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) + + # Center freqs of each FFT bin + fftfreqs = np.fft.rfftfreq(n=n_fft, d=1.0 / sr) + + # 'Center freqs' of mel bands - uniformly spaced between limits + min_mel = 0.0 + max_mel = 45.245640471924965 + + mels = np.linspace(min_mel, max_mel, n_mels + 2) + + mels = np.asanyarray(mels) + + # Fill in the linear scale + f_min = 0.0 + f_sp = 200.0 / 3 + freqs = f_min + f_sp * mels + + # And now the nonlinear scale + min_log_hz = 1000.0 # beginning of log region (Hz) + min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) + logstep = np.log(6.4) / 27.0 # step size for log region + + # If we have vector data, vectorize + log_t = mels >= min_log_mel + freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel)) + + mel_f = freqs + + fdiff = np.diff(mel_f) + ramps = np.subtract.outer(mel_f, fftfreqs) + + for i in range(n_mels): + # lower and upper slopes for all bins + lower = -ramps[i] / fdiff[i] + upper = ramps[i + 2] / fdiff[i + 1] + + # .. then intersect them with each other and zero + weights[i] = np.maximum(0, np.minimum(lower, upper)) + + # Slaney-style mel is scaled to be approx constant energy per channel + enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels]) + weights *= enorm[:, np.newaxis] + + return weights + + def fram_wave(self, waveform, center=True): + """ + Transform a raw waveform into a list of smaller waveforms. The window length defines how much of the signal is + contain in each frame (smalle waveform), while the hope length defines the step between the beginning of each + new frame. + + Centering is done by reflecting the waveform which is first centered around `frame_idx * hop_length`. 
+ """ + frames = [] + for i in range(0, waveform.shape[0] + 1, self.hop_length): + half_window = (self.n_fft - 1) // 2 + 1 + if center: + start = i - half_window if i > half_window else 0 + end = i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0] + + frame = waveform[start:end] + + if start == 0: + padd_width = (-i + half_window, 0) + frame = np.pad(frame, pad_width=padd_width, mode="reflect") + + elif end == waveform.shape[0]: + padd_width = (0, (i - waveform.shape[0] + half_window)) + frame = np.pad(frame, pad_width=padd_width, mode="reflect") + + else: + frame = waveform[i : i + self.n_fft] + frame_width = frame.shape[0] + if frame_width < waveform.shape[0]: + frame = np.lib.pad( + frame, pad_width=(0, self.n_fft - frame_width), mode="constant", constant_values=0 + ) + + frames.append(frame) + return np.stack(frames, 0) + + def stft(self, frames, window): + """ + Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same + results as `torch.stft`. + """ + frame_size = frames.shape[1] + fft_size = self.n_fft -class CLAPFeatureExtractor(CLAPImageProcessor): - def __init__(self, *args, **kwargs) -> None: - warnings.warn( - "The class CLAPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please" - " use CLAPImageProcessor instead.", - FutureWarning, + if fft_size is None: + fft_size = frame_size + + if fft_size < frame_size: + raise ValueError("FFT size must greater or equal the frame size") + # number of FFT bins to store + num_fft_bins = (fft_size >> 1) + 1 + + data = np.empty((len(frames), num_fft_bins), dtype=np.complex64) + fft_signal = np.zeros(fft_size) + + for f, frame in enumerate(frames): + if window is not None: + np.multiply(frame, window, out=fft_signal[:frame_size]) + else: + fft_signal[:frame_size] = frame + data[f] = fft(fft_signal, axis=0)[:num_fft_bins] + return data.T + + def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray: + """ + Compute the log-Mel spectrogram of the provided audio, gives similar results whisper's original torch + implementation with 1e-5 tolerance. + """ + window = np.hanning(self.n_fft + 1)[:-1] + + frames = self.fram_wave(waveform) + stft = self.stft(frames, window=window) + magnitudes = np.abs(stft[:, :-1]) ** 2 + + filters = self.mel_filters + mel_spec = filters @ magnitudes + + log_spec = np.log10(np.clip(mel_spec, a_min=1e-10, a_max=None)) + log_spec = np.maximum(log_spec, log_spec.max() - 8.0) + log_spec = (log_spec + 4.0) / 4.0 + + return log_spec + + def __call__( + self, + raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]], + truncation: bool = True, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_attention_mask: Optional[bool] = None, + padding: Optional[str] = "max_length", + max_length: Optional[int] = None, + sampling_rate: Optional[int] = None, + **kwargs + ) -> BatchFeature: + """ + Main method to featurize and prepare for the model one or several sequence(s). + + Args: + raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`): + The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float + values, a list of numpy arrays or a list of list of float values. + truncation (`bool`, *optional*, default to `True`): + Activates truncation to cut input sequences longer than *max_length* to *max_length*. 
+ pad_to_multiple_of (`int`, *optional*, defaults to None): + If set will pad the sequence to a multiple of the provided value. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. + return_attention_mask (`bool`, *optional*): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific feature_extractor's default. + + [What are attention masks?](../glossary#attention-mask) + + + + For CLAP models, `attention_mask` should always be passed for batched inference, to avoid subtle bugs. + + + + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. + sampling_rate (`int`, *optional*): + The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass + `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition + pipeline. + padding_value (`float`, defaults to 0.0): + The value that is used to fill the padding values / vectors. + """ + + if sampling_rate is not None: + if sampling_rate != self.sampling_rate: + raise ValueError( + f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a" + f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input" + f" was sampled with {self.sampling_rate} and not {sampling_rate}." + ) + else: + logger.warning( + "It is strongly recommended to pass the `sampling_rate` argument to this function. " + "Failing to do so can result in silent errors that might be hard to debug." 
+ ) + + is_batched = bool( + isinstance(raw_speech, (list, tuple)) + and (isinstance(raw_speech[0], np.ndarray) or isinstance(raw_speech[0], (tuple, list))) + ) + + if is_batched: + raw_speech = [np.asarray([speech], dtype=np.float32).T for speech in raw_speech] + elif not is_batched and not isinstance(raw_speech, np.ndarray): + raw_speech = np.asarray(raw_speech, dtype=np.float32) + elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64): + raw_speech = raw_speech.astype(np.float32) + + # always return batch + if not is_batched: + raw_speech = [np.asarray([raw_speech]).T] + + batched_speech = BatchFeature({"input_features": raw_speech}) + + # convert into correct format for padding + + padded_inputs = self.pad( + batched_speech, + padding=padding, + max_length=max_length if max_length else self.n_samples, + truncation=truncation, + pad_to_multiple_of=pad_to_multiple_of, ) - super().__init__(*args, **kwargs) + # make sure list is in array format + input_features = padded_inputs.get("input_features").transpose(2, 0, 1) + + input_features = [self._np_extract_fbank_features(waveform) for waveform in input_features[0]] + + if isinstance(input_features[0], List): + padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features] + else: + padded_inputs["input_features"] = input_features + + if return_tensors is not None: + padded_inputs = padded_inputs.convert_to_tensors(return_tensors) + + return padded_inputs diff --git a/src/transformers/models/clap/image_processing_clap.py b/src/transformers/models/clap/image_processing_clap.py deleted file mode 100644 index 24444d540d12..000000000000 --- a/src/transformers/models/clap/image_processing_clap.py +++ /dev/null @@ -1,338 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Image processor class for CLAP.""" - -from typing import Dict, List, Optional, Union - -import numpy as np - -from transformers.utils.generic import TensorType - -from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict -from ...image_transforms import ( - center_crop, - convert_to_rgb, - get_resize_output_image_size, - normalize, - rescale, - resize, - to_channel_dimension_format, -) -from ...image_utils import ( - ChannelDimension, - ImageInput, - PILImageResampling, - make_list_of_images, - to_numpy_array, - valid_images, -) -from ...utils import logging -from ...utils.import_utils import is_vision_available - - -logger = logging.get_logger(__name__) - - -if is_vision_available(): - import PIL - - -class CLAPImageProcessor(BaseImageProcessor): - r""" - Constructs a CLAP image processor. - - Args: - do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by - `do_resize` in the `preprocess` method. 
- size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`): - Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with - the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess` - method. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): - Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. - do_center_crop (`bool`, *optional*, defaults to `True`): - Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the - `preprocess` method. - crop_size (`Dict[str, int]` *optional*, defaults to 224): - Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess` - method. - do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in - the `preprocess` method. - rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): - Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` - method. - do_normalize: - Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. - image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`): - Mean to use if normalizing the image. This is a float or list of floats the length of the number of - channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. - image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`): - Image standard deviation. - do_convert_rgb (`bool`, *optional*, defaults to `True`): - Standard deviation to use if normalizing the image. This is a float or list of floats the length of the - number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. 
- """ - - model_input_names = ["pixel_values"] - - def __init__( - self, - do_resize: bool = True, - size: Dict[str, int] = None, - resample: PILImageResampling = PILImageResampling.BICUBIC, - do_center_crop: bool = True, - crop_size: Dict[str, int] = None, - do_rescale: bool = True, - rescale_factor: Union[int, float] = 1 / 255, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: bool = True, - **kwargs - ) -> None: - super().__init__(**kwargs) - size = size if size is not None else {"shortest_edge": 224} - size = get_size_dict(size, default_to_square=False) - crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} - crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") - - self.do_resize = do_resize - self.size = size - self.resample = resample - self.do_center_crop = do_center_crop - self.crop_size = crop_size - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_normalize = do_normalize - self.image_mean = image_mean if image_mean is not None else [0.48145466, 0.4578275, 0.40821073] - self.image_std = image_std if image_std is not None else [0.26862954, 0.26130258, 0.27577711] - self.do_convert_rgb = do_convert_rgb - - def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling = PILImageResampling.BICUBIC, - data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs - ) -> np.ndarray: - """ - Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge - resized to keep the input aspect ratio. - - Args: - image (`np.ndarray`): - Image to resize. - size (`Dict[str, int]`): - Size of the output image. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): - Resampling filter to use when resiizing the image. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format of the image. If not provided, it will be the same as the input image. - """ - size = get_size_dict(size, default_to_square=False) - if "shortest_edge" not in size: - raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}") - output_size = get_resize_output_image_size(image, size=size["shortest_edge"], default_to_square=False) - return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs) - - def center_crop( - self, - image: np.ndarray, - size: Dict[str, int], - data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs - ) -> np.ndarray: - """ - Center crop an image. If the image is too small to be cropped to the size given, it will be padded (so the - returned result will always be of size `size`). - - Args: - image (`np.ndarray`): - Image to center crop. - size (`Dict[str, int]`): - Size of the output image in the form of a dictionary with keys `height` and `width`. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format of the image. If not provided, it will be the same as the input image. - """ - size = get_size_dict(size) - if "height" not in size or "width" not in size: - raise ValueError(f"The `size` parameter must contain the keys (height, width). 
Got {size.keys()}") - return center_crop(image, size=(size["height"], size["width"]), data_format=data_format, **kwargs) - - def rescale( - self, - image: np.ndarray, - scale: Union[int, float], - data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs - ): - """ - Rescale an image by a scale factor. image = image * scale. - - Args: - image (`np.ndarray`): - Image to rescale. - scale (`int` or `float`): - Scale to apply to the image. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format of the image. If not provided, it will be the same as the input image. - """ - return rescale(image, scale=scale, data_format=data_format, **kwargs) - - def normalize( - self, - image: np.ndarray, - mean: Union[float, List[float]], - std: Union[float, List[float]], - data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs - ) -> np.ndarray: - """ - Normalize an image. image = (image - image_mean) / image_std. - - Args: - image (`np.ndarray`): - Image to normalize. - image_mean (`float` or `List[float]`): - Image mean. - image_std (`float` or `List[float]`): - Image standard deviation. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format of the image. If not provided, it will be the same as the input image. - """ - return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) - - def preprocess( - self, - images: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: int = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: bool = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - **kwargs - ) -> PIL.Image.Image: - """ - Preprocess an image or batch of images. - - Args: - images (`ImageInput`): - Image to preprocess. - do_resize (`bool`, *optional*, defaults to `self.do_resize`): - Whether to resize the image. - size (`Dict[str, int]`, *optional*, defaults to `self.size`): - Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with - the longest edge resized to keep the input aspect ratio. - resample (`int`, *optional*, defaults to `self.resample`): - Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only - has an effect if `do_resize` is set to `True`. - do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): - Whether to center crop the image. - crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): - Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. - do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): - Whether to rescale the image. - rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): - Rescale factor to rescale the image by if `do_rescale` is set to `True`. - do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): - Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. 
- image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): - Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to - `True`. - do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): - Whether to convert the image to RGB. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. Can be one of: - - Unset: Return a list of `np.ndarray`. - - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. - - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. - data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): - The channel dimension format for the output image. Can be one of: - - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Unset: defaults to the channel dimension format of the input image. - """ - do_resize = do_resize if do_resize is not None else self.do_resize - size = size if size is not None else self.size - size = get_size_dict(size, param_name="size", default_to_square=False) - resample = resample if resample is not None else self.resample - do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop - crop_size = crop_size if crop_size is not None else self.crop_size - crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) - do_rescale = do_rescale if do_rescale is not None else self.do_rescale - rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb - - images = make_list_of_images(images) - - if not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "torch.Tensor, tf.Tensor or jax.ndarray." - ) - - if do_resize and size is None: - raise ValueError("Size must be specified if do_resize is True.") - - if do_center_crop and crop_size is None: - raise ValueError("Crop size must be specified if do_center_crop is True.") - - if do_rescale and rescale_factor is None: - raise ValueError("Rescale factor must be specified if do_rescale is True.") - - if do_normalize and (image_mean is None or image_std is None): - raise ValueError("Image mean and std must be specified if do_normalize is True.") - - # PIL RGBA images are converted to RGB - if do_convert_rgb: - images = [convert_to_rgb(image) for image in images] - - # All transformations expect numpy arrays. 
- images = [to_numpy_array(image) for image in images] - - if do_resize: - images = [self.resize(image=image, size=size, resample=resample) for image in images] - - if do_center_crop: - images = [self.center_crop(image=image, size=crop_size) for image in images] - - if do_rescale: - images = [self.rescale(image=image, scale=rescale_factor) for image in images] - - if do_normalize: - images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] - - images = [to_channel_dimension_format(image, data_format) for image in images] - - data = {"pixel_values": images} - return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/src/transformers/models/clap/processing_clap.py b/src/transformers/models/clap/processing_clap.py index bdee25e87bf9..79b1d95063d1 100644 --- a/src/transformers/models/clap/processing_clap.py +++ b/src/transformers/models/clap/processing_clap.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Image/Text processor class for CLAP +audio/Text processor class for CLAP """ import warnings @@ -24,44 +24,44 @@ class CLAPProcessor(ProcessorMixin): r""" - Constructs a CLAP processor which wraps a CLAP image processor and a CLAP tokenizer into a single processor. + Constructs a CLAP processor which wraps a CLAP feature extractor and a CLAP tokenizer into a single processor. - [`CLAPProcessor`] offers all the functionalities of [`CLAPImageProcessor`] and [`CLAPTokenizerFast`]. See the + [`CLAPProcessor`] offers all the functionalities of [`CLAPFeatureExtractor`] and [`CLAPTokenizerFast`]. See the [`~CLAPProcessor.__call__`] and [`~CLAPProcessor.decode`] for more information. Args: - image_processor ([`CLAPImageProcessor`]): - The image processor is a required input. + feature_extractor ([`CLAPFeatureExtractor`]): + The audio processor is a required input. tokenizer ([`CLAPTokenizerFast`]): The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "CLAPImageProcessor" + attributes = ["feature_extractor", "tokenizer"] + feature_extractor_class = "CLAPFeatureExtractor" tokenizer_class = ("CLAPTokenizer", "CLAPTokenizerFast") - def __init__(self, image_processor=None, tokenizer=None, **kwargs): + def __init__(self, feature_extractor=None, tokenizer=None, **kwargs): if "feature_extractor" in kwargs: warnings.warn( - "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`" + "The `feature_extractor` argument is deprecated and will be removed in v5, use `feature_extractor`" " instead.", FutureWarning, ) feature_extractor = kwargs.pop("feature_extractor") - image_processor = image_processor if image_processor is not None else feature_extractor - if image_processor is None: - raise ValueError("You need to specify an `image_processor`.") + feature_extractor = feature_extractor if feature_extractor is not None else feature_extractor + if feature_extractor is None: + raise ValueError("You need to specify an `feature_extractor`.") if tokenizer is None: raise ValueError("You need to specify a `tokenizer`.") - super().__init__(image_processor, tokenizer) + super().__init__(feature_extractor, tokenizer) - def __call__(self, text=None, images=None, return_tensors=None, **kwargs): + def __call__(self, text=None, audios=None, return_tensors=None, **kwargs): """ - Main method to prepare for the model one or several sequences(s) and image(s). 
This method forwards the `text`
+        and `kwargs` arguments to CLAPTokenizerFast's [`~CLAPTokenizerFast.__call__`] if `text` is not `None` to encode
-        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
-        CLAPImageProcessor's [`~CLAPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+        the text. To prepare the audio(s), this method forwards the `audios` and `kwargs` arguments to
+        CLAPFeatureExtractor's [`~CLAPFeatureExtractor.__call__`] if `audios` is not `None`. Please refer to the docstring
         of the above two methods for more information.
 
         Args:
@@ -69,10 +69,10 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
                 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                 `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
-                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
-                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
-                number of channels, H and W are image height and width.
+            audios (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The audio or batch of audios to be prepared. Each audio can be a NumPy array or a PyTorch
+                tensor. In case of a NumPy array/PyTorch tensor, each audio should be of shape (C, T), where C is the
+                number of channels and T the sample length of the audio.
 
             return_tensors (`str` or [`~utils.TensorType`], *optional*):
                 If set, will return tensors of a particular framework. Acceptable values are:
@@ -89,25 +89,25 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
             - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
               `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
               `None`).
-            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+            - **audio_features** -- Audio features to be fed to a model. Returned when `audios` is not `None`.
         """
-        if text is None and images is None:
-            raise ValueError("You have to specify either text or images. Both cannot be none.")
+        if text is None and audios is None:
+            raise ValueError("You have to specify either text or audios. 
Both cannot be none.") if text is not None: encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) - if images is not None: - image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) + if audios is not None: + audio_features = self.feature_extractor(audios, return_tensors=return_tensors, **kwargs) - if text is not None and images is not None: - encoding["pixel_values"] = image_features.pixel_values + if text is not None and audios is not None: + encoding["audio_features"] = audio_features.pixel_values return encoding elif text is not None: return encoding else: - return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + return BatchEncoding(data=dict(**audio_features), tensor_type=return_tensors) def batch_decode(self, *args, **kwargs): """ @@ -126,21 +126,21 @@ def decode(self, *args, **kwargs): @property def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names - image_processor_input_names = self.image_processor.model_input_names - return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + feature_extractor_input_names = self.feature_extractor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names)) @property def feature_extractor_class(self): warnings.warn( - "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.", + "`feature_extractor_class` is deprecated and will be removed in v5. Use `feature_extractor_class` instead.", FutureWarning, ) - return self.image_processor_class + return self.feature_extractor_class @property def feature_extractor(self): warnings.warn( - "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.", + "`feature_extractor` is deprecated and will be removed in v5. Use `feature_extractor` instead.", FutureWarning, ) - return self.image_processor + return self.feature_extractor From 23c56ac8c85e13a9b6d0d986f5daf10aed90646a Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 30 Jan 2023 15:32:48 +0000 Subject: [PATCH 003/197] text model ok --- .../models/clap/configuration_clap.py | 124 ++- src/transformers/models/clap/modeling_clap.py | 911 ++++++++++++++++-- 2 files changed, 930 insertions(+), 105 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 38b8db91d00e..183d98140504 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -38,94 +38,120 @@ class CLAPTextConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`CLAPTextModel`]. It is used to instantiate a CLAP - text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the text encoder of the CLAP - [laion-ai/base](https://huggingface.co/laion-ai/base) architecture. + This is the configuration class to store the configuration of a [`CLAPTextModel`] or a [`TFCLAPTextModel`]. It is + used to instantiate a RoBERTa model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the RoBERTa + [roberta-base](https://huggingface.co/roberta-base) architecture. 
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. + Args: - vocab_size (`int`, *optional*, defaults to 49408): - Vocabulary size of the CLAP text model. Defines the number of different tokens that can be represented by - the `inputs_ids` passed when calling [`CLAPModel`]. - hidden_size (`int`, *optional*, defaults to 512): + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the RoBERTa model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`CLAPTextModel`] or [`TFCLAPTextModel`]. + hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. - intermediate_size (`int`, *optional*, defaults to 2048): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. num_hidden_layers (`int`, *optional*, defaults to 12): Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 8): + num_attention_heads (`int`, *optional*, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. - max_position_embeddings (`int`, *optional*, defaults to 77): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*, - defaults to 1e-5): The epsilon used by the layer normalization layers. - attention_dropout (`float`, *optional*, defaults to 0.0): + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): The dropout ratio for the attention probabilities. - dropout (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`CLAPTextModel`] or [`TFCLAPTextModel`]. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - initializer_factor (`float`, *optional*, defaults to 1): - A factor for initializing all weight matrices (should be kept to 1, used internally for initialization - testing). - - Example: + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. 
Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + + Examples: ```python >>> from transformers import CLAPTextConfig, CLAPTextModel - >>> # Initializing a CLAPTextConfig with laion-ai/base style configuration + >>> # Initializing a RoBERTa configuration >>> configuration = CLAPTextConfig() - >>> # Initializing a CLAPTextModel (with random weights) from the laion-ai/base style configuration + >>> # Initializing a model (with random weights) from the configuration >>> model = CLAPTextModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config ```""" - model_type = "clap_text_model" + model_type = "roberta" def __init__( self, - vocab_size=49408, - hidden_size=512, - intermediate_size=2048, - projection_dim=512, + vocab_size=50265, + hidden_size=768, + fusion_hidden_size=768, num_hidden_layers=12, - num_attention_heads=8, - max_position_embeddings=77, - hidden_act="quick_gelu", - layer_norm_eps=0.00001, - dropout=0.0, - attention_dropout=0.0, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + fusion_num_hidden_layers=2, + projection_dim=512, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=514, + type_vocab_size=1, initializer_range=0.02, - initializer_factor=1.0, + layer_norm_eps=1e-12, pad_token_id=1, bos_token_id=0, eos_token_id=2, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, **kwargs ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.projection_dim = projection_dim - self.dropout = dropout + self.fusion_hidden_size = fusion_hidden_size + self.fusion_num_hidden_layers = fusion_num_hidden_layers self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + + @classmethod def 
from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": @@ -302,7 +328,7 @@ class CLAPConfig(PretrainedConfig): is_composition = True def __init__( - self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs + self, text_config=None, vision_config=None, logit_scale_init_value=2.6592, **kwargs ): super().__init__(**kwargs) @@ -325,7 +351,9 @@ def __init__( self.text_config = CLAPTextConfig(**text_config) self.vision_config = CLAPVisionConfig(**vision_config) - self.projection_dim = projection_dim + self.projection_dim = self.text_config.projection_dim + self.hidden_size = self.text_config.hidden_size + self.logit_scale_init_value = logit_scale_init_value self.initializer_factor = 1.0 diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 31ebab8f0f9d..e8a998e7581f 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -16,13 +16,15 @@ from dataclasses import dataclass -from typing import Any, Optional, Tuple, Union +from typing import Any, Optional, Tuple, Union, List +import numpy as np import torch import torch.utils.checkpoint from torch import nn from ...activations import ACT2FN +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling from ...modeling_utils import PreTrainedModel from ...utils import ( @@ -32,6 +34,11 @@ logging, replace_return_docstrings, ) +from ...activations import ACT2FN, gelu +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, +) from .configuration_clap import CLAPConfig, CLAPTextConfig, CLAPVisionConfig @@ -420,60 +427,61 @@ class CLAPPreTrainedModel(PreTrainedModel): config_class = CLAPConfig base_model_prefix = "clap" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"logit_scale_a", r"logit_scale_t", r"vision_model.*"] def _init_weights(self, module): - """Initialize the weights""" - factor = self.config.initializer_factor - if isinstance(module, CLAPTextEmbeddings): - module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) - module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) - elif isinstance(module, CLAPVisionEmbeddings): - factor = self.config.initializer_factor - nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) - nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) - nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor) - elif isinstance(module, CLAPAttention): - factor = self.config.initializer_factor - in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor - out_proj_std = (module.embed_dim**-0.5) * factor - nn.init.normal_(module.q_proj.weight, std=in_proj_std) - nn.init.normal_(module.k_proj.weight, std=in_proj_std) - nn.init.normal_(module.v_proj.weight, std=in_proj_std) - nn.init.normal_(module.out_proj.weight, std=out_proj_std) - elif isinstance(module, CLAPMLP): - factor = self.config.initializer_factor - in_proj_std = ( - (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor - ) - fc_std = (2 * 
module.config.hidden_size) ** -0.5 * factor - nn.init.normal_(module.fc1.weight, std=fc_std) - nn.init.normal_(module.fc2.weight, std=in_proj_std) - elif isinstance(module, CLAPModel): - nn.init.normal_( - module.text_projection.weight, - std=module.text_embed_dim**-0.5 * self.config.initializer_factor, - ) - nn.init.normal_( - module.visual_projection.weight, - std=module.vision_embed_dim**-0.5 * self.config.initializer_factor, - ) - elif isinstance(module, CLAPVisionModelWithProjection): - nn.init.normal_( - module.visual_projection.weight, - std=self.config.hidden_size**-0.5 * self.config.initializer_factor, - ) - elif isinstance(module, CLAPTextModelWithProjection): - nn.init.normal_( - module.text_projection.weight, - std=self.config.hidden_size**-0.5 * self.config.initializer_factor, - ) - - if isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() + pass + # """Initialize the weights""" + # factor = self.config.initializer_factor + # if isinstance(module, CLAPTextEmbeddings): + # module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) + # module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) + # elif isinstance(module, CLAPVisionEmbeddings): + # factor = self.config.initializer_factor + # nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) + # nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) + # nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor) + # elif isinstance(module, CLAPAttention): + # factor = self.config.initializer_factor + # in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + # out_proj_std = (module.embed_dim**-0.5) * factor + # nn.init.normal_(module.q_proj.weight, std=in_proj_std) + # nn.init.normal_(module.k_proj.weight, std=in_proj_std) + # nn.init.normal_(module.v_proj.weight, std=in_proj_std) + # nn.init.normal_(module.out_proj.weight, std=out_proj_std) + # elif isinstance(module, CLAPMLP): + # factor = self.config.initializer_factor + # in_proj_std = ( + # (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + # ) + # fc_std = (2 * module.config.hidden_size) ** -0.5 * factor + # nn.init.normal_(module.fc1.weight, std=fc_std) + # nn.init.normal_(module.fc2.weight, std=in_proj_std) + # elif isinstance(module, CLAPModel): + # nn.init.normal_( + # module.text_projection.weight, + # std=module.text_embed_dim**-0.5 * self.config.initializer_factor, + # ) + # nn.init.normal_( + # module.visual_projection.weight, + # std=module.vision_embed_dim**-0.5 * self.config.initializer_factor, + # ) + # elif isinstance(module, CLAPVisionModelWithProjection): + # nn.init.normal_( + # module.visual_projection.weight, + # std=self.config.hidden_size**-0.5 * self.config.initializer_factor, + # ) + # elif isinstance(module, CLAPTextModelWithProjection): + # nn.init.normal_( + # module.text_projection.weight, + # std=self.config.hidden_size**-0.5 * self.config.initializer_factor, + # ) + + # if isinstance(module, nn.LayerNorm): + # module.bias.data.zero_() + # module.weight.data.fill_(1.0) + # if isinstance(module, nn.Linear) and module.bias is not None: + # module.bias.data.zero_() def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, CLAPEncoder): @@ -684,6 +692,53 @@ def 
custom_forward(*inputs): ) +class CLAPFusionBlock(nn.Module): + def __init__(self, config: CLAPTextConfig): + super().__init__() + self.config = config + embed_dim = config.projection_dim + self.activation = ACT2FN[config.hidden_act] + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.linear = nn.Linear(embed_dim, embed_dim) + + def forward(self, hidden_states): + hidden_states = self.linear(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class CLAPTextProjectionLayer(nn.Module): + def __init__(self, config: CLAPTextConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + projection_dim = config.projection_dim + self.activation = ACT2FN[config.hidden_act] + + self.linear1 = nn.Linear(embed_dim, projection_dim) + self.linear2 = nn.Linear(projection_dim, projection_dim) + + def forward(self, hidden_states): + hidden_states = self.linear1(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.linear2(hidden_states) + return hidden_states + + +class CLAPFusionLayer(nn.Module): + def __init__(self, config: CLAPTextConfig): + super().__init__() + self.config = config + + self.layers = nn.ModuleList([CLAPFusionBlock(config) for _ in range(config.fusion_num_hidden_layers)]) + + def forward(self, hidden_states): + for layer in self.layers: + hidden_states = layer(hidden_states) + return hidden_states + class CLAPTextTransformer(nn.Module): def __init__(self, config: CLAPTextConfig): super().__init__() @@ -972,17 +1027,22 @@ def __init__(self, config: CLAPConfig): text_config = config.text_config vision_config = config.vision_config + self.logit_scale_a = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) + self.logit_scale_t = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) + self.projection_dim = config.projection_dim self.text_embed_dim = text_config.hidden_size self.vision_embed_dim = vision_config.hidden_size - self.text_model = CLAPTextTransformer(text_config) + # self.text_model = CLAPTextTransformer(text_config) + self.text_model = CLAPTextModel(text_config) self.vision_model = CLAPVisionTransformer(vision_config) - self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) - self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) - self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value) + self.text_transform = CLAPFusionLayer(text_config) + self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) + self.text_projection = CLAPTextProjectionLayer(text_config) + # Initialize weights and apply final processing self.post_init() @@ -1333,3 +1393,740 @@ def forward( hidden_states=vision_outputs.hidden_states, attentions=vision_outputs.attentions, ) + + +class CLAPTextEmbeddings(nn.Module): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. 
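To make the "tiny tweak" concrete, here is a small illustrative check (the token ids are arbitrary; `padding_idx=1` matches the `pad_token_id=1` default configured above, and `create_position_ids_from_input_ids` is the helper this module relies on): position numbers start at `padding_idx + 1`, and padded positions keep `padding_idx`.

```python
>>> import torch
>>> input_ids = torch.tensor([[0, 31414, 232, 2, 1, 1]])  # the last two ids are padding
>>> create_position_ids_from_input_ids(input_ids, padding_idx=1)
tensor([[2, 3, 4, 5, 1, 1]])
```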
+ """ + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + # End copy + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if position_ids is None: + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length) + else: + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + def create_position_ids_from_inputs_embeds(self, inputs_embeds): + """ + We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. 
+ + Args: + inputs_embeds: torch.Tensor + + Returns: torch.Tensor + """ + input_shape = inputs_embeds.size()[:-1] + sequence_length = input_shape[1] + + position_ids = torch.arange( + self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device + ) + return position_ids.unsqueeze(0).expand(input_shape) + + +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->CLAPText +class CLAPTextSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + use_cache = past_key_value is not None + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in CLAPTextModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. 
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput +class CLAPTextSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->CLAPText +class CLAPTextAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = CLAPTextSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = CLAPTextSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate +class CLAPTextIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = 
nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput +class CLAPTextOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->CLAPText +class CLAPTextLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = CLAPTextAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = CLAPTextAttention(config, position_embedding_type="absolute") + self.intermediate = CLAPTextIntermediate(config) + self.output = CLAPTextOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + 
attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->CLAPText +class CLAPTextEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([CLAPTextLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPooler +class CLAPTextPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class CLAPTextPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = CLAPTextConfig + base_model_prefix = "claptext" + supports_gradient_checkpointing = True + _no_split_modules = [] + + # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, CLAPTextEncoder): + module.gradient_checkpointing = value + + def update_keys_to_ignore(self, config, del_keys_to_ignore): + """Remove some keys from ignore list""" + if not config.tie_word_embeddings: + # must make a new list, or the class variable gets modified! 
+ self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] + self._keys_to_ignore_on_load_missing = [ + k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore + ] + + +class CLAPTextModel(CLAPTextPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in *Attention is + all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz + Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. + + .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762 + + """ + + _keys_to_ignore_on_load_missing = [r"position_ids"] + + # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->CLAPText + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = CLAPTextEmbeddings(config) + self.encoder = CLAPTextEncoder(config) + + self.pooler = CLAPTextPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + # Copied from transformers.models.bert.modeling_bert.BertModel.forward + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) \ No newline at end of file From 1771782fb13f1313312db4ec523428afcc41aed2 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 30 Jan 2023 17:13:39 +0000 Subject: [PATCH 004/197] clap text works --- .../models/clap/configuration_clap.py | 159 +- src/transformers/models/clap/modeling_clap.py | 3426 +++++++++-------- 2 files changed, 1889 insertions(+), 1696 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 183d98140504..c4682923d524 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -59,9 +59,9 @@ class CLAPTextConfig(PretrainedConfig): Number of attention heads for each attention layer in the Transformer encoder. intermediate_size (`int`, *optional*, defaults to 3072): Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. - hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"silu"` and `"gelu_new"` are supported. 
+ hidden_act (`str` or `Callable`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. hidden_dropout_prob (`float`, *optional*, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): @@ -114,14 +114,13 @@ def __init__( num_attention_heads=12, intermediate_size=3072, hidden_act="gelu", - fusion_num_hidden_layers=2, - projection_dim=512, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=514, type_vocab_size=1, initializer_range=0.02, layer_norm_eps=1e-12, + projection_hidden_size=768, pad_token_id=1, bos_token_id=0, eos_token_id=2, @@ -135,12 +134,10 @@ def __init__( self.vocab_size = vocab_size self.hidden_size = hidden_size self.fusion_hidden_size = fusion_hidden_size - self.fusion_num_hidden_layers = fusion_num_hidden_layers self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.hidden_act = hidden_act self.intermediate_size = intermediate_size - self.projection_dim = projection_dim self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob self.max_position_embeddings = max_position_embeddings @@ -150,6 +147,8 @@ def __init__( self.position_embedding_type = position_embedding_type self.use_cache = use_cache self.classifier_dropout = classifier_dropout + self.projection_hidden_size = projection_hidden_size + @@ -194,9 +193,9 @@ class CLAPVisionConfig(PretrainedConfig): The size (resolution) of each image. patch_size (`int`, *optional*, defaults to 32): The size (resolution) of each patch. - hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*, + hidden_act (`str` or `function`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. layer_norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon used by the layer normalization layers. dropout (`float`, *optional*, defaults to 0.0): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
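Taken together with the composite-config hunk further below, the `__init__` changes above mean `projection_dim` and `fusion_num_hidden_layers` are no longer constructor arguments of `CLAPTextConfig`: the composite `CLAPConfig` now owns them and pushes them down into both sub-configs. A rough sketch of the intended flow (class and attribute names are the ones introduced in this patch and may still change):

```python
from transformers import CLAPConfig, CLAPTextConfig

config = CLAPConfig(projection_dim=512, fusion_num_hidden_layers=2)
print(config.text_config.projection_dim)            # 512, injected by CLAPConfig.__init__
print(config.text_config.fusion_num_hidden_layers)  # 2, injected as well

# The text config itself now only carries its own projection_hidden_size default.
print(CLAPTextConfig().projection_hidden_size)      # 768
```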
@@ -224,42 +223,56 @@ class CLAPVisionConfig(PretrainedConfig): ```""" model_type = "clap_vision_model" - def __init__( self, - hidden_size=768, - intermediate_size=3072, - projection_dim=512, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - image_size=224, - patch_size=32, - hidden_act="quick_gelu", - layer_norm_eps=0.00001, - dropout=0.0, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, + sample_rate = 48000, + audio_length = 1024, + window_size = 8, + hop_size = 1024, + fmin = 50, + fmax = 14000, + class_num = 527, + mel_bins = 64, + clip_samples = 480000, + spec_size=256, + hidden_act="relu", + patch_size=4, + patch_stride=(4,4), + num_classes=527, + hidden_size=96, + projection_hidden_size=768, + depths=[2,2,6,2], + num_heads=[4,8,16,32], + enable_fusion=True, + hidden_dropout_prob=0.1, + fusion_type=None, **kwargs ): super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.projection_dim = projection_dim - self.dropout = dropout - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels + self.sample_rate = sample_rate + self.audio_length = audio_length + self.window_size = window_size + self.hop_size = hop_size + self.fmin = fmin + self.fmax = fmax + self.class_num = class_num + self.mel_bins = mel_bins + self.clip_samples = clip_samples + self.spec_size = spec_size self.patch_size = patch_size - self.image_size = image_size - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps + self.patch_stride = patch_stride + self.num_classes = num_classes + self.hidden_size = hidden_size + self.depths = depths + self.num_heads = num_heads + self.window_size = window_size + self.enable_fusion = enable_fusion + self.fusion_type = fusion_type self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.projection_hidden_size = projection_hidden_size + @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": @@ -328,7 +341,14 @@ class CLAPConfig(PretrainedConfig): is_composition = True def __init__( - self, text_config=None, vision_config=None, logit_scale_init_value=2.6592, **kwargs + self, + text_config=None, + vision_config=None, + logit_scale_init_value=2.6592, + fusion_num_hidden_layers=2, + projection_dim=512, + projection_hidden_act="relu", + **kwargs ): super().__init__(**kwargs) @@ -351,7 +371,17 @@ def __init__( self.text_config = CLAPTextConfig(**text_config) self.vision_config = CLAPVisionConfig(**vision_config) - self.projection_dim = self.text_config.projection_dim + self.text_config.fusion_num_hidden_layers = fusion_num_hidden_layers + self.vision_config.fusion_num_hidden_layers = fusion_num_hidden_layers + + self.text_config.projection_dim = projection_dim + self.vision_config.projection_dim = projection_dim + + self.text_config.projection_hidden_act = projection_hidden_act + self.vision_config.projection_hidden_act = projection_hidden_act + + self.projection_dim = projection_dim + self.projection_hidden_act = projection_hidden_act self.hidden_size = self.text_config.hidden_size self.logit_scale_init_value = logit_scale_init_value @@ -380,51 +410,4 @@ def to_dict(self): output["text_config"] = self.text_config.to_dict() output["vision_config"] = self.vision_config.to_dict() output["model_type"] = 
self.__class__.model_type - return output - - -class CLAPOnnxConfig(OnnxConfig): - @property - def inputs(self) -> Mapping[str, Mapping[int, str]]: - return OrderedDict( - [ - ("input_ids", {0: "batch", 1: "sequence"}), - ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), - ("attention_mask", {0: "batch", 1: "sequence"}), - ] - ) - - @property - def outputs(self) -> Mapping[str, Mapping[int, str]]: - return OrderedDict( - [ - ("logits_per_image", {0: "batch"}), - ("logits_per_text", {0: "batch"}), - ("text_embeds", {0: "batch"}), - ("image_embeds", {0: "batch"}), - ] - ) - - @property - def atol_for_validation(self) -> float: - return 1e-4 - - def generate_dummy_inputs( - self, - processor: "ProcessorMixin", - batch_size: int = -1, - seq_length: int = -1, - framework: Optional["TensorType"] = None, - ) -> Mapping[str, Any]: - - text_input_dict = super().generate_dummy_inputs( - processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework - ) - image_input_dict = super().generate_dummy_inputs( - processor.feature_extractor, batch_size=batch_size, framework=framework - ) - return {**text_input_dict, **image_input_dict} - - @property - def default_onnx_opset(self) -> int: - return 14 + return output \ No newline at end of file diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index e8a998e7581f..c4288794362e 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -13,14 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch CLAP model.""" - +import collections +import math from dataclasses import dataclass from typing import Any, Optional, Tuple, Union, List import numpy as np + +from itertools import repeat import torch import torch.utils.checkpoint +import torch.nn.functional as F from torch import nn from ...activations import ACT2FN @@ -52,62 +56,22 @@ ] -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
- """ - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) - - -# contrastive loss function, adapted from -# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/Clip.html -def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: - return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) - - -# Copied from transformers.models.clip.modeling_clip.clip_loss with clip->clap -def clap_loss(similarity: torch.Tensor) -> torch.Tensor: - caption_loss = contrastive_loss(similarity) - image_loss = contrastive_loss(similarity.t()) - return (caption_loss + image_loss) / 2.0 - - -@dataclass -# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->CLAP -class CLAPVisionModelOutput(ModelOutput): +# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids +def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): """ - Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. Args: - image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): - The image embeddings obtained by applying the projection layer to the pooler_output. - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. + x: torch.Tensor x: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. + Returns: torch.Tensor """ + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. 
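+    # Illustrative example: with padding_idx=1 and input_ids = [[5, 6, 1, 1]], the mask is
+    # [[1, 1, 0, 0]], the masked cumulative sum is [[1, 2, 0, 0]], and the returned position
+    # ids are [[2, 3, 1, 1]] -- real tokens count up from padding_idx + 1 while padded
+    # positions stay at padding_idx.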
+ mask = input_ids.ne(padding_idx).int() + incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask + return incremental_indices.long() + padding_idx - image_embeds: Optional[torch.FloatTensor] = None - last_hidden_state: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None @dataclass @@ -178,314 +142,506 @@ def to_tuple(self) -> Tuple[Any]: ) -# Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->CLAP -class CLAPVisionEmbeddings(nn.Module): - def __init__(self, config: CLAPVisionConfig): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.image_size = config.image_size - self.patch_size = config.patch_size - - self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) - - self.patch_embedding = nn.Conv2d( - in_channels=config.num_channels, - out_channels=self.embed_dim, - kernel_size=self.patch_size, - stride=self.patch_size, - bias=False, - ) - - self.num_patches = (self.image_size // self.patch_size) ** 2 - self.num_positions = self.num_patches + 1 - self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) - self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) - - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - batch_size = pixel_values.shape[0] - patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] - patch_embeds = patch_embeds.flatten(2).transpose(1, 2) - - class_embeds = self.class_embedding.expand(batch_size, 1, -1) - embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - embeddings = embeddings + self.position_embedding(self.position_ids) - return embeddings - -# Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->CLAP -class CLAPTextEmbeddings(nn.Module): - def __init__(self, config: CLAPTextConfig): - super().__init__() - embed_dim = config.hidden_size +# from PyTorch internals +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable): + return x + return tuple(repeat(x, n)) + return parse - self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) - self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) +to_1tuple = _ntuple(1) +to_2tuple = _ntuple(2) +to_3tuple = _ntuple(3) +to_4tuple = _ntuple(4) +to_ntuple = _ntuple - # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - ) -> torch.Tensor: - seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] +def drop_path(x, drop_prob: float = 0., training: bool = False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. 
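+    For example, with drop_prob=0.25 each sample's residual-branch output is zeroed with
+    probability 0.25 and scaled by 1 / 0.75 otherwise, so the expected value of the output
+    matches the input.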
+ """ + if drop_prob == 0. or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) + random_tensor.floor_() # binarize + output = x.div(keep_prob) * random_tensor + return output + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob - if position_ids is None: - position_ids = self.position_ids[:, :seq_length] + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) - if inputs_embeds is None: - inputs_embeds = self.token_embedding(input_ids) +class PatchEmbed(nn.Module): + """ 2D Image to Patch Embedding + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, hidden_size=768, norm_layer=None, flatten=True, patch_stride = 16, + enable_fusion=False, fusion_type='None'): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + patch_stride = to_2tuple(patch_stride) + self.img_size = img_size + self.patch_size = patch_size + self.patch_stride = patch_stride + self.grid_size = (img_size[0] // patch_stride[0], img_size[1] // patch_stride[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.flatten = flatten + self.in_chans = in_chans + self.hidden_size = hidden_size + + self.enable_fusion = enable_fusion + self.fusion_type = fusion_type + + padding = ((patch_size[0] - patch_stride[0]) // 2, (patch_size[1] - patch_stride[1]) // 2) - position_embeddings = self.position_embedding(position_ids) - embeddings = inputs_embeds + position_embeddings + if (self.enable_fusion) and (self.fusion_type == 'channel_map'): + self.proj = nn.Conv2d(in_chans*4, hidden_size, kernel_size=patch_size, stride=patch_stride, padding=padding) + else: + self.proj = nn.Conv2d(in_chans, hidden_size, kernel_size=patch_size, stride=patch_stride, padding=padding) + self.norm = norm_layer(hidden_size) if norm_layer else nn.Identity() + + if (self.enable_fusion) and (self.fusion_type in ['daf_2d','aff_2d','iaff_2d']): + self.mel_conv2d = nn.Conv2d(in_chans, hidden_size, kernel_size=(patch_size[0], patch_size[1]*3), stride=(patch_stride[0], patch_stride[1] * 3), padding=padding) + if self.fusion_type == 'daf_2d': + self.fusion_model = DAF() + elif self.fusion_type == 'aff_2d': + self.fusion_model = AFF(channels=hidden_size, type='2D') + elif self.fusion_type == 'iaff_2d': + self.fusion_model = iAFF(channels=hidden_size, type='2D') + def forward(self, x, longer_idx = None): + if (self.enable_fusion) and (self.fusion_type in ['daf_2d','aff_2d','iaff_2d']): + global_x = x[:,0:1,:,:] + + + # global processing + B, C, H, W = global_x.shape + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
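+            # The first channel is the "global" mel spectrogram and goes through the regular
+            # patch projection below; for the samples selected by `longer_idx`, the remaining
+            # channels are embedded with `mel_conv2d`, padded or cropped to the same time
+            # width, and merged back into the global features by `self.fusion_model`.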
+ global_x = self.proj(global_x) + TW = global_x.size(-1) + if len(longer_idx) > 0: + # local processing + local_x = x[longer_idx,1:,:,:].contiguous() + B, C, H, W = local_x.shape + local_x = local_x.view(B*C,1,H,W) + local_x = self.mel_conv2d(local_x) + local_x = local_x.view(B,C,local_x.size(1),local_x.size(2),local_x.size(3)) + local_x = local_x.permute((0,2,3,1,4)).contiguous().flatten(3) + TB,TC,TH,_ = local_x.size() + if local_x.size(-1) < TW: + local_x = torch.cat([local_x, torch.zeros((TB,TC,TH,TW-local_x.size(-1)), device=global_x.device)], dim=-1) + else: + local_x = local_x[:,:,:,:TW] + + global_x[longer_idx] = self.fusion_model(global_x[longer_idx],local_x) + x = global_x + else: + B, C, H, W = x.shape + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.proj(x) + + if self.flatten: + x = x.flatten(2).transpose(1, 2) # BCHW -> BNC + x = self.norm(x) + return x - return embeddings +class Mlp(nn.Module): + """ MLP as used in Vision Transformer, MLP-Mixer and related networks + """ + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.", + stacklevel=2) + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): + # type: (Tensor, float, float, float, float) -> Tensor + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. 
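+    Internally, values are drawn by sampling uniformly between the normal CDF values of the
+    two cut-off points and mapping the samples back through the inverse CDF (`erfinv_`),
+    then scaling by `std`, shifting by `mean` and clamping to `[a, b]`.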
+ Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.trunc_normal_(w) + """ + return _no_grad_trunc_normal_(tensor, mean, std, a, b) -# Copied from transformers.models.clip.modeling_clip.CLIPAttention with CLIP->CLAP -class CLAPAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" +def variance_scaling_(tensor, scale=1.0, mode='fan_in', distribution='normal'): + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) + if mode == 'fan_in': + denom = fan_in + elif mode == 'fan_out': + denom = fan_out + elif mode == 'fan_avg': + denom = (fan_in + fan_out) / 2 - def __init__(self, config): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.embed_dim // self.num_heads - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads})." - ) - self.scale = self.head_dim**-0.5 - self.dropout = config.attention_dropout + variance = scale / denom - self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + if distribution == "truncated_normal": + # constant is stddev of standard normal truncated to (-2, 2) + trunc_normal_(tensor, std=math.sqrt(variance) / .87962566103423978) + elif distribution == "normal": + tensor.normal_(std=math.sqrt(variance)) + elif distribution == "uniform": + bound = math.sqrt(3 * variance) + tensor.uniform_(-bound, bound) + else: + raise ValueError(f"invalid distribution {distribution}") - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - causal_attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" +def lecun_normal_(tensor): + variance_scaling_(tensor, mode='fan_in', distribution='truncated_normal') - bsz, tgt_len, embed_dim = hidden_states.size() +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + return windows - # get query proj - query_states = self.q_proj(hidden_states) * self.scale - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - proj_shape = (bsz * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) - key_states = key_states.view(*proj_shape) - value_states = value_states.view(*proj_shape) +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, 
window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x - src_len = key_states.size(1) - attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) - if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): - raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" - f" {attn_weights.size()}" - ) +class WindowAttention(nn.Module): + r""" Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + """ - # apply the causal_attention_mask first - if causal_attention_mask is not None: - if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" - f" {causal_attention_mask.size()}" - ) - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.): - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, tgt_len, src_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - if output_attentions: - # this operation is a bit akward, but it's required to - # make sure that attn_weights keeps its gradient. 
- # In order to do so, attn_weights have to reshaped - # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) else: - attn_weights_reshaped = None + attn = self.softmax(attn) - attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + attn = self.attn_drop(attn) - attn_output = torch.bmm(attn_probs, value_states) + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x, attn - if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) + def extra_repr(self): + return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}' - attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) - attn_output = 
attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) - attn_output = self.out_proj(attn_output) +class SwinTransformerBlock(nn.Module): + r""" Swin Transformer Block. + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resulotion. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ - return attn_output, attn_weights_reshaped + def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., + act_layer=nn.GELU, norm_layer=nn.LayerNorm, norm_before_mlp='ln'): + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + self.norm_before_mlp = norm_before_mlp + if min(self.input_resolution) <= self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = 0 + self.window_size = min(self.input_resolution) + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, + qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + if self.norm_before_mlp == 'ln': + self.norm2 = nn.LayerNorm(dim) + elif self.norm_before_mlp == 'bn': + self.norm2 = lambda x: nn.BatchNorm1d(dim)(x.transpose(1, 2)).transpose(1, 2) + else: + raise NotImplementedError + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + if self.shift_size > 0: + # calculate attention mask for SW-MSA + H, W = self.input_resolution + img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1 + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) + else: + attn_mask = None + + self.register_buffer("attn_mask", attn_mask) + + def forward(self, x): + # pdb.set_trace() + H, W = self.input_resolution + # print("H: ", H) + # print("W: ", W) + # pdb.set_trace() + B, L, C = x.shape + # assert L == H * W, "input feature has wrong size" + + shortcut = x + x = self.norm1(x) + x = x.view(B, H, W, C) + + # cyclic shift + if self.shift_size > 0: + shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + else: + shifted_x = x + # partition windows + x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C + x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C -# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->CLAP -class CLAPMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.activation_fn = ACT2FN[config.hidden_act] - self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) - self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + # W-MSA/SW-MSA + attn_windows, attn = self.attn(x_windows, mask=self.attn_mask) # nW*B, window_size*window_size, C - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.fc1(hidden_states) - hidden_states = self.activation_fn(hidden_states) - hidden_states = self.fc2(hidden_states) - return hidden_states + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) + shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) + else: + x = shifted_x + x = x.view(B, H * W, C) -# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->CLAP -class CLAPEncoderLayer(nn.Module): - def __init__(self, config: CLAPConfig): - super().__init__() - self.embed_dim = config.hidden_size - self.self_attn = CLAPAttention(config) - self.layer_norm1 = nn.LayerNorm(self.embed_dim) - self.mlp = CLAPMLP(config) - self.layer_norm2 = nn.LayerNorm(self.embed_dim) + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) - def forward( - self, - hidden_states: torch.Tensor, - 
attention_mask: torch.Tensor, - causal_attention_mask: torch.Tensor, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - `(config.encoder_attention_heads,)`. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - """ - residual = hidden_states + return x, attn - hidden_states = self.layer_norm1(hidden_states) - hidden_states, attn_weights = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - causal_attention_mask=causal_attention_mask, - output_attentions=output_attentions, - ) - hidden_states = residual + hidden_states + def extra_repr(self): + return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \ + f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}" - residual = hidden_states - hidden_states = self.layer_norm2(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - outputs = (hidden_states,) +# contrastive loss function, adapted from +# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/Clip.html +def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: + return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) - if output_attentions: - outputs += (attn_weights,) - return outputs +# Copied from transformers.models.clip.modeling_clip.clip_loss with clip->clap +def clap_loss(similarity: torch.Tensor) -> torch.Tensor: + caption_loss = contrastive_loss(similarity) + image_loss = contrastive_loss(similarity.t()) + return (caption_loss + image_loss) / 2.0 -# Copied from transformers.models.clip.modeling_clip.CLIPPreTrainedModel with CLIP->CLAP,clip->clap -class CLAPPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. +@dataclass +# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->CLAP +class CLAPVisionModelOutput(ModelOutput): """ + Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. - config_class = CLAPConfig - base_model_prefix = "clap" - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids", r"logit_scale_a", r"logit_scale_t", r"vision_model.*"] + Args: + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. - def _init_weights(self, module): - pass - # """Initialize the weights""" - # factor = self.config.initializer_factor - # if isinstance(module, CLAPTextEmbeddings): - # module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) - # module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) - # elif isinstance(module, CLAPVisionEmbeddings): - # factor = self.config.initializer_factor - # nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) - # nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) - # nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor) - # elif isinstance(module, CLAPAttention): - # factor = self.config.initializer_factor - # in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor - # out_proj_std = (module.embed_dim**-0.5) * factor - # nn.init.normal_(module.q_proj.weight, std=in_proj_std) - # nn.init.normal_(module.k_proj.weight, std=in_proj_std) - # nn.init.normal_(module.v_proj.weight, std=in_proj_std) - # nn.init.normal_(module.out_proj.weight, std=out_proj_std) - # elif isinstance(module, CLAPMLP): - # factor = self.config.initializer_factor - # in_proj_std = ( - # (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor - # ) - # fc_std = (2 * module.config.hidden_size) ** -0.5 * factor - # nn.init.normal_(module.fc1.weight, std=fc_std) - # nn.init.normal_(module.fc2.weight, std=in_proj_std) - # elif isinstance(module, CLAPModel): - # nn.init.normal_( - # module.text_projection.weight, - # std=module.text_embed_dim**-0.5 * self.config.initializer_factor, - # ) - # nn.init.normal_( - # module.visual_projection.weight, - # std=module.vision_embed_dim**-0.5 * self.config.initializer_factor, - # ) - # elif isinstance(module, CLAPVisionModelWithProjection): - # nn.init.normal_( - # module.visual_projection.weight, - # std=self.config.hidden_size**-0.5 * self.config.initializer_factor, - # ) - # elif isinstance(module, CLAPTextModelWithProjection): - # nn.init.normal_( - # module.text_projection.weight, - # std=self.config.hidden_size**-0.5 * self.config.initializer_factor, - # ) + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. - # if isinstance(module, nn.LayerNorm): - # module.bias.data.zero_() - # module.weight.data.fill_(1.0) - # if isinstance(module, nn.Linear) and module.bias is not None: - # module.bias.data.zero_() + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, CLAPEncoder): - module.gradient_checkpointing = value + image_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None CLAP_START_DOCSTRING = r""" @@ -588,136 +744,32 @@ def _set_gradient_checkpointing(self, module, value=False): """ -# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->CLAP -class CLAPEncoder(nn.Module): - """ - Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a - [`CLAPEncoderLayer`]. - - Args: - config: CLAPConfig - """ - - def __init__(self, config: CLAPConfig): +class CLAPFusionBlock(nn.Module): + def __init__(self, config: CLAPTextConfig): super().__init__() self.config = config - self.layers = nn.ModuleList([CLAPEncoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False + hidden_size = config.projection_dim + self.activation = ACT2FN[config.hidden_act] + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.linear = nn.Linear(hidden_size, hidden_size) - def forward( - self, - inputs_embeds, - attention_mask: Optional[torch.Tensor] = None, - causal_attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: - r""" - Args: - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert `input_ids` indices into associated vectors - than the model's internal embedding lookup matrix. - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Causal mask for the text model. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
- """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + def forward(self, hidden_states): + hidden_states = self.linear(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - hidden_states = inputs_embeds - for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs, output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(encoder_layer), - hidden_states, - attention_mask, - causal_attention_mask, - ) - else: - layer_outputs = encoder_layer( - hidden_states, - attention_mask, - causal_attention_mask, - output_attentions=output_attentions, - ) - - hidden_states = layer_outputs[0] - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions - ) - - -class CLAPFusionBlock(nn.Module): - def __init__(self, config: CLAPTextConfig): - super().__init__() - self.config = config - embed_dim = config.projection_dim - self.activation = ACT2FN[config.hidden_act] - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - self.linear = nn.Linear(embed_dim, embed_dim) - - def forward(self, hidden_states): - hidden_states = self.linear(hidden_states) - hidden_states = self.activation(hidden_states) - hidden_states = self.dropout(hidden_states) - return hidden_states - - -class CLAPTextProjectionLayer(nn.Module): +class CLAPProjectionLayer(nn.Module): def __init__(self, config: CLAPTextConfig): super().__init__() self.config = config - embed_dim = config.hidden_size + hidden_size = config.projection_hidden_size projection_dim = config.projection_dim - self.activation = ACT2FN[config.hidden_act] - self.linear1 = nn.Linear(embed_dim, projection_dim) + self.linear1 = nn.Linear(hidden_size, projection_dim) + self.activation = ACT2FN[config.projection_hidden_act] self.linear2 = nn.Linear(projection_dim, projection_dim) def forward(self, hidden_states): @@ -739,627 +791,928 @@ def forward(self, hidden_states): hidden_states = layer(hidden_states) return hidden_states -class CLAPTextTransformer(nn.Module): - def __init__(self, config: CLAPTextConfig): + +class CLAPTextEmbeddings(nn.Module): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. 
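+
+    The tweak is that position ids are built with `create_position_ids_from_input_ids` (copied
+    from the RoBERTa modeling code above), so they start at `padding_idx + 1` and padded
+    positions keep `padding_idx` instead of using a plain 0..seq_len range.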
+ """ + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ + def __init__(self, config): super().__init__() - self.config = config - embed_dim = config.hidden_size - self.embeddings = CLAPTextEmbeddings(config) - self.encoder = CLAPEncoder(config) - self.final_layer_norm = nn.LayerNorm(embed_dim) + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) - @add_start_docstrings_to_model_forward(CLAP_TEXT_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLAPTextConfig) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + # End copy + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if input_ids is None: - raise ValueError("You have to specify input_ids") + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if position_ids is None: + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. 
+ position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length) + else: + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) + seq_length = input_shape[1] - hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) - bsz, seq_len = input_shape - # CLAP's text model uses causal mask, prepare it here. - # https://github.com/openai/CLAP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clap/model.py#L324 - causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to( - hidden_states.device - ) - # expand attention_mask - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - attention_mask = _expand_mask(attention_mask, hidden_states.dtype) + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) - encoder_outputs = self.encoder( - inputs_embeds=hidden_states, - attention_mask=attention_mask, - causal_attention_mask=causal_attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings - last_hidden_state = encoder_outputs[0] - last_hidden_state = self.final_layer_norm(last_hidden_state) + def create_position_ids_from_inputs_embeds(self, inputs_embeds): + """ + We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. 
- # text_embeds.shape = [batch_size, sequence_length, transformer.width] - # take features from the eot embedding (eot_token is the highest number in each sequence) - # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14 - pooled_output = last_hidden_state[ - torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device), - input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1), - ] + Args: + inputs_embeds: torch.Tensor - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] + Returns: torch.Tensor + """ + input_shape = inputs_embeds.size()[:-1] + sequence_length = input_shape[1] - return BaseModelOutputWithPooling( - last_hidden_state=last_hidden_state, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, + position_ids = torch.arange( + self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device ) + return position_ids.unsqueeze(0).expand(input_shape) - def _build_causal_attention_mask(self, bsz, seq_len, dtype): - # lazily create causal attention mask, with full attention between the vision tokens - # pytorch uses additive attention mask; fill with -inf - mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype) - mask.fill_(torch.tensor(torch.finfo(dtype).min)) - mask.triu_(1) # zero out the lower diagonal - mask = mask.unsqueeze(1) # expand mask - return mask +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->CLAPText +class CLAPTextSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) -@add_start_docstrings( - """The text model from CLAP without any head or projection on top.""", - CLAP_START_DOCSTRING, -) -class CLAPTextModel(CLAPPreTrainedModel): - config_class = CLAPTextConfig + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size - _no_split_modules = ["CLAPEncoderLayer"] + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) - def __init__(self, config: CLAPTextConfig): - super().__init__(config) - self.text_model = CLAPTextTransformer(config) - # Initialize weights and apply final processing - self.post_init() + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) - def get_input_embeddings(self) -> nn.Module: - return self.text_model.embeddings.token_embedding + self.is_decoder = config.is_decoder - def set_input_embeddings(self, value): - self.text_model.embeddings.token_embedding = value + def transpose_for_scores(self, x: 
torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) - @add_start_docstrings_to_model_forward(CLAP_TEXT_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLAPTextConfig) def forward( self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: - - Examples: - - ```python - >>> from transformers import AutoTokenizer, CLAPTextModel - - >>> model = CLAPTextModel.from_pretrained("laion-ai/base") - >>> tokenizer = AutoTokenizer.from_pretrained("laion-ai/base") - - >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) - >>> outputs = model(**inputs) - >>> last_hidden_state = outputs.last_hidden_state - >>> pooled_output = outputs.pooler_output # pooled (EOS token) states - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None - return self.text_model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) -class CLAPVisionTransformer(nn.Module): - def __init__(self, config: CLAPVisionConfig): - super().__init__() - self.config = config - embed_dim = config.hidden_size + use_cache = past_key_value is not None + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. 
+ # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) - self.embeddings = CLAPVisionEmbeddings(config) - self.pre_layrnorm = nn.LayerNorm(embed_dim) - self.encoder = CLAPEncoder(config) - self.post_layernorm = nn.LayerNorm(embed_dim) + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - @add_start_docstrings_to_model_forward(CLAP_VISION_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLAPVisionConfig) - def forward( - self, - pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: - - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if pixel_values is None: - raise ValueError("You have to specify pixel_values") - - hidden_states = self.embeddings(pixel_values) - hidden_states = self.pre_layrnorm(hidden_states) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r - encoder_outputs = self.encoder( - inputs_embeds=hidden_states, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - last_hidden_state = encoder_outputs[0] - pooled_output = last_hidden_state[:, 0, :] - pooled_output = self.post_layernorm(pooled_output) + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - if not return_dict: - return (last_hidden_state, pooled_output) + 
encoder_outputs[1:] + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in CLAPTextModel forward() function) + attention_scores = attention_scores + attention_mask - return BaseModelOutputWithPooling( - last_hidden_state=last_hidden_state, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) -@add_start_docstrings( - """The vision model from CLAP without any head or projection on top.""", - CLAP_START_DOCSTRING, -) -class CLAPVisionModel(CLAPPreTrainedModel): - config_class = CLAPVisionConfig - main_input_name = "pixel_values" + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask - def __init__(self, config: CLAPVisionConfig): - super().__init__(config) - self.vision_model = CLAPVisionTransformer(config) - # Initialize weights and apply final processing - self.post_init() + context_layer = torch.matmul(attention_probs, value_layer) - def get_input_embeddings(self) -> nn.Module: - return self.vision_model.embeddings.patch_embedding + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) - @add_start_docstrings_to_model_forward(CLAP_VISION_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLAPVisionConfig) - def forward( - self, - pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) - Examples: + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, CLAPVisionModel - >>> model = CLAPVisionModel.from_pretrained("laion-ai/base") - >>> processor = AutoProcessor.from_pretrained("laion-ai/base") +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput +class CLAPTextSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states - >>> inputs = processor(images=image, return_tensors="pt") - >>> outputs = model(**inputs) - >>> last_hidden_state = outputs.last_hidden_state - >>> pooled_output = outputs.pooler_output # pooled CLS states - ```""" 
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->CLAPText +class CLAPTextAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = CLAPTextSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = CLAPTextSelfOutput(config) + self.pruned_heads = set() - return self.vision_model( - pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads ) + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) -@add_start_docstrings(CLAP_START_DOCSTRING) -class CLAPModel(CLAPPreTrainedModel): - config_class = CLAPConfig + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) - def __init__(self, config: CLAPConfig): - super().__init__(config) + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs - if not isinstance(config.text_config, CLAPTextConfig): - raise ValueError( - "config.text_config is expected to be of type CLAPTextConfig but is of type" - f" {type(config.text_config)}." - ) - if not isinstance(config.vision_config, CLAPVisionConfig): - raise ValueError( - "config.vision_config is expected to be of type CLAPVisionConfig but is of type" - f" {type(config.vision_config)}." 
- ) +# Copied from transformers.models.bert.modeling_bert.BertIntermediate +class CLAPTextIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act - text_config = config.text_config - vision_config = config.vision_config + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states - self.logit_scale_a = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) - self.logit_scale_t = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) - self.projection_dim = config.projection_dim - self.text_embed_dim = text_config.hidden_size - self.vision_embed_dim = vision_config.hidden_size +# Copied from transformers.models.bert.modeling_bert.BertOutput +class CLAPTextOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) - # self.text_model = CLAPTextTransformer(text_config) - self.text_model = CLAPTextModel(text_config) - self.vision_model = CLAPVisionTransformer(vision_config) + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states - self.text_transform = CLAPFusionLayer(text_config) - self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) - self.text_projection = CLAPTextProjectionLayer(text_config) - - # Initialize weights and apply final processing - self.post_init() +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->CLAPText +class CLAPTextLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = CLAPTextAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = CLAPTextAttention(config, position_embedding_type="absolute") + self.intermediate = CLAPTextIntermediate(config) + self.output = CLAPTextOutput(config) - @add_start_docstrings_to_model_forward(CLAP_TEXT_INPUTS_DOCSTRING) - def get_text_features( + def forward( self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> torch.FloatTensor: - r""" - Returns: - text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by - applying the projection layer to the pooled output of [`CLAPTextModel`]. 
+ hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] - Examples: + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights - ```python - >>> from transformers import AutoTokenizer, CLAPModel + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) - >>> model = CLAPModel.from_pretrained("laion-ai/base") - >>> tokenizer = AutoTokenizer.from_pretrained("laion-ai/base") + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights - >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") - >>> text_features = model.get_text_features(**inputs) - ```""" - # Use CLAP model's config for some fields (if specified) instead of those of vision & text components. 
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value - text_outputs = self.text_model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output ) + outputs = (layer_output,) + outputs - pooled_output = text_outputs[1] - text_features = self.text_projection(pooled_output) - - return text_features - - @add_start_docstrings_to_model_forward(CLAP_VISION_INPUTS_DOCSTRING) - def get_image_features( - self, - pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> torch.FloatTensor: - r""" - Returns: - image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by - applying the projection layer to the pooled output of [`CLAPVisionModel`]. - - Examples: + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, CLAPModel + return outputs - >>> model = CLAPModel.from_pretrained("laion-ai/base") - >>> processor = AutoProcessor.from_pretrained("laion-ai/base") + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - >>> inputs = processor(images=image, return_tensors="pt") +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->CLAPText +class CLAPTextEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([CLAPTextLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False - >>> image_features = model.get_image_features(**inputs) - ```""" - # Use CLAP model's config for some fields (if specified) instead of those of vision & text components. 
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None - vision_outputs = self.vision_model( - pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) - pooled_output = vision_outputs[1] # pooled_output - image_features = self.visual_projection(pooled_output) + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None - return image_features + if self.gradient_checkpointing and self.training: - @add_start_docstrings_to_model_forward(CLAP_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CLAPOutput, config_class=CLAPConfig) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - pixel_values: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - return_loss: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CLAPOutput]: - r""" - Returns: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False - Examples: + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, CLAPModel + return custom_forward - >>> model = CLAPModel.from_pretrained("laion-ai/base") - >>> processor = AutoProcessor.from_pretrained("laion-ai/base") + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) - >>> inputs = processor( - ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True - ... ) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) - >>> outputs = model(**inputs) - >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score - >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities - ```""" - # Use CLAP model's config for some fields (if specified) instead of those of vision & text components. - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - vision_outputs = self.vision_model( - pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - text_outputs = self.text_model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) +# Copied from transformers.models.bert.modeling_bert.BertPooler +class CLAPTextPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() - image_embeds = vision_outputs[1] - image_embeds = self.visual_projection(image_embeds) + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
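+        # For this text encoder the first token is the sequence-start token (e.g. `<s>`/[CLS]),
+        # mirroring the BertPooler implementation this module is copied from.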
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output - text_embeds = text_outputs[1] - text_embeds = self.text_projection(text_embeds) - # normalized features - image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) - text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) - # cosine similarity as logits - logit_scale = self.logit_scale.exp() - logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale - logits_per_image = logits_per_text.t() - loss = None - if return_loss: - loss = clap_loss(logits_per_text) +class CLAPTextPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + config_class = CLAPTextConfig + base_model_prefix = "claptext" + supports_gradient_checkpointing = True + _no_split_modules = [] - if not return_dict: - output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) - return ((loss,) + output) if loss is not None else output + # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) - return CLAPOutput( - loss=loss, - logits_per_image=logits_per_image, - logits_per_text=logits_per_text, - text_embeds=text_embeds, - image_embeds=image_embeds, - text_model_output=text_outputs, - vision_model_output=vision_outputs, - ) + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, CLAPTextEncoder): + module.gradient_checkpointing = value + + def update_keys_to_ignore(self, config, del_keys_to_ignore): + """Remove some keys from ignore list""" + if not config.tie_word_embeddings: + # must make a new list, or the class variable gets modified! + self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] + self._keys_to_ignore_on_load_missing = [ + k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore + ] -@add_start_docstrings( +class CLAPTextModel(CLAPTextPreTrainedModel): """ - CLAP Text Model with a projection layer on top (a linear layer on top of the pooled output). - """, - CLAP_START_DOCSTRING, -) -class CLAPTextModelWithProjection(CLAPPreTrainedModel): - config_class = CLAPTextConfig - _no_split_modules = ["CLAPEncoderLayer"] + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in *Attention is + all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz + Kaiser and Illia Polosukhin. 
+
+    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
+    to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument
+    and `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
+
+    .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
 
-    def __init__(self, config: CLAPTextConfig):
+    """
+
+    _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+    # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->CLAPText
+    def __init__(self, config, add_pooling_layer=True):
         super().__init__(config)
+        self.config = config
 
-        self.text_model = CLAPTextTransformer(config)
+        self.embeddings = CLAPTextEmbeddings(config)
+        self.encoder = CLAPTextEncoder(config)
 
-        self.text_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)
+        self.pooler = CLAPTextPooler(config) if add_pooling_layer else None
 
         # Initialize weights and apply final processing
         self.post_init()
 
-    def get_input_embeddings(self) -> nn.Module:
-        return self.text_model.embeddings.token_embedding
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
 
     def set_input_embeddings(self, value):
-        self.text_model.embeddings.token_embedding = value
+        self.embeddings.word_embeddings = value
 
-    @add_start_docstrings_to_model_forward(CLAP_TEXT_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=CLAPTextModelOutput, config_class=CLAPTextConfig)
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    # Copied from transformers.models.bert.modeling_bert.BertModel.forward
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, CLAPTextModelOutput]:
+    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
         r"""
-        Returns:
+        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
 
-        Examples:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - ```python - >>> from transformers import AutoTokenizer, CLAPTextModelWithProjection + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict - >>> model = CLAPTextModelWithProjection.from_pretrained("laion-ai/base") - >>> tokenizer = AutoTokenizer.from_pretrained("laion-ai/base") + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False - >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") - >>> outputs = model(**inputs) - >>> text_embeds = outputs.text_embeds - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device - text_outputs = self.text_model( + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
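+        # get_extended_attention_mask makes the [batch_size, seq_length] padding mask broadcastable over all
+        # attention heads and turns it into an additive mask (0.0 for positions to keep, a large negative value
+        # for masked positions); when the model is configured as a decoder it also folds in a causal mask.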
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( input_ids=input_ids, - attention_mask=attention_mask, position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) - - pooled_output = text_outputs[1] - - text_embeds = self.text_projection(pooled_output) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None if not return_dict: - outputs = (text_embeds, text_outputs[0]) + text_outputs[2:] - return tuple(output for output in outputs if output is not None) + return (sequence_output, pooled_output) + encoder_outputs[1:] - return CLAPTextModelOutput( - text_embeds=text_embeds, - last_hidden_state=text_outputs.last_hidden_state, - hidden_states=text_outputs.hidden_states, - attentions=text_outputs.attentions, + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, ) -@add_start_docstrings( - """ - CLAP Vision Model with a projection layer on top (a linear layer on top of the pooled output). - """, - CLAP_START_DOCSTRING, -) -class CLAPVisionModelWithProjection(CLAPPreTrainedModel): - config_class = CLAPVisionConfig - main_input_name = "pixel_values" - - def __init__(self, config: CLAPVisionConfig): - super().__init__(config) - - self.vision_model = CLAPVisionTransformer(config) - - self.visual_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False) - # Initialize weights and apply final processing - self.post_init() +# Copied from transformers.models.clip.modeling_clip.CLIPPreTrainedModel with CLIP->CLAP,clip->clap +class CLAPPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ - def get_input_embeddings(self) -> nn.Module: - return self.vision_model.embeddings.patch_embedding + config_class = CLAPConfig + base_model_prefix = "clap" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r"position_ids", r"logit_scale_a", r"logit_scale_t", r"vision_model.*"] - @add_start_docstrings_to_model_forward(CLAP_VISION_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CLAPVisionModelOutput, config_class=CLAPVisionConfig) - def forward( - self, - pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CLAPVisionModelOutput]: + def _init_weights(self, module): + pass + # """Initialize the weights""" + # factor = self.config.initializer_factor + # if isinstance(module, CLAPTextEmbeddings): + # module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) + # module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) + # elif isinstance(module, CLAPVisionEmbeddings): + # factor = self.config.initializer_factor + # nn.init.normal_(module.class_embedding, mean=0.0, std=module.hidden_size**-0.5 * factor) + # nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) + # nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor) + # elif isinstance(module, CLAPAttention): + # factor = self.config.initializer_factor + # in_proj_std = (module.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + # out_proj_std = (module.hidden_size**-0.5) * factor + # nn.init.normal_(module.q_proj.weight, std=in_proj_std) + # nn.init.normal_(module.k_proj.weight, std=in_proj_std) + # nn.init.normal_(module.v_proj.weight, std=in_proj_std) + # nn.init.normal_(module.out_proj.weight, std=out_proj_std) + # elif isinstance(module, CLAPMLP): + # factor = self.config.initializer_factor + # in_proj_std = ( + # (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + # ) + # fc_std = (2 * module.config.hidden_size) ** -0.5 * factor + # nn.init.normal_(module.fc1.weight, std=fc_std) + # nn.init.normal_(module.fc2.weight, std=in_proj_std) + # elif isinstance(module, CLAPModel): + # nn.init.normal_( + # module.text_projection.weight, + # std=module.text_hidden_size**-0.5 * self.config.initializer_factor, + # ) + # nn.init.normal_( + # module.visual_projection.weight, + # std=module.vision_hidden_size**-0.5 * self.config.initializer_factor, + # ) + # elif isinstance(module, CLAPVisionModelWithProjection): + # nn.init.normal_( + # module.visual_projection.weight, + # std=self.config.hidden_size**-0.5 * self.config.initializer_factor, + # ) + # elif isinstance(module, CLAPTextModelWithProjection): + # nn.init.normal_( + # module.text_projection.weight, + # std=self.config.hidden_size**-0.5 * self.config.initializer_factor, + # ) + + # if isinstance(module, nn.LayerNorm): + # module.bias.data.zero_() + # module.weight.data.fill_(1.0) + # if isinstance(module, nn.Linear) and module.bias is not None: + # module.bias.data.zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, CLAPEncoder): + module.gradient_checkpointing = value + + + +@add_start_docstrings(CLAP_START_DOCSTRING) +class CLAPModel(CLAPPreTrainedModel): + config_class = CLAPConfig + + def __init__(self, config: CLAPConfig): + super().__init__(config) + + if not 
isinstance(config.text_config, CLAPTextConfig):
+            raise ValueError(
+                "config.text_config is expected to be of type CLAPTextConfig but is of type"
+                f" {type(config.text_config)}."
+            )
+
+        if not isinstance(config.vision_config, CLAPVisionConfig):
+            raise ValueError(
+                "config.vision_config is expected to be of type CLAPVisionConfig but is of type"
+                f" {type(config.vision_config)}."
+            )
+
+        text_config = config.text_config
+        vision_config = config.vision_config
+
+        self.logit_scale_a = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+        self.logit_scale_t = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+
+        self.projection_dim = config.projection_dim
+        self.text_hidden_size = text_config.hidden_size
+        self.vision_hidden_size = vision_config.hidden_size
+
+        self.text_model = CLAPTextModel(text_config)
+        self.text_transform = CLAPFusionLayer(text_config)
+        self.text_projection = CLAPProjectionLayer(text_config)
+
+        self.audio_model = HTSAT_Swin_Transformer(config=vision_config)
+        self.audio_transform = CLAPFusionLayer(vision_config)
+        self.audio_projection = CLAPProjectionLayer(vision_config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(CLAP_TEXT_INPUTS_DOCSTRING)
+    def get_text_features(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> torch.FloatTensor:
+        r"""
+        Returns:
+            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
+            applying the projection layer to the pooled output of [`CLAPTextModel`].
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoTokenizer, CLAPModel
+
+        >>> model = CLAPModel.from_pretrained("laion-ai/base")
+        >>> tokenizer = AutoTokenizer.from_pretrained("laion-ai/base")
+
+        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+        >>> text_features = model.get_text_features(**inputs)
+        ```"""
+        # Use CLAP model's config for some fields (if specified) instead of those of vision & text components.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        text_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = text_outputs[1]
+        text_features = self.text_projection(pooled_output)
+        text_features = F.normalize(text_features, dim=-1)
+
+        return text_features
+
+    @add_start_docstrings_to_model_forward(CLAP_VISION_INPUTS_DOCSTRING)
+    def get_image_features(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> torch.FloatTensor:
         r"""
         Returns:
+            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
+            applying the projection layer to the pooled output of [`CLAPVisionModel`].
Examples: ```python >>> from PIL import Image >>> import requests - >>> from transformers import AutoProcessor, CLAPVisionModelWithProjection + >>> from transformers import AutoProcessor, CLAPModel - >>> model = CLAPVisionModelWithProjection.from_pretrained("laion-ai/base") + >>> model = CLAPModel.from_pretrained("laion-ai/base") >>> processor = AutoProcessor.from_pretrained("laion-ai/base") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" @@ -1367,9 +1720,13 @@ def forward( >>> inputs = processor(images=image, return_tensors="pt") - >>> outputs = model(**inputs) - >>> image_embeds = outputs.image_embeds + >>> image_features = model.get_image_features(**inputs) ```""" + # Use CLAP model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict vision_outputs = self.vision_model( @@ -1380,753 +1737,606 @@ def forward( ) pooled_output = vision_outputs[1] # pooled_output + image_features = self.visual_projection(pooled_output) - image_embeds = self.visual_projection(pooled_output) - - if not return_dict: - outputs = (image_embeds, vision_outputs[0]) + vision_outputs[2:] - return tuple(output for output in outputs if output is not None) - - return CLAPVisionModelOutput( - image_embeds=image_embeds, - last_hidden_state=vision_outputs.last_hidden_state, - hidden_states=vision_outputs.hidden_states, - attentions=vision_outputs.attentions, - ) - - -class CLAPTextEmbeddings(nn.Module): - """ - Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. - """ - - # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ - def __init__(self, config): - super().__init__() - self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) - self.register_buffer( - "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False - ) - - # End copy - self.padding_idx = config.pad_token_id - self.position_embeddings = nn.Embedding( - config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx - ) + return image_features + @add_start_docstrings_to_model_forward(CLAP_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CLAPOutput, config_class=CLAPConfig) def forward( - self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 - ): - if position_ids is None: - if input_ids is not None: - # Create the position ids from the input token ids. 
Any padded tokens remain padded. - position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length) - else: - position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CLAPOutput]: + r""" + Returns: - if input_ids is not None: - input_shape = input_ids.size() - else: - input_shape = inputs_embeds.size()[:-1] + Examples: - seq_length = input_shape[1] + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, CLAPModel - # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs - # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves - # issue #5664 - if token_type_ids is None: - if hasattr(self, "token_type_ids"): - buffered_token_type_ids = self.token_type_ids[:, :seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + >>> model = CLAPModel.from_pretrained("laion-ai/base") + >>> processor = AutoProcessor.from_pretrained("laion-ai/base") - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True + ... ) - def create_position_ids_from_inputs_embeds(self, inputs_embeds): - """ - We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. 
- - Args: - inputs_embeds: torch.Tensor - - Returns: torch.Tensor - """ - input_shape = inputs_embeds.size()[:-1] - sequence_length = input_shape[1] - - position_ids = torch.arange( - self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device - ) - return position_ids.unsqueeze(0).expand(input_shape) - - -# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->CLAPText -class CLAPTextSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None): - super().__init__() - if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): - raise ValueError( - f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " - f"heads ({config.num_attention_heads})" - ) - - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = nn.Linear(config.hidden_size, self.all_head_size) - self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, self.all_head_size) - - self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) - - self.is_decoder = config.is_decoder - - def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = x.view(new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor]: - mixed_query_layer = self.query(hidden_states) - - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. 
- is_cross_attention = encoder_hidden_states is not None - - if is_cross_attention and past_key_value is not None: - # reuse k,v, cross_attentions - key_layer = past_key_value[0] - value_layer = past_key_value[1] - attention_mask = encoder_attention_mask - elif is_cross_attention: - key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) - attention_mask = encoder_attention_mask - elif past_key_value is not None: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - key_layer = torch.cat([past_key_value[0], key_layer], dim=2) - value_layer = torch.cat([past_key_value[1], value_layer], dim=2) - else: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - - query_layer = self.transpose_for_scores(mixed_query_layer) - - use_cache = past_key_value is not None - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_layer, value_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - query_length, key_length = query_layer.shape[2], key_layer.shape[2] - if use_cache: - position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( - -1, 1 - ) - else: - position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - - positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - - attention_scores = attention_scores / math.sqrt(self.attention_head_size) - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in CLAPTextModel forward() function) - attention_scores = attention_scores + attention_mask - - # Normalize the attention scores to probabilities. 
- attention_probs = nn.functional.softmax(attention_scores, dim=-1) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.dropout(attention_probs) - - # Mask heads if we want to - if head_mask is not None: - attention_probs = attention_probs * head_mask - - context_layer = torch.matmul(attention_probs, value_layer) - - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.view(new_context_layer_shape) - - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) - - if self.is_decoder: - outputs = outputs + (past_key_value,) - return outputs - - -# Copied from transformers.models.bert.modeling_bert.BertSelfOutput -class CLAPTextSelfOutput(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->CLAPText -class CLAPTextAttention(nn.Module): - def __init__(self, config, position_embedding_type=None): - super().__init__() - self.self = CLAPTextSelfAttention(config, position_embedding_type=position_embedding_type) - self.output = CLAPTextSelfOutput(config) - self.pruned_heads = set() - - def prune_heads(self, heads): - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices( - heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads - ) - - # Prune linear layers - self.self.query = prune_linear_layer(self.self.query, index) - self.self.key = prune_linear_layer(self.self.key, index) - self.self.value = prune_linear_layer(self.self.value, index) - self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) - - # Update hyper params and store pruned heads - self.self.num_attention_heads = self.self.num_attention_heads - len(heads) - self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads - self.pruned_heads = self.pruned_heads.union(heads) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor]: - self_outputs = self.self( - hidden_states, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them - return outputs - - -# Copied from transformers.models.bert.modeling_bert.BertIntermediate -class CLAPTextIntermediate(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = 
nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - return hidden_states - - -# Copied from transformers.models.bert.modeling_bert.BertOutput -class CLAPTextOutput(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->CLAPText -class CLAPTextLayer(nn.Module): - def __init__(self, config): - super().__init__() - self.chunk_size_feed_forward = config.chunk_size_feed_forward - self.seq_len_dim = 1 - self.attention = CLAPTextAttention(config) - self.is_decoder = config.is_decoder - self.add_cross_attention = config.add_cross_attention - if self.add_cross_attention: - if not self.is_decoder: - raise ValueError(f"{self} should be used as a decoder model if cross attention is added") - self.crossattention = CLAPTextAttention(config, position_embedding_type="absolute") - self.intermediate = CLAPTextIntermediate(config) - self.output = CLAPTextOutput(config) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor]: - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None - self_attention_outputs = self.attention( - hidden_states, - attention_mask, - head_mask, - output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, - ) - attention_output = self_attention_outputs[0] - - # if decoder, the last output is tuple of self-attn cache - if self.is_decoder: - outputs = self_attention_outputs[1:-1] - present_key_value = self_attention_outputs[-1] - else: - outputs = self_attention_outputs[1:] # add self attentions if we output attention weights - - cross_attn_present_key_value = None - if self.is_decoder and encoder_hidden_states is not None: - if not hasattr(self, "crossattention"): - raise ValueError( - f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" - " by setting `config.add_cross_attention=True`" - ) - - # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple - cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None - cross_attention_outputs = self.crossattention( - attention_output, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - cross_attn_past_key_value, - output_attentions, - ) - 
attention_output = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights - - # add cross-attn cache to positions 3,4 of present_key_value tuple - cross_attn_present_key_value = cross_attention_outputs[-1] - present_key_value = present_key_value + cross_attn_present_key_value - - layer_output = apply_chunking_to_forward( - self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output - ) - outputs = (layer_output,) + outputs - - # if decoder, return the attn key/values as the last output - if self.is_decoder: - outputs = outputs + (present_key_value,) - - return outputs - - def feed_forward_chunk(self, attention_output): - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - return layer_output - - -# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->CLAPText -class CLAPTextEncoder(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.layer = nn.ModuleList([CLAPTextLayer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = False, - output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None - - next_decoder_cache = () if use_cache else None - for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[i] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - if use_cache: - logger.warning( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs, past_key_value, output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(layer_module), - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - ) - else: - layer_outputs = layer_module( - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - - hidden_states = layer_outputs[0] - if use_cache: - next_decoder_cache += (layer_outputs[-1],) - if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1],) - if self.config.add_cross_attention: - all_cross_attentions = all_cross_attentions + (layer_outputs[2],) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=next_decoder_cache, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=all_cross_attentions, + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + # Use CLAP model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) -# Copied from transformers.models.bert.modeling_bert.BertPooler -class CLAPTextPooler(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. - first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output - - -class CLAPTextPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. 
- """ - - config_class = CLAPTextConfig - base_model_prefix = "claptext" - supports_gradient_checkpointing = True - _no_split_modules = [] - - # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights - def _init_weights(self, module): - """Initialize the weights""" - if isinstance(module, nn.Linear): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, CLAPTextEncoder): - module.gradient_checkpointing = value - - def update_keys_to_ignore(self, config, del_keys_to_ignore): - """Remove some keys from ignore list""" - if not config.tie_word_embeddings: - # must make a new list, or the class variable gets modified! - self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] - self._keys_to_ignore_on_load_missing = [ - k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore - ] - - -class CLAPTextModel(CLAPTextPreTrainedModel): - """ + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) - The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of - cross-attention is added between the self-attention layers, following the architecture described in *Attention is - all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz - Kaiser and Illia Polosukhin. + image_embeds = vision_outputs[1] + image_embeds = self.visual_projection(image_embeds) - To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set - to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and - `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. + text_embeds = text_outputs[1] + text_embeds = self.text_projection(text_embeds) - .. 
_*Attention is all you need*: https://arxiv.org/abs/1706.03762 + # normalized features + image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) + text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) - """ + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale + logits_per_image = logits_per_text.t() - _keys_to_ignore_on_load_missing = [r"position_ids"] + loss = None + if return_loss: + loss = clap_loss(logits_per_text) - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->CLAPText - def __init__(self, config, add_pooling_layer=True): - super().__init__(config) - self.config = config + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + return ((loss,) + output) if loss is not None else output - self.embeddings = CLAPTextEmbeddings(config) - self.encoder = CLAPTextEncoder(config) + return CLAPOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) - self.pooler = CLAPTextPooler(config) if add_pooling_layer else None +@add_start_docstrings( + """ + CLAP Text Model with a projection layer on top (a linear layer on top of the pooled output). + """, + CLAP_START_DOCSTRING, +) +class CLAPTextModelWithProjection(CLAPPreTrainedModel): + config_class = CLAPTextConfig + def __init__(self, config: CLAPTextConfig): + super().__init__(config) + self.text_model = CLAPTextModel(config) + self.text_projection = CLAPProjectionLayer(config) # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embeddings.word_embeddings + def get_input_embeddings(self) -> nn.Module: + return self.text_model.embeddings.token_embedding def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) + self.text_model.embeddings.token_embedding = value - # Copied from transformers.models.bert.modeling_bert.BertModel.forward + @add_start_docstrings_to_model_forward(CLAP_TEXT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CLAPTextModelOutput, config_class=CLAPTextConfig) def forward( self, input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> Union[Tuple, CLAPTextModelOutput]: r""" - encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if - the model is configured as a decoder. - encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in - the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + Returns: - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + Examples: - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). 
- """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) + ```python + >>> from transformers import AutoTokenizer, CLAPTextModelWithProjection + + >>> model = CLAPTextModelWithProjection.from_pretrained("laion-ai/base") + >>> tokenizer = AutoTokenizer.from_pretrained("laion-ai/base") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> text_embeds = outputs.text_embeds + ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if self.config.is_decoder: - use_cache = use_cache if use_cache is not None else self.config.use_cache - else: - use_cache = False + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") + pooled_output = text_outputs[1] - batch_size, seq_length = input_shape - device = input_ids.device if input_ids is not None else inputs_embeds.device + text_embeds = self.text_projection(pooled_output) - # past_key_values_length - past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + if not return_dict: + outputs = (text_embeds, text_outputs[0]) + text_outputs[2:] + return tuple(output for output in outputs if output is not None) - if attention_mask is None: - attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + return CLAPTextModelOutput( + text_embeds=text_embeds, + last_hidden_state=text_outputs.last_hidden_state, + hidden_states=text_outputs.hidden_states, + attentions=text_outputs.attentions, + ) - if token_type_ids is None: - if hasattr(self.embeddings, "token_type_ids"): - buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) +class BasicLayer(nn.Module): + """ A basic Swin Transformer layer for one stage. + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resolution. + depth (int): Number of blocks. + num_heads (int): Number of attention heads. + window_size (int): Local window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. 
Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + """ - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() - encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) - encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + def __init__(self, dim, input_resolution, depth, num_heads, window_size, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False, + norm_before_mlp='ln'): + + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList([ + SwinTransformerBlock(dim=dim, input_resolution=input_resolution, + num_heads=num_heads, window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop, attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, norm_before_mlp=norm_before_mlp) + for i in range(depth)]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer) else: - encoder_extended_attention_mask = None + self.downsample = None - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + def forward(self, x): + attns = [] + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x) + else: + x, attn = blk(x) + if not self.training: + attns.append(attn.unsqueeze(0)) + if self.downsample is not None: + x = self.downsample(x) + if not self.training: + attn = torch.cat(attns, dim = 0) + attn = torch.mean(attn, dim = 0) + return x, attn + +class PatchMerging(nn.Module): + r""" Patch Merging Layer. + Args: + input_resolution (tuple[int]): Resolution of input feature. + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. 
Default: nn.LayerNorm + """ - embedding_output = self.embeddings( - input_ids=input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - inputs_embeds=inputs_embeds, - past_key_values_length=past_key_values_length, - ) - encoder_outputs = self.encoder( - embedding_output, - attention_mask=extended_attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = encoder_outputs[0] - pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] + def forward(self, x): + """ + x: B, H*W, C + """ + H, W = self.input_resolution + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even." - return BaseModelOutputWithPoolingAndCrossAttentions( - last_hidden_state=sequence_output, - pooler_output=pooled_output, - past_key_values=encoder_outputs.past_key_values, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, - ) \ No newline at end of file + x = x.view(B, H, W, C) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + def extra_repr(self): + return f"input_resolution={self.input_resolution}, dim={self.dim}" + + + + +# The Core of HTSAT +class HTSAT_Swin_Transformer(nn.Module): + r"""HTSAT based on the Swin Transformer + Args: + spec_size (int | tuple(int)): Input Spectrogram size. Default 256 + patch_size (int | tuple(int)): Patch size. Default: 4 + path_stride (iot | tuple(int)): Patch Stride for Frequency and Time Axis. Default: 4 + in_chans (int): Number of input image channels. Default: 1 (mono) + num_classes (int): Number of classes for classification head. Default: 527 + hidden_size (int): Patch embedding dimension. Default: 96 + depths (tuple(int)): Depth of each HTSAT-Swin Transformer layer. + num_heads (tuple(int)): Number of attention heads in different layers. + window_size (int): Window size. Default: 8 + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None + drop_rate (float): Dropout rate. Default: 0 + attn_drop_rate (float): Attention dropout rate. Default: 0 + drop_path_rate (float): Stochastic depth rate. Default: 0.1 + norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False + patch_norm (bool): If True, add normalization after patch embedding. Default: True + use_checkpoint (bool): Whether to use checkpointing to save memory. 
Default: False + config (module): The configuration Module from config.py + """ + + def __init__(self, spec_size=256, patch_size=4, patch_stride=(4,4), + in_chans=1, num_classes=527, + hidden_size=96, depths=[2, 2, 6, 2], num_heads=[4, 8, 16, 32], + window_size=8, mlp_ratio=4., qkv_bias=True, qk_scale=None, + drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, + norm_layer=nn.LayerNorm, + ape=False, patch_norm=True, + use_checkpoint=False, norm_before_mlp='ln', config = None, + enable_fusion = False, fusion_type = 'None', **kwargs): + super(HTSAT_Swin_Transformer, self).__init__() + + self.config = config + self.spec_size = spec_size + self.patch_stride = patch_stride + self.patch_size = patch_size + self.window_size = window_size + self.hidden_size = hidden_size + self.depths = depths + self.ape = ape + self.in_chans = in_chans + self.num_classes = num_classes + self.num_heads = num_heads + self.num_layers = len(self.depths) + self.num_features = int(self.hidden_size * 2 ** (self.num_layers - 1)) + + self.drop_rate = drop_rate + self.attn_drop_rate = attn_drop_rate + self.drop_path_rate = drop_path_rate + + self.qkv_bias = qkv_bias + self.qk_scale = None + + self.patch_norm = patch_norm + self.norm_layer = norm_layer if self.patch_norm else None + self.norm_before_mlp = norm_before_mlp + self.mlp_ratio = mlp_ratio + + self.use_checkpoint = use_checkpoint + + self.enable_fusion = enable_fusion + self.fusion_type = fusion_type + + # process mel-spec ; used only once + self.freq_ratio = self.spec_size // self.config.mel_bins + window = 'hann' + center = True + pad_mode = 'reflect' + ref = 1.0 + amin = 1e-10 + top_db = None + self.interpolate_ratio = 32 # Downsampled ratio + # Spectrogram extractor + self.bn0 = nn.BatchNorm2d(self.config.mel_bins) + + + # split spctrogram into non-overlapping patches + self.patch_embed = PatchEmbed( + img_size=self.spec_size, patch_size=self.patch_size, in_chans=self.in_chans, + hidden_size=self.hidden_size, norm_layer=self.norm_layer, patch_stride = patch_stride, + enable_fusion=self.enable_fusion, fusion_type=self.fusion_type + ) + + num_patches = self.patch_embed.num_patches + patches_resolution = self.patch_embed.grid_size + self.patches_resolution = patches_resolution + + # absolute position embedding + if self.ape: + self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, self.hidden_size)) + trunc_normal_(self.absolute_pos_embed, std=.02) + + self.pos_drop = nn.Dropout(p=self.drop_rate) + + # stochastic depth + dpr = [x.item() for x in torch.linspace(0, self.drop_path_rate, sum(self.depths))] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = BasicLayer(dim=int(self.hidden_size * 2 ** i_layer), + input_resolution=(patches_resolution[0] // (2 ** i_layer), + patches_resolution[1] // (2 ** i_layer)), + depth=self.depths[i_layer], + num_heads=self.num_heads[i_layer], + window_size=self.window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=self.qkv_bias, qk_scale=self.qk_scale, + drop=self.drop_rate, attn_drop=self.attn_drop_rate, + drop_path=dpr[sum(self.depths[:i_layer]):sum(self.depths[:i_layer + 1])], + norm_layer=self.norm_layer, + downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint, + norm_before_mlp=self.norm_before_mlp) + self.layers.append(layer) + + self.norm = self.norm_layer(self.num_features) + self.avgpool = nn.AdaptiveAvgPool1d(1) + self.maxpool = nn.AdaptiveMaxPool1d(1) + + SF = self.spec_size // (2 ** 
(len(self.depths) - 1)) // self.patch_stride[0] // self.freq_ratio + self.tscam_conv = nn.Conv2d( + in_channels = self.num_features, + out_channels = self.num_classes, + kernel_size = (SF,3), + padding = (0,1) + ) + self.head = nn.Linear(num_classes, num_classes) + + if (self.enable_fusion) and (self.fusion_type in ['daf_1d','aff_1d','iaff_1d']): + self.mel_conv1d = nn.Sequential( + nn.Conv1d(64, 64, kernel_size=5, stride=3, padding=2), + nn.BatchNorm1d(64) + ) + if self.fusion_type == 'daf_1d': + self.fusion_model = DAF() + elif self.fusion_type == 'aff_1d': + self.fusion_model = AFF(channels=64, type='1D') + elif self.fusion_type == 'iaff_1d': + self.fusion_model = iAFF(channels=64, type='1D') + + self.apply(self._init_weights) + + def _init_weights(self, m): + pass + # if isinstance(m, nn.Linear): + # trunc_normal_(m.weight, std=.02) + # if isinstance(m, nn.Linear) and m.bias is not None: + # nn.init.constant_(m.bias, 0) + # elif isinstance(m, nn.LayerNorm): + # nn.init.constant_(m.bias, 0) + # nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'absolute_pos_embed'} + + @torch.jit.ignore + def no_weight_decay_keywords(self): + return {'relative_position_bias_table'} + + + def forward_features(self, x, longer_idx = None): + # A deprecated optimization for using a hierarchical output from different blocks + + frames_num = x.shape[2] + x = self.patch_embed(x, longer_idx = longer_idx) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + for i, layer in enumerate(self.layers): + x, attn = layer(x) + # for x + x = self.norm(x) + B, N, C = x.shape + SF = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[0] + ST = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[1] + x = x.permute(0,2,1).contiguous().reshape(B, C, SF, ST) + B, C, F, T = x.shape + # group 2D CNN + c_freq_bin = F // self.freq_ratio + x = x.reshape(B, C, F // c_freq_bin, c_freq_bin, T) + x = x.permute(0,1,3,2,4).contiguous().reshape(B, C, c_freq_bin, -1) + # get latent_output + fine_grained_latent_output = torch.mean(x, dim = 2) + fine_grained_latent_output = interpolate(fine_grained_latent_output.permute(0,2,1).contiguous(), 8 * self.patch_stride[1]) + + latent_output = self.avgpool(torch.flatten(x,2)) + latent_output = torch.flatten(latent_output, 1) + + # display the attention map, if needed + + x = self.tscam_conv(x) + x = torch.flatten(x, 2) # B, C, T + + fpx = interpolate(torch.sigmoid(x).permute(0,2,1).contiguous(), 8 * self.patch_stride[1]) + + x = self.avgpool(x) + x = torch.flatten(x, 1) + + output_dict = { + 'framewise_output': fpx, # already sigmoided + 'clipwise_output': torch.sigmoid(x), + 'fine_grained_embedding': fine_grained_latent_output, + 'embedding': latent_output + } + + return output_dict + + def crop_wav(self, x, crop_size, spe_pos = None): + time_steps = x.shape[2] + tx = torch.zeros(x.shape[0], x.shape[1], crop_size, x.shape[3]).to(x.device) + for i in range(len(x)): + if spe_pos is None: + crop_pos = random.randint(0, time_steps - crop_size - 1) + else: + crop_pos = spe_pos + tx[i][0] = x[i, 0, crop_pos:crop_pos + crop_size,:] + return tx + + # Reshape the wavform to a img size, if you want to use the pretrained swin transformer model + def reshape_wav2img(self, x): + B, C, T, F = x.shape + target_T = int(self.spec_size * self.freq_ratio) + target_F = self.spec_size // self.freq_ratio + assert T <= target_T and F <= target_F, "the wav size should less than or equal to the swin input size" + # to avoid bicubic 
zero error + if T < target_T: + x = nn.functional.interpolate(x, (target_T, x.shape[3]), mode="bicubic", align_corners=True) + if F < target_F: + x = nn.functional.interpolate(x, (x.shape[2], target_F), mode="bicubic", align_corners=True) + x = x.permute(0,1,3,2).contiguous() + x = x.reshape(x.shape[0], x.shape[1], x.shape[2], self.freq_ratio, x.shape[3] // self.freq_ratio) + # print(x.shape) + x = x.permute(0,1,3,2,4).contiguous() + x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3], x.shape[4]) + return x + + # Repeat the wavform to a img size, if you want to use the pretrained swin transformer model + def repeat_wat2img(self, x, cur_pos): + B, C, T, F = x.shape + target_T = int(self.spec_size * self.freq_ratio) + target_F = self.spec_size // self.freq_ratio + assert T <= target_T and F <= target_F, "the wav size should less than or equal to the swin input size" + # to avoid bicubic zero error + if T < target_T: + x = nn.functional.interpolate(x, (target_T, x.shape[3]), mode="bicubic", align_corners=True) + if F < target_F: + x = nn.functional.interpolate(x, (x.shape[2], target_F), mode="bicubic", align_corners=True) + x = x.permute(0,1,3,2).contiguous() # B C F T + x = x[:,:,:,cur_pos:cur_pos + self.spec_size] + x = x.repeat(repeats = (1,1,4,1)) + return x + + def forward(self, x: torch.Tensor, mixup_lambda = None, infer_mode = False, device=None):# out_feat_keys: List[str] = None): + + if self.enable_fusion and x["longer"].sum() == 0: + # if no audio is longer than 10s, then randomly select one audio to be longer + x["longer"][torch.randint(0, x["longer"].shape[0], (1,))] = True + + if not self.enable_fusion: + x = x["waveform"].to(device=device, non_blocking=True) + x = self.spectrogram_extractor(x) # (batch_size, 1, time_steps, freq_bins) + x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins) + x = x.transpose(1, 3) + x = self.bn0(x) + x = x.transpose(1, 3) + if self.training: + x = self.spec_augmenter(x) + + if self.training and mixup_lambda is not None: + x = do_mixup(x, mixup_lambda) + + x = self.reshape_wav2img(x) + output_dict = self.forward_features(x) + else: + longer_list = x["longer"].to(device=device, non_blocking=True) + x = x["mel_fusion"].to(device=device, non_blocking=True) + x = x.transpose(1, 3) + x = self.bn0(x) + x = x.transpose(1, 3) + longer_list_idx = torch.where(longer_list)[0] + if self.fusion_type in ['daf_1d','aff_1d','iaff_1d']: + new_x = x[:,0:1,:,:].clone().contiguous() + if len(longer_list_idx) > 0: + # local processing + fusion_x_local = x[longer_list_idx,1:,:,:].clone().contiguous() + FB,FC,FT,FF = fusion_x_local.size() + fusion_x_local = fusion_x_local.view(FB * FC, FT, FF) + fusion_x_local = torch.permute(fusion_x_local, (0,2,1)).contiguous() + fusion_x_local = self.mel_conv1d(fusion_x_local) + fusion_x_local = fusion_x_local.view(FB,FC,FF,fusion_x_local.size(-1)) + fusion_x_local = torch.permute(fusion_x_local, (0,2,1,3)).contiguous().flatten(2) + if fusion_x_local.size(-1) < FT: + fusion_x_local = torch.cat([fusion_x_local, torch.zeros((FB,FF,FT- fusion_x_local.size(-1)), device=device)], dim=-1) + else: + fusion_x_local = fusion_x_local[:,:,:FT] + # 1D fusion + new_x = new_x.squeeze(1).permute((0,2,1)).contiguous() + new_x[longer_list_idx] = self.fusion_model(new_x[longer_list_idx], fusion_x_local) + x = new_x.permute((0,2,1)).contiguous()[:,None,:,:] + else: + x = new_x + + elif self.fusion_type in ['daf_2d','aff_2d','iaff_2d','channel_map']: + x = x # no change + + if self.training: + x = self.spec_augmenter(x) + if 
self.training and mixup_lambda is not None: + x = do_mixup(x, mixup_lambda) + + x = self.reshape_wav2img(x) + output_dict = self.forward_features(x, longer_idx = longer_list_idx) + + + return output_dict \ No newline at end of file From 9b276aaf58462e5d75f057351b0d99422b6a5fe5 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 31 Jan 2023 10:59:21 +0000 Subject: [PATCH 005/197] some refactor - `CLAPVision` to `CLAPAudio` - refactor kwargs of audio modules --- docs/source/en/model_doc/clap.mdx | 4 +- src/transformers/__init__.py | 4 +- src/transformers/models/clap/__init__.py | 4 +- .../models/clap/configuration_clap.py | 108 +-- .../models/clap/feature_extraction_clap.py | 1 + src/transformers/models/clap/modeling_clap.py | 733 +++++++++--------- .../models/clap/processing_clap.py | 13 +- tests/models/clap/test_modeling_clap.py | 10 +- 8 files changed, 448 insertions(+), 429 deletions(-) diff --git a/docs/source/en/model_doc/clap.mdx b/docs/source/en/model_doc/clap.mdx index e321c98b8674..5e49090a0ebd 100644 --- a/docs/source/en/model_doc/clap.mdx +++ b/docs/source/en/model_doc/clap.mdx @@ -38,9 +38,9 @@ The original code can be found [here](). [[autodoc]] CLAPTextConfig -## CLAPVisionConfig +## CLAPAudioConfig -[[autodoc]] CLAPVisionConfig +[[autodoc]] CLAPAudioConfig ## CLAPTokenizer diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 68180ccadf08..d6e9889db44c 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -204,11 +204,11 @@ ], "models.clap": [ "CLAP_PRETRAINED_CONFIG_ARCHIVE_MAP", + "CLAPAudioConfig", "CLAPConfig", "CLAPProcessor", "CLAPTextConfig", "CLAPTokenizer", - "CLAPVisionConfig", ], "models.clip": [ "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -3668,11 +3668,11 @@ ) from .models.clap import ( CLAP_PRETRAINED_CONFIG_ARCHIVE_MAP, + CLAPAudioConfig, CLAPConfig, CLAPProcessor, CLAPTextConfig, CLAPTokenizer, - CLAPVisionConfig, ) from .models.clip import ( CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, diff --git a/src/transformers/models/clap/__init__.py b/src/transformers/models/clap/__init__.py index a3a0c6882ccf..e6dd2d384758 100644 --- a/src/transformers/models/clap/__init__.py +++ b/src/transformers/models/clap/__init__.py @@ -29,10 +29,10 @@ _import_structure = { "configuration_clap": [ "CLAP_PRETRAINED_CONFIG_ARCHIVE_MAP", + "CLAPAudioConfig", "CLAPConfig", "CLAPOnnxConfig", "CLAPTextConfig", - "CLAPVisionConfig", ], "processing_clap": ["CLAPProcessor"], "tokenization_clap": ["CLAPTokenizer"], @@ -74,10 +74,10 @@ if TYPE_CHECKING: from .configuration_clap import ( CLAP_PRETRAINED_CONFIG_ARCHIVE_MAP, + CLAPAudioConfig, CLAPConfig, CLAPOnnxConfig, CLAPTextConfig, - CLAPVisionConfig, ) from .processing_clap import CLAPProcessor from .tokenization_clap import CLAPTokenizer diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index c4682923d524..f63358a2a2f1 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -16,16 +16,9 @@ import copy import os -from collections import OrderedDict -from typing import TYPE_CHECKING, Any, Mapping, Optional, Union - - -if TYPE_CHECKING: - from ...processing_utils import ProcessorMixin - from ...utils import TensorType +from typing import Union from ...configuration_utils import PretrainedConfig -from ...onnx import OnnxConfig from ...utils import logging @@ -149,9 +142,6 @@ def __init__( self.classifier_dropout = classifier_dropout self.projection_hidden_size = 
projection_hidden_size - - - @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": @@ -170,7 +160,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], return cls.from_dict(config_dict, **kwargs) -class CLAPVisionConfig(PretrainedConfig): +class CLAPAudioConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`CLAPVisionModel`]. It is used to instantiate a CLAP vision encoder according to the specified arguments, defining the model architecture. Instantiating a @@ -195,8 +185,8 @@ class CLAPVisionConfig(PretrainedConfig): The size (resolution) of each patch. hidden_act (`str` or `function`, *optional*, defaults to `"relu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"relu"`, - `"relu"`, `"selu"` and `"relu_new"` ``"relu"` are supported. layer_norm_eps (`float`, *optional*, - defaults to 1e-5): The epsilon used by the layer normalization layers. + `"relu"`, `"selu"` and `"relu_new"` ``"relu"` are supported. layer_norm_eps (`float`, *optional*, defaults + to 1e-5): The epsilon used by the layer normalization layers. dropout (`float`, *optional*, defaults to 0.0): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -210,10 +200,10 @@ class CLAPVisionConfig(PretrainedConfig): Example: ```python - >>> from transformers import CLAPVisionConfig, CLAPVisionModel + >>> from transformers import CLAPAudioConfig, CLAPVisionModel - >>> # Initializing a CLAPVisionConfig with laion-ai/base style configuration - >>> configuration = CLAPVisionConfig() + >>> # Initializing a CLAPAudioConfig with laion-ai/base style configuration + >>> configuration = CLAPAudioConfig() >>> # Initializing a CLAPVisionModel (with random weights) from the laion-ai/base style configuration >>> model = CLAPVisionModel(configuration) @@ -223,29 +213,44 @@ class CLAPVisionConfig(PretrainedConfig): ```""" model_type = "clap_vision_model" + def __init__( self, - sample_rate = 48000, - audio_length = 1024, - window_size = 8, - hop_size = 1024, - fmin = 50, - fmax = 14000, - class_num = 527, - mel_bins = 64, - clip_samples = 480000, + sample_rate=48000, + audio_length=1024, + window_size=8, + hop_size=1024, + fmin=50, + fmax=14000, + mel_bins=64, + clip_samples=480000, spec_size=256, hidden_act="relu", patch_size=4, - patch_stride=(4,4), + patch_stride=(4, 4), num_classes=527, hidden_size=96, projection_hidden_size=768, - depths=[2,2,6,2], - num_heads=[4,8,16,32], - enable_fusion=True, + depths=[2, 2, 6, 2], + num_heads=[4, 8, 16, 32], + enable_fusion=False, hidden_dropout_prob=0.1, fusion_type=None, + image_size=224, + input_channels=3, + patch_embed_input_channels=1, + flatten_patch_embeds=True, + patch_embeds_hidden_size=96, + enable_patch_layer_norm=True, + swin_drop_rate=0.0, + swin_attention_drop_rate=0.0, + swin_drop_path_rate=0.1, + swin_qkv_bias=True, + swin_norm_before_mlp="ln", + swin_mlp_ratio=4.0, + swin_use_checkpoint=False, + swin_absolute_positional_embedding=False, + swin_hidden_act="gelu", **kwargs ): super().__init__(**kwargs) @@ -255,7 +260,6 @@ def __init__( self.hop_size = hop_size self.fmin = fmin self.fmax = fmax - self.class_num = class_num self.mel_bins = mel_bins self.clip_samples = clip_samples self.spec_size = spec_size @@ -271,8 +275,22 @@ def __init__( self.hidden_act = hidden_act self.hidden_dropout_prob = 
hidden_dropout_prob self.projection_hidden_size = projection_hidden_size + self.image_size = image_size + self.input_channels = input_channels + self.flatten_patch_embeds = flatten_patch_embeds + self.patch_embeds_hidden_size = patch_embeds_hidden_size + self.enable_patch_layer_norm = enable_patch_layer_norm + self.swin_drop_rate = swin_drop_rate + self.swin_attention_drop_rate = swin_attention_drop_rate + self.swin_drop_path_rate = swin_drop_path_rate + self.swin_qkv_bias = swin_qkv_bias + self.swin_norm_before_mlp = swin_norm_before_mlp + self.swin_mlp_ratio = swin_mlp_ratio + self.swin_use_checkpoint = swin_use_checkpoint + self.swin_absolute_positional_embedding = swin_absolute_positional_embedding + self.patch_embed_input_channels = patch_embed_input_channels + self.swin_hidden_act = swin_hidden_act - @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": @@ -305,7 +323,7 @@ class CLAPConfig(PretrainedConfig): text_config (`dict`, *optional*): Dictionary of configuration options used to initialize [`CLAPTextConfig`]. vision_config (`dict`, *optional*): - Dictionary of configuration options used to initialize [`CLAPVisionConfig`]. + Dictionary of configuration options used to initialize [`CLAPAudioConfig`]. projection_dim (`int`, *optional*, defaults to 512): Dimentionality of text and vision projection layers. logit_scale_init_value (`float`, *optional*, defaults to 2.6592): @@ -327,12 +345,12 @@ class CLAPConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config - >>> # We can also initialize a CLAPConfig from a CLAPTextConfig and a CLAPVisionConfig - >>> from transformers import CLAPTextConfig, CLAPVisionConfig + >>> # We can also initialize a CLAPConfig from a CLAPTextConfig and a CLAPAudioConfig + >>> from transformers import CLAPTextConfig, CLAPAudioConfig >>> # Initializing a CLAPText and CLAPVision configuration >>> config_text = CLAPTextConfig() - >>> config_vision = CLAPVisionConfig() + >>> config_vision = CLAPAudioConfig() >>> config = CLAPConfig.from_text_vision_configs(config_text, config_vision) ```""" @@ -341,12 +359,12 @@ class CLAPConfig(PretrainedConfig): is_composition = True def __init__( - self, - text_config=None, - vision_config=None, - logit_scale_init_value=2.6592, + self, + text_config=None, + vision_config=None, + logit_scale_init_value=2.6592, fusion_num_hidden_layers=2, - projection_dim=512, + projection_dim=512, projection_hidden_act="relu", **kwargs ): @@ -366,10 +384,10 @@ def __init__( if vision_config is None: vision_config = {} - logger.info("vision_config is None. initializing the CLAPVisionConfig with default values.") + logger.info("vision_config is None. initializing the CLAPAudioConfig with default values.") self.text_config = CLAPTextConfig(**text_config) - self.vision_config = CLAPVisionConfig(**vision_config) + self.vision_config = CLAPAudioConfig(**vision_config) self.text_config.fusion_num_hidden_layers = fusion_num_hidden_layers self.vision_config.fusion_num_hidden_layers = fusion_num_hidden_layers @@ -388,7 +406,7 @@ def __init__( self.initializer_factor = 1.0 @classmethod - def from_text_vision_configs(cls, text_config: CLAPTextConfig, vision_config: CLAPVisionConfig, **kwargs): + def from_text_vision_configs(cls, text_config: CLAPTextConfig, vision_config: CLAPAudioConfig, **kwargs): r""" Instantiate a [`CLAPConfig`] (or a derived class) from clap text model configuration and clap vision model configuration. 
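A minimal usage sketch, assuming the classes this patch wires into `transformers.__init__` (`CLAPTextConfig`, `CLAPAudioConfig`, `CLAPConfig`); it only restates the composition shown in the docstring above, with illustrative hyper-parameter values:

```python
from transformers import CLAPAudioConfig, CLAPConfig, CLAPTextConfig

# Sub-configurations: after this refactor the audio side exposes the Swin-style
# hyper-parameters (depths, num_heads, window_size, swin_* kwargs) directly.
text_config = CLAPTextConfig()
audio_config = CLAPAudioConfig(depths=[2, 2, 6, 2], num_heads=[4, 8, 16, 32], window_size=8)

# Compose into a full CLAP configuration; at this point in the series the helper
# still carries the pre-rename "vision" wording.
config = CLAPConfig.from_text_vision_configs(text_config, audio_config)
```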
@@ -410,4 +428,4 @@ def to_dict(self): output["text_config"] = self.text_config.to_dict() output["vision_config"] = self.vision_config.to_dict() output["model_type"] = self.__class__.model_type - return output \ No newline at end of file + return output diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index cf31ef053127..d43b4ea9b21d 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -26,6 +26,7 @@ logger = logging.get_logger(__name__) + # Copied from transformers.models.whisper.feature_extraction_whisper.WhisperFeatureExtractor with Whisper->CLAP class CLAPFeatureExtractor(SequenceFeatureExtractor): r""" diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index c4288794362e..939723f68139 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -15,22 +15,26 @@ """ PyTorch CLAP model.""" import collections import math - +import random from dataclasses import dataclass -from typing import Any, Optional, Tuple, Union, List -import numpy as np - - from itertools import repeat +from typing import Any, List, Optional, Tuple, Union + +import numpy as np import torch -import torch.utils.checkpoint import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint from torch import nn +from torch.nn.init import _calculate_fan_in_and_fan_out from ...activations import ACT2FN -from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer -from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, +) from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, add_start_docstrings, @@ -38,12 +42,7 @@ logging, replace_return_docstrings, ) -from ...activations import ACT2FN, gelu -from ...modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPoolingAndCrossAttentions, -) -from .configuration_clap import CLAPConfig, CLAPTextConfig, CLAPVisionConfig +from .configuration_clap import CLAPAudioConfig, CLAPConfig, CLAPTextConfig logger = logging.get_logger(__name__) @@ -56,6 +55,36 @@ ] +def do_mixup(x, mixup_lambda): + """ + Args: + x: (batch_size , ...) + mixup_lambda: (batch_size,) + Returns: + out: (batch_size, ...) + """ + out = ( + x.transpose(0, -1) * mixup_lambda + torch.flip(x, dims=[0]).transpose(0, -1) * (1 - mixup_lambda) + ).transpose(0, -1) + return out + + +def interpolate(x, ratio): + """Interpolate data in time domain. This is used to compensate the + resolution reduction in downsampling of a CNN. 
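+    For example, with `ratio=4` an input of shape (batch_size, 10, classes_num) is upsampled to
+    (batch_size, 40, classes_num) by repeating every time step 4 times.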
+ + Args: + x: (batch_size, time_steps, classes_num) + ratio: int, ratio to interpolate + Returns: + upsampled: (batch_size, time_steps * ratio, classes_num) + """ + (batch_size, time_steps, classes_num) = x.shape + upsampled = x[:, :, None, :].repeat(1, 1, ratio, 1) + upsampled = upsampled.reshape(batch_size, time_steps * ratio, classes_num) + return upsampled + + # Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): """ @@ -73,7 +102,6 @@ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_l return incremental_indices.long() + padding_idx - @dataclass # Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->CLAP class CLAPTextModelOutput(ModelOutput): @@ -142,15 +170,16 @@ def to_tuple(self) -> Tuple[Any]: ) - # from PyTorch internals def _ntuple(n): def parse(x): if isinstance(x, collections.abc.Iterable): return x return tuple(repeat(x, n)) + return parse + to_1tuple = _ntuple(1) to_2tuple = _ntuple(2) to_3tuple = _ntuple(3) @@ -158,15 +187,14 @@ def parse(x): to_ntuple = _ntuple -def drop_path(x, drop_prob: float = 0., training: bool = False): +def drop_path(x, drop_prob: float = 0.0, training: bool = False): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, - the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... - See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for - changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use - 'survival rate' as the argument. + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, the original name is + misleading as 'Drop Connect' is a different form of dropout in a separate paper... See discussion: + https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the layer and + argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the argument. """ - if drop_prob == 0. or not training: + if drop_prob == 0.0 or not training: return x keep_prob = 1 - drop_prob shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets @@ -176,9 +204,52 @@ def drop_path(x, drop_prob: float = 0., training: bool = False): return output -class DropPath(nn.Module): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+class CLAPAudioAFFBlock(nn.Module): + r""" + TODO: add docstring """ + + def __init__(self, channels=64, r=4): + super(CLAPAudioAFFBlock, self).__init__() + inter_channels = int(channels // r) + + self.local_att = nn.Sequential( + nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2d(inter_channels), + nn.ReLU(inplace=True), + nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2d(channels), + ) + self.global_att = nn.Sequential( + nn.AdaptiveAvgPool2d(1), + nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2d(inter_channels), + nn.ReLU(inplace=True), + nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2d(channels), + ) + + self.sigmoid = nn.Sigmoid() + + def forward(self, x, residual): + flag = False + xa = x + residual + if xa.size(0) == 1: + xa = torch.cat([xa, xa], dim=0) + flag = True + xl = self.local_att(xa) + xg = self.global_att(xa) + xlg = xl + xg + wei = self.sigmoid(xlg) + xo = 2 * x * wei + 2 * residual * (1 - wei) + if flag: + xo = xo[0].unsqueeze(0) + return xo + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + def __init__(self, drop_prob=None): super(DropPath, self).__init__() self.drop_prob = drop_prob @@ -186,92 +257,101 @@ def __init__(self, drop_prob=None): def forward(self, x): return drop_path(x, self.drop_prob, self.training) -class PatchEmbed(nn.Module): - """ 2D Image to Patch Embedding - """ - def __init__(self, img_size=224, patch_size=16, in_chans=3, hidden_size=768, norm_layer=None, flatten=True, patch_stride = 16, - enable_fusion=False, fusion_type='None'): + +class CLAPAudioPatchEmbed(nn.Module): + """2D Image to Patch Embedding""" + + def __init__(self, config: CLAPAudioConfig): super().__init__() - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - patch_stride = to_2tuple(patch_stride) + img_size = to_2tuple(config.spec_size) + patch_size = to_2tuple(config.patch_size) + patch_stride = to_2tuple(config.patch_stride) + self.img_size = img_size - self.patch_size = patch_size self.patch_stride = patch_stride + self.grid_size = (img_size[0] // patch_stride[0], img_size[1] // patch_stride[1]) self.num_patches = self.grid_size[0] * self.grid_size[1] - self.flatten = flatten - self.in_chans = in_chans - self.hidden_size = hidden_size + self.flatten = config.flatten_patch_embeds + self.enable_fusion = config.enable_patch_layer_norm + self.fusion_type = config.fusion_type - self.enable_fusion = enable_fusion - self.fusion_type = fusion_type - padding = ((patch_size[0] - patch_stride[0]) // 2, (patch_size[1] - patch_stride[1]) // 2) - if (self.enable_fusion) and (self.fusion_type == 'channel_map'): - self.proj = nn.Conv2d(in_chans*4, hidden_size, kernel_size=patch_size, stride=patch_stride, padding=padding) + if (self.enable_fusion) and (self.fusion_type == "channel_map"): + self.proj = nn.Conv2d( + config.patch_embed_input_channels * 4, + config.patch_embeds_hidden_size, + kernel_size=patch_size, + stride=patch_stride, + padding=padding, + ) else: - self.proj = nn.Conv2d(in_chans, hidden_size, kernel_size=patch_size, stride=patch_stride, padding=padding) - self.norm = norm_layer(hidden_size) if norm_layer else nn.Identity() - - if (self.enable_fusion) and (self.fusion_type in ['daf_2d','aff_2d','iaff_2d']): - self.mel_conv2d = nn.Conv2d(in_chans, hidden_size, kernel_size=(patch_size[0], patch_size[1]*3), 
stride=(patch_stride[0], patch_stride[1] * 3), padding=padding) - if self.fusion_type == 'daf_2d': - self.fusion_model = DAF() - elif self.fusion_type == 'aff_2d': - self.fusion_model = AFF(channels=hidden_size, type='2D') - elif self.fusion_type == 'iaff_2d': - self.fusion_model = iAFF(channels=hidden_size, type='2D') - def forward(self, x, longer_idx = None): - if (self.enable_fusion) and (self.fusion_type in ['daf_2d','aff_2d','iaff_2d']): - global_x = x[:,0:1,:,:] - + self.proj = nn.Conv2d( + config.patch_embed_input_channels, + config.patch_embeds_hidden_size, + kernel_size=patch_size, + stride=patch_stride, + padding=padding, + ) + + self.norm = nn.LayerNorm(config.patch_embeds_hidden_size) if config.enable_patch_layer_norm else nn.Identity() + if self.enable_fusion: + self.fusion_model = CLAPAudioAFFBlock(channels=config.patch_embeds_hidden_size) + + def forward(self, x, longer_idx=None): + if self.enable_fusion: + global_x = x[:, 0:1, :, :] # global processing B, C, H, W = global_x.shape - assert H == self.img_size[0] and W == self.img_size[1], \ - f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + assert ( + H == self.img_size[0] and W == self.img_size[1] + ), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." global_x = self.proj(global_x) TW = global_x.size(-1) if len(longer_idx) > 0: # local processing - local_x = x[longer_idx,1:,:,:].contiguous() + local_x = x[longer_idx, 1:, :, :].contiguous() B, C, H, W = local_x.shape - local_x = local_x.view(B*C,1,H,W) + local_x = local_x.view(B * C, 1, H, W) local_x = self.mel_conv2d(local_x) - local_x = local_x.view(B,C,local_x.size(1),local_x.size(2),local_x.size(3)) - local_x = local_x.permute((0,2,3,1,4)).contiguous().flatten(3) - TB,TC,TH,_ = local_x.size() + local_x = local_x.view(B, C, local_x.size(1), local_x.size(2), local_x.size(3)) + local_x = local_x.permute((0, 2, 3, 1, 4)).contiguous().flatten(3) + TB, TC, TH, _ = local_x.size() if local_x.size(-1) < TW: - local_x = torch.cat([local_x, torch.zeros((TB,TC,TH,TW-local_x.size(-1)), device=global_x.device)], dim=-1) + local_x = torch.cat( + [local_x, torch.zeros((TB, TC, TH, TW - local_x.size(-1)), device=global_x.device)], dim=-1 + ) else: - local_x = local_x[:,:,:,:TW] - - global_x[longer_idx] = self.fusion_model(global_x[longer_idx],local_x) + local_x = local_x[:, :, :, :TW] + + global_x[longer_idx] = self.fusion_model(global_x[longer_idx], local_x) x = global_x else: B, C, H, W = x.shape - assert H == self.img_size[0] and W == self.img_size[1], \ - f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + assert ( + H == self.img_size[0] and W == self.img_size[1] + ), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
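+            # no fusion: patchify the input spectrogram directly with the Conv2d projection defined in __init__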
x = self.proj(x) - + if self.flatten: x = x.flatten(2).transpose(1, 2) # BCHW -> BNC x = self.norm(x) return x -class Mlp(nn.Module): - """ MLP as used in Vision Transformer, MLP-Mixer and related networks - """ - def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + +class CLAPAudioMLP(nn.Module): + """MLP as used in Vision Transformer, MLP-Mixer and related networks""" + + def __init__(self, in_features, hidden_features=None, out_features=None, config=None): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() + self.act = ACT2FN[config.swin_hidden_act] self.fc2 = nn.Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) + self.drop = nn.Dropout(config.swin_drop_rate) def forward(self, x): x = self.fc1(x) @@ -281,17 +361,13 @@ def forward(self, x): x = self.drop(x) return x + def _no_grad_trunc_normal_(tensor, mean, std, a, b): # Cut & paste from PyTorch official master until it's in a few official releases - RW # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf def norm_cdf(x): # Computes standard normal cumulative distribution function - return (1. + math.erf(x / math.sqrt(2.))) / 2. - - if (mean < a - 2 * std) or (mean > b + 2 * std): - warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " - "The distribution of values may be incorrect.", - stacklevel=2) + return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 with torch.no_grad(): # Values are generated by using a truncated uniform distribution and @@ -309,7 +385,7 @@ def norm_cdf(x): tensor.erfinv_() # Transform to proper mean, std - tensor.mul_(std * math.sqrt(2.)) + tensor.mul_(std * math.sqrt(2.0)) tensor.add_(mean) # Clamp to ensure it's in the proper range @@ -317,41 +393,38 @@ def norm_cdf(x): return tensor -def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): +def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): # type: (Tensor, float, float, float, float) -> Tensor r"""Fills the input Tensor with values drawn from a truncated - normal distribution. The values are effectively drawn from the - normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` - with values outside :math:`[a, b]` redrawn until they are within - the bounds. The method used for generating the random values works - best when :math:`a \leq \text{mean} \leq b`. Args: + normal distribution. The values are effectively drawn from the normal distribution :math:`\mathcal{N}(\text{mean}, + \text{std}^2)` with values outside :math:`[a, b]` redrawn until they are within the bounds. The method used for + generating the random values works best when :math:`a \leq \text{mean} \leq b`. 
tensor: an n-dimensional `torch.Tensor` mean: the mean of the normal distribution std: the standard deviation of the normal distribution a: the minimum cutoff value b: the maximum cutoff value Examples: - >>> w = torch.empty(3, 5) - >>> nn.init.trunc_normal_(w) + >>> w = torch.empty(3, 5) >>> nn.init.trunc_normal_(w) """ return _no_grad_trunc_normal_(tensor, mean, std, a, b) -def variance_scaling_(tensor, scale=1.0, mode='fan_in', distribution='normal'): +def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"): fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) - if mode == 'fan_in': + if mode == "fan_in": denom = fan_in - elif mode == 'fan_out': + elif mode == "fan_out": denom = fan_out - elif mode == 'fan_avg': + elif mode == "fan_avg": denom = (fan_in + fan_out) / 2 variance = scale / denom if distribution == "truncated_normal": # constant is stddev of standard normal truncated to (-2, 2) - trunc_normal_(tensor, std=math.sqrt(variance) / .87962566103423978) + trunc_normal_(tensor, std=math.sqrt(variance) / 0.87962566103423978) elif distribution == "normal": tensor.normal_(std=math.sqrt(variance)) elif distribution == "uniform": @@ -362,7 +435,8 @@ def variance_scaling_(tensor, scale=1.0, mode='fan_in', distribution='normal'): def lecun_normal_(tensor): - variance_scaling_(tensor, mode='fan_in', distribution='truncated_normal') + variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal") + def window_partition(x, window_size): """ @@ -394,31 +468,20 @@ def window_reverse(windows, window_size, H, W): return x -class WindowAttention(nn.Module): - r""" Window based multi-head self attention (W-MSA) module with relative position bias. - It supports both of shifted and non-shifted window. - Args: - dim (int): Number of input channels. - window_size (tuple[int]): The height and width of the window. - num_heads (int): Number of attention heads. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set - attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 - proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 - """ - - def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.): +class CLAPAudioWindowAttention(nn.Module): + def __init__(self, dim, window_size, num_heads, config=None): super().__init__() self.dim = dim self.window_size = window_size # Wh, Ww self.num_heads = num_heads head_dim = dim // num_heads - self.scale = qk_scale or head_dim ** -0.5 + self.scale = head_dim**-0.5 # define a parameter table of relative position bias self.relative_position_bias_table = nn.Parameter( - torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads) + ) # 2*Wh-1 * 2*Ww-1, nH # get pair-wise relative position index for each token inside the window coords_h = torch.arange(self.window_size[0]) @@ -433,12 +496,12 @@ def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, at relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww self.register_buffer("relative_position_index", relative_position_index) - self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) + self.qkv = nn.Linear(dim, dim * 3, bias=config.swin_qkv_bias) + self.attn_drop = nn.Dropout(config.swin_attention_drop_rate) self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) + self.proj_drop = nn.Dropout(config.swin_drop_rate) - trunc_normal_(self.relative_position_bias_table, std=.02) + trunc_normal_(self.relative_position_bias_table, std=0.02) self.softmax = nn.Softmax(dim=-1) def forward(self, x, mask=None): @@ -452,10 +515,11 @@ def forward(self, x, mask=None): q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) q = q * self.scale - attn = (q @ k.transpose(-2, -1)) + attn = q @ k.transpose(-2, -1) relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( - self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 + ) # Wh*Ww,Wh*Ww,nH relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww attn = attn + relative_position_bias.unsqueeze(0) @@ -475,69 +539,63 @@ def forward(self, x, mask=None): return x, attn def extra_repr(self): - return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}' - + return f"dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}" -class SwinTransformerBlock(nn.Module): - r""" Swin Transformer Block. - Args: - dim (int): Number of input channels. - input_resolution (tuple[int]): Input resulotion. - num_heads (int): Number of attention heads. - window_size (int): Window size. - shift_size (int): Shift size for SW-MSA. - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. - drop (float, optional): Dropout rate. Default: 0.0 - attn_drop (float, optional): Attention dropout rate. Default: 0.0 - drop_path (float, optional): Stochastic depth rate. Default: 0.0 - act_layer (nn.Module, optional): Activation layer. Default: nn.GELU - norm_layer (nn.Module, optional): Normalization layer. 
Default: nn.LayerNorm - """ - def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0, - mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., - act_layer=nn.GELU, norm_layer=nn.LayerNorm, norm_before_mlp='ln'): +class CLAPAudioSwinTransformerBlock(nn.Module): + def __init__( + self, + hidden_dim, + input_resolution, + num_heads, + shift_size=0, + drop_path=0.0, + config=None, + ): super().__init__() - self.dim = dim + self.hidden_dim = hidden_dim self.input_resolution = input_resolution self.num_heads = num_heads - self.window_size = window_size + self.window_size = config.window_size self.shift_size = shift_size - self.mlp_ratio = mlp_ratio - self.norm_before_mlp = norm_before_mlp + self.mlp_ratio = config.swin_mlp_ratio + self.norm_before_mlp = config.swin_norm_before_mlp + if min(self.input_resolution) <= self.window_size: # if window size is larger than input resolution, we don't partition windows self.shift_size = 0 self.window_size = min(self.input_resolution) assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" - self.norm1 = norm_layer(dim) - self.attn = WindowAttention( - dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, - qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + self.norm1 = nn.LayerNorm(hidden_dim) + self.attn = CLAPAudioWindowAttention( + hidden_dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, config=config + ) - self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() - if self.norm_before_mlp == 'ln': - self.norm2 = nn.LayerNorm(dim) - elif self.norm_before_mlp == 'bn': - self.norm2 = lambda x: nn.BatchNorm1d(dim)(x.transpose(1, 2)).transpose(1, 2) + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + if self.norm_before_mlp == "ln": + self.norm2 = nn.LayerNorm(hidden_dim) + elif self.norm_before_mlp == "bn": + self.norm2 = lambda x: nn.BatchNorm1d(hidden_dim)(x.transpose(1, 2)).transpose(1, 2) else: raise NotImplementedError - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + mlp_hidden_dim = int(hidden_dim * self.mlp_ratio) + self.mlp = CLAPAudioMLP(in_features=hidden_dim, hidden_features=mlp_hidden_dim, config=config) if self.shift_size > 0: # calculate attention mask for SW-MSA H, W = self.input_resolution img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1 - h_slices = (slice(0, -self.window_size), - slice(-self.window_size, -self.shift_size), - slice(-self.shift_size, None)) - w_slices = (slice(0, -self.window_size), - slice(-self.window_size, -self.shift_size), - slice(-self.shift_size, None)) + h_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + w_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) cnt = 0 for h in h_slices: for w in w_slices: @@ -597,8 +655,10 @@ def forward(self, x): return x, attn def extra_repr(self): - return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \ - f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}" + return ( + f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " + f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}" + ) # contrastive loss function, adapted 
from @@ -751,7 +811,7 @@ def __init__(self, config: CLAPTextConfig): hidden_size = config.projection_dim self.activation = ACT2FN[config.hidden_act] self.dropout = nn.Dropout(config.hidden_dropout_prob) - + self.linear = nn.Linear(hidden_size, hidden_size) def forward(self, hidden_states): @@ -767,7 +827,7 @@ def __init__(self, config: CLAPTextConfig): self.config = config hidden_size = config.projection_hidden_size projection_dim = config.projection_dim - + self.linear1 = nn.Linear(hidden_size, projection_dim) self.activation = ACT2FN[config.projection_hidden_act] self.linear2 = nn.Linear(projection_dim, projection_dim) @@ -783,7 +843,7 @@ class CLAPFusionLayer(nn.Module): def __init__(self, config: CLAPTextConfig): super().__init__() self.config = config - + self.layers = nn.ModuleList([CLAPFusionBlock(config) for _ in range(config.fusion_num_hidden_layers)]) def forward(self, hidden_states): @@ -1311,13 +1371,12 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return pooled_output - - class CLAPTextPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ + config_class = CLAPTextConfig base_model_prefix = "claptext" supports_gradient_checkpointing = True @@ -1530,7 +1589,6 @@ def forward( ) - # Copied from transformers.models.clip.modeling_clip.CLIPPreTrainedModel with CLIP->CLAP,clip->clap class CLAPPreTrainedModel(PreTrainedModel): """ @@ -1597,10 +1655,9 @@ def _init_weights(self, module): # if isinstance(module, nn.Linear) and module.bias is not None: # module.bias.data.zero_() - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, CLAPEncoder): - module.gradient_checkpointing = value - + # def _set_gradient_checkpointing(self, module, value=False): + # if isinstance(module, CLAPEncoder): + # module.gradient_checkpointing = value @add_start_docstrings(CLAP_START_DOCSTRING) @@ -1616,9 +1673,9 @@ def __init__(self, config: CLAPConfig): f" {type(config.text_config)}." ) - if not isinstance(config.vision_config, CLAPVisionConfig): + if not isinstance(config.vision_config, CLAPAudioConfig): raise ValueError( - "config.vision_config is expected to be of type CLAPVisionConfig but is of type" + "config.vision_config is expected to be of type CLAPAudioConfig but is of type" f" {type(config.vision_config)}." ) @@ -1632,14 +1689,13 @@ def __init__(self, config: CLAPConfig): self.text_hidden_size = text_config.hidden_size self.vision_hidden_size = vision_config.hidden_size - self.text_model = CLAPTextModel(text_config) self.text_transform = CLAPFusionLayer(text_config) - self.text_projection = CLAPProjectionLayer(text_config) + self.text_projection = CLAPProjectionLayer(text_config) - self.audio_model = HTSAT_Swin_Transformer(config=vision_config) + self.audio_model = CLAPSwinTransformer(config=vision_config) self.audio_transform = CLAPFusionLayer(vision_config) - self.audio_projection = CLAPProjectionLayer(vision_config) + self.audio_projection = CLAPProjectionLayer(vision_config) # Initialize weights and apply final processing self.post_init() @@ -1834,6 +1890,7 @@ def forward( vision_model_output=vision_outputs, ) + @add_start_docstrings( """ CLAP Text Model with a projection layer on top (a linear layer on top of the pooled output). @@ -1910,8 +1967,8 @@ def forward( ) -class BasicLayer(nn.Module): - """ A basic Swin Transformer layer for one stage. +class CLAPAudioLayer(nn.Module): + """A basic Swin Transformer layer for one stage. 
Args: dim (int): Number of input channels. input_resolution (tuple[int]): Input resolution. @@ -1929,32 +1986,47 @@ class BasicLayer(nn.Module): use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. """ - def __init__(self, dim, input_resolution, depth, num_heads, window_size, - mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., - drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False, - norm_before_mlp='ln'): - + def __init__(self, config, idx_layer=0, patches_resolution=0): super().__init__() - self.dim = dim + + hidden_dim = config.hidden_size * 2**idx_layer + input_resolution = (patches_resolution[0] // (2**idx_layer), patches_resolution[1] // (2**idx_layer)) + depth = config.depths[idx_layer] + num_heads = config.num_heads[idx_layer] + window_size = config.window_size + norm_layer = nn.LayerNorm if config.enable_patch_layer_norm else None + + use_checkpoint = config.swin_use_checkpoint + downsample = CLAPAudioPatchMerging if (idx_layer < len(config.depths) - 1) else None + + dpr = [ + x.item() for x in torch.linspace(0, config.swin_drop_path_rate, sum(config.depths)) + ] # stochastic depth decay rule + drop_path = dpr[sum(config.depths[:idx_layer]) : sum(config.depths[: idx_layer + 1])] + + # self.dim = dim self.input_resolution = input_resolution self.depth = depth self.use_checkpoint = use_checkpoint # build blocks - self.blocks = nn.ModuleList([ - SwinTransformerBlock(dim=dim, input_resolution=input_resolution, - num_heads=num_heads, window_size=window_size, - shift_size=0 if (i % 2 == 0) else window_size // 2, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, qk_scale=qk_scale, - drop=drop, attn_drop=attn_drop, - drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, - norm_layer=norm_layer, norm_before_mlp=norm_before_mlp) - for i in range(depth)]) + self.blocks = nn.ModuleList( + [ + CLAPAudioSwinTransformerBlock( + hidden_dim=hidden_dim, + input_resolution=input_resolution, + num_heads=num_heads, + shift_size=0 if (i % 2 == 0) else window_size // 2, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + config=config, + ) + for i in range(depth) + ] + ) # patch merging layer if downsample is not None: - self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer) + self.downsample = downsample(input_resolution, dim=hidden_dim, norm_layer=norm_layer) else: self.downsample = None @@ -1970,12 +2042,13 @@ def forward(self, x): if self.downsample is not None: x = self.downsample(x) if not self.training: - attn = torch.cat(attns, dim = 0) - attn = torch.mean(attn, dim = 0) + attn = torch.cat(attns, dim=0) + attn = torch.mean(attn, dim=0) return x, attn -class PatchMerging(nn.Module): - r""" Patch Merging Layer. + +class CLAPAudioPatchMerging(nn.Module): + r"""Patch Merging Layer. Args: input_resolution (tuple[int]): Resolution of input feature. dim (int): Number of input channels. @@ -2016,155 +2089,76 @@ def extra_repr(self): return f"input_resolution={self.input_resolution}, dim={self.dim}" - - # The Core of HTSAT -class HTSAT_Swin_Transformer(nn.Module): - r"""HTSAT based on the Swin Transformer - Args: - spec_size (int | tuple(int)): Input Spectrogram size. Default 256 - patch_size (int | tuple(int)): Patch size. Default: 4 - path_stride (iot | tuple(int)): Patch Stride for Frequency and Time Axis. Default: 4 - in_chans (int): Number of input image channels. Default: 1 (mono) - num_classes (int): Number of classes for classification head. 
Default: 527 - hidden_size (int): Patch embedding dimension. Default: 96 - depths (tuple(int)): Depth of each HTSAT-Swin Transformer layer. - num_heads (tuple(int)): Number of attention heads in different layers. - window_size (int): Window size. Default: 8 - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 - qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None - drop_rate (float): Dropout rate. Default: 0 - attn_drop_rate (float): Attention dropout rate. Default: 0 - drop_path_rate (float): Stochastic depth rate. Default: 0.1 - norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. - ape (bool): If True, add absolute position embedding to the patch embedding. Default: False - patch_norm (bool): If True, add normalization after patch embedding. Default: True - use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False - config (module): The configuration Module from config.py - """ - - def __init__(self, spec_size=256, patch_size=4, patch_stride=(4,4), - in_chans=1, num_classes=527, - hidden_size=96, depths=[2, 2, 6, 2], num_heads=[4, 8, 16, 32], - window_size=8, mlp_ratio=4., qkv_bias=True, qk_scale=None, - drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, - norm_layer=nn.LayerNorm, - ape=False, patch_norm=True, - use_checkpoint=False, norm_before_mlp='ln', config = None, - enable_fusion = False, fusion_type = 'None', **kwargs): - super(HTSAT_Swin_Transformer, self).__init__() +class CLAPSwinTransformer(nn.Module): + def __init__(self, config: CLAPAudioConfig): + super(CLAPSwinTransformer, self).__init__() self.config = config - self.spec_size = spec_size - self.patch_stride = patch_stride - self.patch_size = patch_size - self.window_size = window_size - self.hidden_size = hidden_size - self.depths = depths - self.ape = ape - self.in_chans = in_chans - self.num_classes = num_classes - self.num_heads = num_heads + self.spec_size = config.spec_size + self.patch_stride = config.patch_stride + self.window_size = config.window_size + self.hidden_size = config.hidden_size + self.depths = config.depths + self.use_absolute_pos_embedding = config.swin_absolute_positional_embedding + self.in_chans = config.input_channels + self.num_classes = config.num_classes + self.num_heads = config.num_heads self.num_layers = len(self.depths) self.num_features = int(self.hidden_size * 2 ** (self.num_layers - 1)) - - self.drop_rate = drop_rate - self.attn_drop_rate = attn_drop_rate - self.drop_path_rate = drop_path_rate - self.qkv_bias = qkv_bias - self.qk_scale = None + self.drop_rate = config.swin_drop_rate + self.attn_drop_rate = config.swin_attention_drop_rate + self.drop_path_rate = config.swin_drop_path_rate - self.patch_norm = patch_norm - self.norm_layer = norm_layer if self.patch_norm else None - self.norm_before_mlp = norm_before_mlp - self.mlp_ratio = mlp_ratio + self.qkv_bias = config.swin_qkv_bias - self.use_checkpoint = use_checkpoint + self.patch_norm = nn.LayerNorm if config.enable_patch_layer_norm else None + self.norm_layer = nn.LayerNorm if self.patch_norm else None + self.norm_before_mlp = config.swin_norm_before_mlp + self.mlp_ratio = config.swin_mlp_ratio + + self.use_checkpoint = config.swin_use_checkpoint - self.enable_fusion = enable_fusion - self.fusion_type = fusion_type + self.enable_fusion = config.enable_fusion + self.fusion_type = config.fusion_type # process mel-spec ; used only once self.freq_ratio = 
self.spec_size // self.config.mel_bins - window = 'hann' - center = True - pad_mode = 'reflect' - ref = 1.0 - amin = 1e-10 - top_db = None - self.interpolate_ratio = 32 # Downsampled ratio + + self.interpolate_ratio = 32 # Downsampled ratio # Spectrogram extractor self.bn0 = nn.BatchNorm2d(self.config.mel_bins) - # split spctrogram into non-overlapping patches - self.patch_embed = PatchEmbed( - img_size=self.spec_size, patch_size=self.patch_size, in_chans=self.in_chans, - hidden_size=self.hidden_size, norm_layer=self.norm_layer, patch_stride = patch_stride, - enable_fusion=self.enable_fusion, fusion_type=self.fusion_type - ) + self.patch_embed = CLAPAudioPatchEmbed(config) num_patches = self.patch_embed.num_patches patches_resolution = self.patch_embed.grid_size self.patches_resolution = patches_resolution # absolute position embedding - if self.ape: + if self.use_absolute_pos_embedding: self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, self.hidden_size)) - trunc_normal_(self.absolute_pos_embed, std=.02) + trunc_normal_(self.absolute_pos_embed, std=0.02) self.pos_drop = nn.Dropout(p=self.drop_rate) - # stochastic depth - dpr = [x.item() for x in torch.linspace(0, self.drop_path_rate, sum(self.depths))] # stochastic depth decay rule - # build layers self.layers = nn.ModuleList() for i_layer in range(self.num_layers): - layer = BasicLayer(dim=int(self.hidden_size * 2 ** i_layer), - input_resolution=(patches_resolution[0] // (2 ** i_layer), - patches_resolution[1] // (2 ** i_layer)), - depth=self.depths[i_layer], - num_heads=self.num_heads[i_layer], - window_size=self.window_size, - mlp_ratio=self.mlp_ratio, - qkv_bias=self.qkv_bias, qk_scale=self.qk_scale, - drop=self.drop_rate, attn_drop=self.attn_drop_rate, - drop_path=dpr[sum(self.depths[:i_layer]):sum(self.depths[:i_layer + 1])], - norm_layer=self.norm_layer, - downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, - use_checkpoint=use_checkpoint, - norm_before_mlp=self.norm_before_mlp) + layer = CLAPAudioLayer(config=config, patches_resolution=patches_resolution, idx_layer=i_layer) self.layers.append(layer) self.norm = self.norm_layer(self.num_features) self.avgpool = nn.AdaptiveAvgPool1d(1) self.maxpool = nn.AdaptiveMaxPool1d(1) - + SF = self.spec_size // (2 ** (len(self.depths) - 1)) // self.patch_stride[0] // self.freq_ratio self.tscam_conv = nn.Conv2d( - in_channels = self.num_features, - out_channels = self.num_classes, - kernel_size = (SF,3), - padding = (0,1) + in_channels=self.num_features, out_channels=self.num_classes, kernel_size=(SF, 3), padding=(0, 1) ) - self.head = nn.Linear(num_classes, num_classes) - - if (self.enable_fusion) and (self.fusion_type in ['daf_1d','aff_1d','iaff_1d']): - self.mel_conv1d = nn.Sequential( - nn.Conv1d(64, 64, kernel_size=5, stride=3, padding=2), - nn.BatchNorm1d(64) - ) - if self.fusion_type == 'daf_1d': - self.fusion_model = DAF() - elif self.fusion_type == 'aff_1d': - self.fusion_model = AFF(channels=64, type='1D') - elif self.fusion_type == 'iaff_1d': - self.fusion_model = iAFF(channels=64, type='1D') - - self.apply(self._init_weights) + self.head = nn.Linear(self.num_classes, self.num_classes) def _init_weights(self, m): pass @@ -2178,19 +2172,18 @@ def _init_weights(self, m): @torch.jit.ignore def no_weight_decay(self): - return {'absolute_pos_embed'} + return {"absolute_pos_embed"} @torch.jit.ignore def no_weight_decay_keywords(self): - return {'relative_position_bias_table'} - + return {"relative_position_bias_table"} - def forward_features(self, x, 
longer_idx = None): + def forward_features(self, x, longer_idx=None): # A deprecated optimization for using a hierarchical output from different blocks - frames_num = x.shape[2] - x = self.patch_embed(x, longer_idx = longer_idx) - if self.ape: + frames_num = x.shape[2] + x = self.patch_embed(x, longer_idx=longer_idx) + if self.use_absolute_pos_embedding: x = x + self.absolute_pos_embed x = self.pos_drop(x) for i, layer in enumerate(self.layers): @@ -2200,39 +2193,41 @@ def forward_features(self, x, longer_idx = None): B, N, C = x.shape SF = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[0] ST = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[1] - x = x.permute(0,2,1).contiguous().reshape(B, C, SF, ST) + x = x.permute(0, 2, 1).contiguous().reshape(B, C, SF, ST) B, C, F, T = x.shape # group 2D CNN c_freq_bin = F // self.freq_ratio x = x.reshape(B, C, F // c_freq_bin, c_freq_bin, T) - x = x.permute(0,1,3,2,4).contiguous().reshape(B, C, c_freq_bin, -1) + x = x.permute(0, 1, 3, 2, 4).contiguous().reshape(B, C, c_freq_bin, -1) # get latent_output - fine_grained_latent_output = torch.mean(x, dim = 2) - fine_grained_latent_output = interpolate(fine_grained_latent_output.permute(0,2,1).contiguous(), 8 * self.patch_stride[1]) - - latent_output = self.avgpool(torch.flatten(x,2)) + fine_grained_latent_output = torch.mean(x, dim=2) + fine_grained_latent_output = interpolate( + fine_grained_latent_output.permute(0, 2, 1).contiguous(), 8 * self.patch_stride[1] + ) + + latent_output = self.avgpool(torch.flatten(x, 2)) latent_output = torch.flatten(latent_output, 1) # display the attention map, if needed x = self.tscam_conv(x) - x = torch.flatten(x, 2) # B, C, T - - fpx = interpolate(torch.sigmoid(x).permute(0,2,1).contiguous(), 8 * self.patch_stride[1]) - + x = torch.flatten(x, 2) # B, C, T + + fpx = interpolate(torch.sigmoid(x).permute(0, 2, 1).contiguous(), 8 * self.patch_stride[1]) + x = self.avgpool(x) x = torch.flatten(x, 1) output_dict = { - 'framewise_output': fpx, # already sigmoided - 'clipwise_output': torch.sigmoid(x), - 'fine_grained_embedding': fine_grained_latent_output, - 'embedding': latent_output + "framewise_output": fpx, # already sigmoided + "clipwise_output": torch.sigmoid(x), + "fine_grained_embedding": fine_grained_latent_output, + "embedding": latent_output, } return output_dict - def crop_wav(self, x, crop_size, spe_pos = None): + def crop_wav(self, x, crop_size, spe_pos=None): time_steps = x.shape[2] tx = torch.zeros(x.shape[0], x.shape[1], crop_size, x.shape[3]).to(x.device) for i in range(len(x)): @@ -2240,7 +2235,7 @@ def crop_wav(self, x, crop_size, spe_pos = None): crop_pos = random.randint(0, time_steps - crop_size - 1) else: crop_pos = spe_pos - tx[i][0] = x[i, 0, crop_pos:crop_pos + crop_size,:] + tx[i][0] = x[i, 0, crop_pos : crop_pos + crop_size, :] return tx # Reshape the wavform to a img size, if you want to use the pretrained swin transformer model @@ -2254,13 +2249,13 @@ def reshape_wav2img(self, x): x = nn.functional.interpolate(x, (target_T, x.shape[3]), mode="bicubic", align_corners=True) if F < target_F: x = nn.functional.interpolate(x, (x.shape[2], target_F), mode="bicubic", align_corners=True) - x = x.permute(0,1,3,2).contiguous() + x = x.permute(0, 1, 3, 2).contiguous() x = x.reshape(x.shape[0], x.shape[1], x.shape[2], self.freq_ratio, x.shape[3] // self.freq_ratio) # print(x.shape) - x = x.permute(0,1,3,2,4).contiguous() + x = x.permute(0, 1, 3, 2, 4).contiguous() x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * 
x.shape[3], x.shape[4]) return x - + # Repeat the wavform to a img size, if you want to use the pretrained swin transformer model def repeat_wat2img(self, x, cur_pos): B, C, T, F = x.shape @@ -2271,13 +2266,15 @@ def repeat_wat2img(self, x, cur_pos): if T < target_T: x = nn.functional.interpolate(x, (target_T, x.shape[3]), mode="bicubic", align_corners=True) if F < target_F: - x = nn.functional.interpolate(x, (x.shape[2], target_F), mode="bicubic", align_corners=True) - x = x.permute(0,1,3,2).contiguous() # B C F T - x = x[:,:,:,cur_pos:cur_pos + self.spec_size] - x = x.repeat(repeats = (1,1,4,1)) + x = nn.functional.interpolate(x, (x.shape[2], target_F), mode="bicubic", align_corners=True) + x = x.permute(0, 1, 3, 2).contiguous() # B C F T + x = x[:, :, :, cur_pos : cur_pos + self.spec_size] + x = x.repeat(repeats=(1, 1, 4, 1)) return x - def forward(self, x: torch.Tensor, mixup_lambda = None, infer_mode = False, device=None):# out_feat_keys: List[str] = None): + def forward( + self, x: torch.Tensor, mixup_lambda=None, infer_mode=False, device=None + ): # out_feat_keys: List[str] = None): if self.enable_fusion and x["longer"].sum() == 0: # if no audio is longer than 10s, then randomly select one audio to be longer @@ -2285,8 +2282,8 @@ def forward(self, x: torch.Tensor, mixup_lambda = None, infer_mode = False, devi if not self.enable_fusion: x = x["waveform"].to(device=device, non_blocking=True) - x = self.spectrogram_extractor(x) # (batch_size, 1, time_steps, freq_bins) - x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins) + x = self.spectrogram_extractor(x) # (batch_size, 1, time_steps, freq_bins) + x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins) x = x.transpose(1, 3) x = self.bn0(x) x = x.transpose(1, 3) @@ -2295,7 +2292,7 @@ def forward(self, x: torch.Tensor, mixup_lambda = None, infer_mode = False, devi if self.training and mixup_lambda is not None: x = do_mixup(x, mixup_lambda) - + x = self.reshape_wav2img(x) output_dict = self.forward_features(x) else: @@ -2305,30 +2302,33 @@ def forward(self, x: torch.Tensor, mixup_lambda = None, infer_mode = False, devi x = self.bn0(x) x = x.transpose(1, 3) longer_list_idx = torch.where(longer_list)[0] - if self.fusion_type in ['daf_1d','aff_1d','iaff_1d']: - new_x = x[:,0:1,:,:].clone().contiguous() + if self.fusion_type in ["daf_1d", "aff_1d", "iaff_1d"]: + new_x = x[:, 0:1, :, :].clone().contiguous() if len(longer_list_idx) > 0: - # local processing - fusion_x_local = x[longer_list_idx,1:,:,:].clone().contiguous() - FB,FC,FT,FF = fusion_x_local.size() + # local processing + fusion_x_local = x[longer_list_idx, 1:, :, :].clone().contiguous() + FB, FC, FT, FF = fusion_x_local.size() fusion_x_local = fusion_x_local.view(FB * FC, FT, FF) - fusion_x_local = torch.permute(fusion_x_local, (0,2,1)).contiguous() + fusion_x_local = torch.permute(fusion_x_local, (0, 2, 1)).contiguous() fusion_x_local = self.mel_conv1d(fusion_x_local) - fusion_x_local = fusion_x_local.view(FB,FC,FF,fusion_x_local.size(-1)) - fusion_x_local = torch.permute(fusion_x_local, (0,2,1,3)).contiguous().flatten(2) + fusion_x_local = fusion_x_local.view(FB, FC, FF, fusion_x_local.size(-1)) + fusion_x_local = torch.permute(fusion_x_local, (0, 2, 1, 3)).contiguous().flatten(2) if fusion_x_local.size(-1) < FT: - fusion_x_local = torch.cat([fusion_x_local, torch.zeros((FB,FF,FT- fusion_x_local.size(-1)), device=device)], dim=-1) + fusion_x_local = torch.cat( + [fusion_x_local, torch.zeros((FB, FF, FT - fusion_x_local.size(-1)), device=device)], + 
dim=-1, + ) else: - fusion_x_local = fusion_x_local[:,:,:FT] + fusion_x_local = fusion_x_local[:, :, :FT] # 1D fusion - new_x = new_x.squeeze(1).permute((0,2,1)).contiguous() + new_x = new_x.squeeze(1).permute((0, 2, 1)).contiguous() new_x[longer_list_idx] = self.fusion_model(new_x[longer_list_idx], fusion_x_local) - x = new_x.permute((0,2,1)).contiguous()[:,None,:,:] + x = new_x.permute((0, 2, 1)).contiguous()[:, None, :, :] else: x = new_x - elif self.fusion_type in ['daf_2d','aff_2d','iaff_2d','channel_map']: - x = x # no change + elif self.enable_fusion: + x = x # no change if self.training: x = self.spec_augmenter(x) @@ -2336,7 +2336,6 @@ def forward(self, x: torch.Tensor, mixup_lambda = None, infer_mode = False, devi x = do_mixup(x, mixup_lambda) x = self.reshape_wav2img(x) - output_dict = self.forward_features(x, longer_idx = longer_list_idx) - + output_dict = self.forward_features(x, longer_idx=longer_list_idx) - return output_dict \ No newline at end of file + return output_dict diff --git a/src/transformers/models/clap/processing_clap.py b/src/transformers/models/clap/processing_clap.py index 79b1d95063d1..12a0d72dd279 100644 --- a/src/transformers/models/clap/processing_clap.py +++ b/src/transformers/models/clap/processing_clap.py @@ -61,8 +61,8 @@ def __call__(self, text=None, audios=None, return_tensors=None, **kwargs): Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `text` and `kwargs` arguments to CLAPTokenizerFast's [`~CLAPTokenizerFast.__call__`] if `text` is not `None` to encode the text. To prepare the audio(s), this method forwards the `audios` and `kwrags` arguments to - CLAPFeatureExtractor's [`~CLAPFeatureExtractor.__call__`] if `audios` is not `None`. Please refer to the doctsring - of the above two methods for more information. + CLAPFeatureExtractor's [`~CLAPFeatureExtractor.__call__`] if `audios` is not `None`. Please refer to the + doctsring of the above two methods for more information. Args: text (`str`, `List[str]`, `List[List[str]]`): @@ -70,9 +70,9 @@ def __call__(self, text=None, audios=None, return_tensors=None, **kwargs): (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). audios (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`): - The audio or batch of audios to be prepared. Each audio can be NumPy array or PyTorch - tensor. In case of a NumPy array/PyTorch tensor, each audio should be of shape (C, T), where C is a - number of channels, and T the sample length of the audio. + The audio or batch of audios to be prepared. Each audio can be NumPy array or PyTorch tensor. In case + of a NumPy array/PyTorch tensor, each audio should be of shape (C, T), where C is a number of channels, + and T the sample length of the audio. return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors of a particular framework. Acceptable values are: @@ -132,7 +132,8 @@ def model_input_names(self): @property def feature_extractor_class(self): warnings.warn( - "`feature_extractor_class` is deprecated and will be removed in v5. Use `feature_extractor_class` instead.", + "`feature_extractor_class` is deprecated and will be removed in v5. 
Use `feature_extractor_class`" + " instead.", FutureWarning, ) return self.feature_extractor_class diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index d7f8fb0858a5..53465ca46730 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -24,7 +24,7 @@ import requests import transformers -from transformers import CLAPConfig, CLAPTextConfig, CLAPVisionConfig +from transformers import CLAPAudioConfig, CLAPConfig, CLAPTextConfig from transformers.testing_utils import ( is_flax_available, is_pt_flax_cross_test, @@ -119,7 +119,7 @@ def prepare_config_and_inputs(self): return config, pixel_values def get_config(self): - return CLAPVisionConfig( + return CLAPAudioConfig( image_size=self.image_size, patch_size=self.patch_size, num_channels=self.num_channels, @@ -181,7 +181,7 @@ class CLAPVisionModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = CLAPVisionModelTester(self) - self.config_tester = ConfigTester(self, config_class=CLAPVisionConfig, has_text_modality=False, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=CLAPAudioConfig, has_text_modality=False, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() @@ -559,10 +559,10 @@ def _create_and_check_torchscript(self, config, inputs_dict): def test_load_vision_text_config(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - # Save CLAPConfig and check if we can load CLAPVisionConfig from it + # Save CLAPConfig and check if we can load CLAPAudioConfig from it with tempfile.TemporaryDirectory() as tmp_dir_name: config.save_pretrained(tmp_dir_name) - vision_config = CLAPVisionConfig.from_pretrained(tmp_dir_name) + vision_config = CLAPAudioConfig.from_pretrained(tmp_dir_name) self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) # Save CLAPConfig and check if we can load CLAPTextConfig from it From 45c36ba9513e60f34c8ceec70ff0a235adab0b7a Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 31 Jan 2023 11:13:30 +0000 Subject: [PATCH 006/197] more refactor --- .../models/clap/configuration_clap.py | 2 + src/transformers/models/clap/modeling_clap.py | 105 ++++++------------ 2 files changed, 38 insertions(+), 69 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index f63358a2a2f1..9d1164d020d5 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -251,6 +251,7 @@ def __init__( swin_use_checkpoint=False, swin_absolute_positional_embedding=False, swin_hidden_act="gelu", + aff_block_r=4, **kwargs ): super().__init__(**kwargs) @@ -290,6 +291,7 @@ def __init__( self.swin_absolute_positional_embedding = swin_absolute_positional_embedding self.patch_embed_input_channels = patch_embed_input_channels self.swin_hidden_act = swin_hidden_act + self.aff_block_r = aff_block_r @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 939723f68139..a8248ef5d9d0 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -209,8 +209,10 @@ class CLAPAudioAFFBlock(nn.Module): TODO: add docstring """ - def __init__(self, channels=64, r=4): + def __init__(self, 
config: CLAPAudioConfig): super(CLAPAudioAFFBlock, self).__init__() + channels = config.patch_embeds_hidden_size + r = config.aff_block_r inter_channels = int(channels // r) self.local_att = nn.Sequential( @@ -278,26 +280,19 @@ def __init__(self, config: CLAPAudioConfig): padding = ((patch_size[0] - patch_stride[0]) // 2, (patch_size[1] - patch_stride[1]) // 2) - if (self.enable_fusion) and (self.fusion_type == "channel_map"): - self.proj = nn.Conv2d( - config.patch_embed_input_channels * 4, - config.patch_embeds_hidden_size, - kernel_size=patch_size, - stride=patch_stride, - padding=padding, - ) - else: - self.proj = nn.Conv2d( - config.patch_embed_input_channels, - config.patch_embeds_hidden_size, - kernel_size=patch_size, - stride=patch_stride, - padding=padding, - ) + scale_factor = 4 if (self.enable_fusion) and (self.fusion_type == "channel_map") else 1 + + self.proj = nn.Conv2d( + config.patch_embed_input_channels * scale_factor, + config.patch_embeds_hidden_size, + kernel_size=patch_size, + stride=patch_stride, + padding=padding, + ) self.norm = nn.LayerNorm(config.patch_embeds_hidden_size) if config.enable_patch_layer_norm else nn.Identity() if self.enable_fusion: - self.fusion_model = CLAPAudioAFFBlock(channels=config.patch_embeds_hidden_size) + self.fusion_model = CLAPAudioAFFBlock(config) def forward(self, x, longer_idx=None): if self.enable_fusion: @@ -400,11 +395,8 @@ def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): normal distribution. The values are effectively drawn from the normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` with values outside :math:`[a, b]` redrawn until they are within the bounds. The method used for generating the random values works best when :math:`a \leq \text{mean} \leq b`. - tensor: an n-dimensional `torch.Tensor` - mean: the mean of the normal distribution - std: the standard deviation of the normal distribution - a: the minimum cutoff value - b: the maximum cutoff value + tensor: an n-dimensional `torch.Tensor` mean: the mean of the normal distribution std: the standard deviation + of the normal distribution a: the minimum cutoff value b: the maximum cutoff value Examples: >>> w = torch.empty(3, 5) >>> nn.init.trunc_normal_(w) """ @@ -469,13 +461,13 @@ def window_reverse(windows, window_size, H, W): class CLAPAudioWindowAttention(nn.Module): - def __init__(self, dim, window_size, num_heads, config=None): + def __init__(self, config, hidden_dim, window_size, num_heads): super().__init__() - self.dim = dim + self.hidden_dim = hidden_dim self.window_size = window_size # Wh, Ww self.num_heads = num_heads - head_dim = dim // num_heads + head_dim = self.hidden_dim // num_heads self.scale = head_dim**-0.5 # define a parameter table of relative position bias @@ -496,9 +488,9 @@ def __init__(self, dim, window_size, num_heads, config=None): relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww self.register_buffer("relative_position_index", relative_position_index) - self.qkv = nn.Linear(dim, dim * 3, bias=config.swin_qkv_bias) + self.qkv = nn.Linear(self.hidden_dim, self.hidden_dim * 3, bias=config.swin_qkv_bias) self.attn_drop = nn.Dropout(config.swin_attention_drop_rate) - self.proj = nn.Linear(dim, dim) + self.proj = nn.Linear(self.hidden_dim, self.hidden_dim) self.proj_drop = nn.Dropout(config.swin_drop_rate) trunc_normal_(self.relative_position_bias_table, std=0.02) @@ -545,17 +537,16 @@ def extra_repr(self): class CLAPAudioSwinTransformerBlock(nn.Module): def __init__( self, - hidden_dim, + config, 
input_resolution, - num_heads, shift_size=0, drop_path=0.0, - config=None, + idx_layer=0, ): super().__init__() - self.hidden_dim = hidden_dim + self.hidden_dim = config.hidden_size * 2**idx_layer self.input_resolution = input_resolution - self.num_heads = num_heads + self.num_heads = config.num_heads[idx_layer] self.window_size = config.window_size self.shift_size = shift_size self.mlp_ratio = config.swin_mlp_ratio @@ -567,20 +558,23 @@ def __init__( self.window_size = min(self.input_resolution) assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" - self.norm1 = nn.LayerNorm(hidden_dim) + self.norm1 = nn.LayerNorm(self.hidden_dim) self.attn = CLAPAudioWindowAttention( - hidden_dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, config=config + config=config, + hidden_dim=self.hidden_dim, + window_size=to_2tuple(self.window_size), + num_heads=self.num_heads, ) self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() if self.norm_before_mlp == "ln": - self.norm2 = nn.LayerNorm(hidden_dim) + self.norm2 = nn.LayerNorm(self.hidden_dim) elif self.norm_before_mlp == "bn": - self.norm2 = lambda x: nn.BatchNorm1d(hidden_dim)(x.transpose(1, 2)).transpose(1, 2) + self.norm2 = lambda x: nn.BatchNorm1d(self.hidden_dim)(x.transpose(1, 2)).transpose(1, 2) else: raise NotImplementedError - mlp_hidden_dim = int(hidden_dim * self.mlp_ratio) - self.mlp = CLAPAudioMLP(in_features=hidden_dim, hidden_features=mlp_hidden_dim, config=config) + mlp_hidden_dim = int(self.hidden_dim * self.mlp_ratio) + self.mlp = CLAPAudioMLP(in_features=self.hidden_dim, hidden_features=mlp_hidden_dim, config=config) if self.shift_size > 0: # calculate attention mask for SW-MSA @@ -1599,7 +1593,7 @@ class CLAPPreTrainedModel(PreTrainedModel): config_class = CLAPConfig base_model_prefix = "clap" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids", r"logit_scale_a", r"logit_scale_t", r"vision_model.*"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"logit_scale_a", r"logit_scale_t"] def _init_weights(self, module): pass @@ -1968,31 +1962,12 @@ def forward( class CLAPAudioLayer(nn.Module): - """A basic Swin Transformer layer for one stage. - Args: - dim (int): Number of input channels. - input_resolution (tuple[int]): Input resolution. - depth (int): Number of blocks. - num_heads (int): Number of attention heads. - window_size (int): Local window size. - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. - drop (float, optional): Dropout rate. Default: 0.0 - attn_drop (float, optional): Attention dropout rate. Default: 0.0 - drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 - norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm - downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None - use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 
- """ - def __init__(self, config, idx_layer=0, patches_resolution=0): super().__init__() hidden_dim = config.hidden_size * 2**idx_layer input_resolution = (patches_resolution[0] // (2**idx_layer), patches_resolution[1] // (2**idx_layer)) depth = config.depths[idx_layer] - num_heads = config.num_heads[idx_layer] window_size = config.window_size norm_layer = nn.LayerNorm if config.enable_patch_layer_norm else None @@ -2013,12 +1988,11 @@ def __init__(self, config, idx_layer=0, patches_resolution=0): self.blocks = nn.ModuleList( [ CLAPAudioSwinTransformerBlock( - hidden_dim=hidden_dim, + config, input_resolution=input_resolution, - num_heads=num_heads, shift_size=0 if (i % 2 == 0) else window_size // 2, drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, - config=config, + idx_layer=idx_layer, ) for i in range(depth) ] @@ -2048,13 +2022,6 @@ def forward(self, x): class CLAPAudioPatchMerging(nn.Module): - r"""Patch Merging Layer. - Args: - input_resolution (tuple[int]): Resolution of input feature. - dim (int): Number of input channels. - norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm - """ - def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): super().__init__() self.input_resolution = input_resolution From abee38283920673784e0827bff4b85856a393ae2 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 31 Jan 2023 11:15:31 +0000 Subject: [PATCH 007/197] more refactor --- src/transformers/models/clap/configuration_clap.py | 2 +- src/transformers/models/clap/modeling_clap.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 9d1164d020d5..9ba4c7dc589c 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -364,7 +364,7 @@ def __init__( self, text_config=None, vision_config=None, - logit_scale_init_value=2.6592, + logit_scale_init_value=(1 / 0.07), fusion_num_hidden_layers=2, projection_dim=512, projection_hidden_act="relu", diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index a8248ef5d9d0..c9aaa5572a99 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1676,8 +1676,8 @@ def __init__(self, config: CLAPConfig): text_config = config.text_config vision_config = config.vision_config - self.logit_scale_a = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) - self.logit_scale_t = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) + self.logit_scale_a = nn.Parameter(torch.ones([]) * np.log(config.logit_scale_init_value)) + self.logit_scale_t = nn.Parameter(torch.ones([]) * np.log(config.logit_scale_init_value)) self.projection_dim = config.projection_dim self.text_hidden_size = text_config.hidden_size From a7219ec2b625baf9d01c890eee5385f0966d43d5 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 31 Jan 2023 11:23:25 +0000 Subject: [PATCH 008/197] more refactor --- src/transformers/models/clap/modeling_clap.py | 53 ++++--------------- 1 file changed, 9 insertions(+), 44 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index c9aaa5572a99..b2f5f531bce9 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1743,60 +1743,24 @@ def get_text_features( return text_features 
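For orientation while reading this refactor, here is a minimal sketch of the contrastive text/audio call pattern these commits are building toward. It reuses the `laion-ai/base` placeholder checkpoint and the `CLAPModel` / `AutoProcessor` names from this patch's own docstrings; since `get_audio_features` is still being wired up in later commits of the series, this is an illustration of the intended API, not something runnable against this intermediate state.

```python
# Sketch of the intended CLAP usage at this stage of the series.
# The checkpoint id and keyword names are the placeholders used elsewhere in this patch,
# not a finalized public API.
import numpy as np
import torch
from transformers import AutoProcessor, CLAPModel

model = CLAPModel.from_pretrained("laion-ai/base")        # placeholder id from the docstrings
processor = AutoProcessor.from_pretrained("laion-ai/base")

texts = ["a dog barking", "a violin playing a melody"]
audio = np.random.randn(1, 48_000)  # (C, T) mono waveform, as documented in CLAPProcessor.__call__

inputs = processor(text=texts, audios=audio, return_tensors="pt", padding=True)

with torch.no_grad():
    text_embeds = model.get_text_features(
        input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]
    )
    # The audio tower mirrors this call once `get_audio_features` is implemented;
    # later commits feed it the mel / fusion features produced by the processor.

# The audio projection L2-normalizes its output, so once both towers are in place the
# text-audio similarity is a matrix product scaled by the learned logit_scale_a / logit_scale_t.
```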
@add_start_docstrings_to_model_forward(CLAP_VISION_INPUTS_DOCSTRING) - def get_image_features( + def get_audio_features( self, - pixel_values: Optional[torch.FloatTensor] = None, + input_values: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" - Returns: - image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by - applying the projection layer to the pooled output of [`CLAPVisionModel`]. - - Examples: - - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, CLAPModel - - >>> model = CLAPModel.from_pretrained("laion-ai/base") - >>> processor = AutoProcessor.from_pretrained("laion-ai/base") - - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> inputs = processor(images=image, return_tensors="pt") - - >>> image_features = model.get_image_features(**inputs) - ```""" - # Use CLAP model's config for some fields (if specified) instead of those of vision & text components. - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - vision_outputs = self.vision_model( - pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = vision_outputs[1] # pooled_output - image_features = self.visual_projection(pooled_output) - - return image_features + """ + pass @add_start_docstrings_to_model_forward(CLAP_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=CLAPOutput, config_class=CLAPConfig) def forward( self, input_ids: Optional[torch.LongTensor] = None, - pixel_values: Optional[torch.FloatTensor] = None, + input_values: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, return_loss: Optional[bool] = None, @@ -1835,8 +1799,9 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - vision_outputs = self.vision_model( - pixel_values=pixel_values, + vision_outputs = self.audio_model( + input_values=input_values, + attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, From c5533158c0cde4212f903a00cac2464f707880c9 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 31 Jan 2023 11:31:11 +0000 Subject: [PATCH 009/197] correct fusion --- src/transformers/models/clap/configuration_clap.py | 2 ++ src/transformers/models/clap/modeling_clap.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 9ba4c7dc589c..fa97f0035299 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -252,6 +252,7 @@ def __init__( swin_absolute_positional_embedding=False, swin_hidden_act="gelu", aff_block_r=4, + enable_patch_fusion=False, **kwargs ): super().__init__(**kwargs) @@ -292,6 +293,7 @@ def 
__init__( self.patch_embed_input_channels = patch_embed_input_channels self.swin_hidden_act = swin_hidden_act self.aff_block_r = aff_block_r + self.enable_patch_fusion = enable_patch_fusion @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index b2f5f531bce9..91cf65a015e5 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -275,7 +275,7 @@ def __init__(self, config: CLAPAudioConfig): self.grid_size = (img_size[0] // patch_stride[0], img_size[1] // patch_stride[1]) self.num_patches = self.grid_size[0] * self.grid_size[1] self.flatten = config.flatten_patch_embeds - self.enable_fusion = config.enable_patch_layer_norm + self.enable_fusion = config.enable_patch_fusion self.fusion_type = config.fusion_type padding = ((patch_size[0] - patch_stride[0]) // 2, (patch_size[1] - patch_stride[1]) // 2) From 4360623590877aabe2a903ccc5e1f39d826b1cc7 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 31 Jan 2023 11:41:14 +0000 Subject: [PATCH 010/197] more refactor --- src/transformers/models/clap/modeling_clap.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 91cf65a015e5..0fb99723eaa3 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -461,18 +461,18 @@ def window_reverse(windows, window_size, H, W): class CLAPAudioWindowAttention(nn.Module): - def __init__(self, config, hidden_dim, window_size, num_heads): + def __init__(self, config, hidden_dim, num_heads): super().__init__() self.hidden_dim = hidden_dim - self.window_size = window_size # Wh, Ww + self.window_size = to_2tuple(config.window_size) # Wh, Ww self.num_heads = num_heads head_dim = self.hidden_dim // num_heads self.scale = head_dim**-0.5 # define a parameter table of relative position bias self.relative_position_bias_table = nn.Parameter( - torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads) + torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads) ) # 2*Wh-1 * 2*Ww-1, nH # get pair-wise relative position index for each token inside the window @@ -547,8 +547,8 @@ def __init__( self.hidden_dim = config.hidden_size * 2**idx_layer self.input_resolution = input_resolution self.num_heads = config.num_heads[idx_layer] - self.window_size = config.window_size self.shift_size = shift_size + self.window_size = config.window_size self.mlp_ratio = config.swin_mlp_ratio self.norm_before_mlp = config.swin_norm_before_mlp @@ -562,7 +562,6 @@ def __init__( self.attn = CLAPAudioWindowAttention( config=config, hidden_dim=self.hidden_dim, - window_size=to_2tuple(self.window_size), num_heads=self.num_heads, ) From e3aff6f387bd13973a847aa221788eb871e8d75c Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 31 Jan 2023 18:49:03 +0000 Subject: [PATCH 011/197] new modules --- .../models/clap/configuration_clap.py | 24 ++++ src/transformers/models/clap/modeling_clap.py | 126 ++++++++++-------- 2 files changed, 92 insertions(+), 58 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index fa97f0035299..5d065d294798 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ 
b/src/transformers/models/clap/configuration_clap.py @@ -253,6 +253,18 @@ def __init__( swin_hidden_act="gelu", aff_block_r=4, enable_patch_fusion=False, + spectrogram_window_size=1024, + spectrogram_window='hann', + spectrogram_center=True, + spectrogram_pad_mode='reflect', + spectrogram_freeze_parameters=True, + spectrogram_ref=1.0, + spectrogram_amin=1e-10, + spectrogram_top_db=None, + spectrogram_time_drop_width=64, + spectrogram_time_stripes_num=2, + spectrogram_freq_drop_width=8, + spectrogram_freq_stripes_num=2, **kwargs ): super().__init__(**kwargs) @@ -294,6 +306,18 @@ def __init__( self.swin_hidden_act = swin_hidden_act self.aff_block_r = aff_block_r self.enable_patch_fusion = enable_patch_fusion + self.spectrogram_window_size = spectrogram_window_size + self.spectrogram_window = spectrogram_window + self.spectrogram_center = spectrogram_center + self.spectrogram_pad_mode = spectrogram_pad_mode + self.spectrogram_freeze_parameters = spectrogram_freeze_parameters + self.spectrogram_ref = spectrogram_ref + self.spectrogram_amin = spectrogram_amin + self.spectrogram_top_db = spectrogram_top_db + self.spectrogram_time_drop_width = spectrogram_time_drop_width + self.spectrogram_time_stripes_num = spectrogram_time_stripes_num + self.spectrogram_freq_drop_width = spectrogram_freq_drop_width + self.spectrogram_freq_stripes_num = spectrogram_freq_stripes_num @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 0fb99723eaa3..a5743c791a38 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -27,6 +27,9 @@ from torch import nn from torch.nn.init import _calculate_fan_in_and_fan_out +from torchlibrosa.stft import Spectrogram, LogmelFilterBank +from torchlibrosa.augmentation import SpecAugmentation + from ...activations import ACT2FN from ...modeling_outputs import ( BaseModelOutputWithPastAndCrossAttentions, @@ -1744,7 +1747,9 @@ def get_text_features( @add_start_docstrings_to_model_forward(CLAP_VISION_INPUTS_DOCSTRING) def get_audio_features( self, - input_values: Optional[torch.Tensor] = None, + mel_fusion: Optional[torch.Tensor] = None, + longer: Optional[torch.Tensor] = None, + waveform: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, @@ -1752,7 +1757,28 @@ def get_audio_features( ) -> torch.FloatTensor: r""" """ - pass + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + audio_outputs = self.audio_model( + mel_fusion=mel_fusion, + longer=longer, + waveform=waveform, + # attention_mask=attention_mask, + # output_attentions=output_attentions, + # output_hidden_states=output_hidden_states, + # return_dict=return_dict, + ) + + pooled_output = audio_outputs[1] + + audio_features = self.audio_projection(pooled_output) + audio_features = F.normalize(audio_features, dim=-1) + + return audio_features @add_start_docstrings_to_model_forward(CLAP_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=CLAPOutput, config_class=CLAPConfig) @@ -2091,6 +2117,35 @@ def 
__init__(self, config: CLAPAudioConfig): ) self.head = nn.Linear(self.num_classes, self.num_classes) + self.spectrogram_extractor = Spectrogram( + n_fft=config.spectrogram_window_size, + hop_length=config.hop_size, + win_length=config.spectrogram_window_size, + window=config.spectrogram_window, + center=config.spectrogram_center, + pad_mode=config.spectrogram_pad_mode, + freeze_parameters=config.spectrogram_freeze_parameters + ) + # Logmel feature extractor + self.logmel_extractor = LogmelFilterBank( + sr=config.sample_rate, + n_fft=config.spectrogram_window_size, + n_mels=config.mel_bins, + fmin=config.fmin, + fmax=config.fmax, + ref=config.spectrogram_ref, + amin=config.spectrogram_amin, + top_db=config.spectrogram_top_db, + freeze_parameters=config.spectrogram_freeze_parameters, + ) + # Spec augmenter + self.spec_augmenter = SpecAugmentation( + time_drop_width=config.spectrogram_time_drop_width, + time_stripes_num=config.spectrogram_time_stripes_num, + freq_drop_width=config.spectrogram_freq_drop_width, + freq_stripes_num=config.spectrogram_freq_stripes_num, + ) + def _init_weights(self, m): pass # if isinstance(m, nn.Linear): @@ -2101,14 +2156,6 @@ def _init_weights(self, m): # nn.init.constant_(m.bias, 0) # nn.init.constant_(m.weight, 1.0) - @torch.jit.ignore - def no_weight_decay(self): - return {"absolute_pos_embed"} - - @torch.jit.ignore - def no_weight_decay_keywords(self): - return {"relative_position_bias_table"} - def forward_features(self, x, longer_idx=None): # A deprecated optimization for using a hierarchical output from different blocks @@ -2187,32 +2234,21 @@ def reshape_wav2img(self, x): x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3], x.shape[4]) return x - # Repeat the wavform to a img size, if you want to use the pretrained swin transformer model - def repeat_wat2img(self, x, cur_pos): - B, C, T, F = x.shape - target_T = int(self.spec_size * self.freq_ratio) - target_F = self.spec_size // self.freq_ratio - assert T <= target_T and F <= target_F, "the wav size should less than or equal to the swin input size" - # to avoid bicubic zero error - if T < target_T: - x = nn.functional.interpolate(x, (target_T, x.shape[3]), mode="bicubic", align_corners=True) - if F < target_F: - x = nn.functional.interpolate(x, (x.shape[2], target_F), mode="bicubic", align_corners=True) - x = x.permute(0, 1, 3, 2).contiguous() # B C F T - x = x[:, :, :, cur_pos : cur_pos + self.spec_size] - x = x.repeat(repeats=(1, 1, 4, 1)) - return x - def forward( - self, x: torch.Tensor, mixup_lambda=None, infer_mode=False, device=None + self, + mel_fusion=None, + longer=None, + waveform=None, + mixup_lambda=None, + device=None ): # out_feat_keys: List[str] = None): - if self.enable_fusion and x["longer"].sum() == 0: + if self.enable_fusion and longer.sum() == 0: # if no audio is longer than 10s, then randomly select one audio to be longer - x["longer"][torch.randint(0, x["longer"].shape[0], (1,))] = True + longer[torch.randint(0, longer.shape[0], (1,))] = True if not self.enable_fusion: - x = x["waveform"].to(device=device, non_blocking=True) + x = waveform.to(device=device, non_blocking=True) x = self.spectrogram_extractor(x) # (batch_size, 1, time_steps, freq_bins) x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins) x = x.transpose(1, 3) @@ -2227,39 +2263,13 @@ def forward( x = self.reshape_wav2img(x) output_dict = self.forward_features(x) else: - longer_list = x["longer"].to(device=device, non_blocking=True) - x = x["mel_fusion"].to(device=device, non_blocking=True) + 
longer_list = longer.to(device=device, non_blocking=True) + x = mel_fusion.to(device=device, non_blocking=True) x = x.transpose(1, 3) x = self.bn0(x) x = x.transpose(1, 3) longer_list_idx = torch.where(longer_list)[0] - if self.fusion_type in ["daf_1d", "aff_1d", "iaff_1d"]: - new_x = x[:, 0:1, :, :].clone().contiguous() - if len(longer_list_idx) > 0: - # local processing - fusion_x_local = x[longer_list_idx, 1:, :, :].clone().contiguous() - FB, FC, FT, FF = fusion_x_local.size() - fusion_x_local = fusion_x_local.view(FB * FC, FT, FF) - fusion_x_local = torch.permute(fusion_x_local, (0, 2, 1)).contiguous() - fusion_x_local = self.mel_conv1d(fusion_x_local) - fusion_x_local = fusion_x_local.view(FB, FC, FF, fusion_x_local.size(-1)) - fusion_x_local = torch.permute(fusion_x_local, (0, 2, 1, 3)).contiguous().flatten(2) - if fusion_x_local.size(-1) < FT: - fusion_x_local = torch.cat( - [fusion_x_local, torch.zeros((FB, FF, FT - fusion_x_local.size(-1)), device=device)], - dim=-1, - ) - else: - fusion_x_local = fusion_x_local[:, :, :FT] - # 1D fusion - new_x = new_x.squeeze(1).permute((0, 2, 1)).contiguous() - new_x[longer_list_idx] = self.fusion_model(new_x[longer_list_idx], fusion_x_local) - x = new_x.permute((0, 2, 1)).contiguous()[:, None, :, :] - else: - x = new_x - elif self.enable_fusion: - x = x # no change if self.training: x = self.spec_augmenter(x) From 00eb73ba518eab1ed08da19ec370e452cd7c016f Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 2 Feb 2023 09:38:27 +0000 Subject: [PATCH 012/197] add basic processor --- .../feature_extraction_sequence_utils.py | 242 ++++++++++++++ .../models/clap/feature_extraction_clap.py | 305 ++++++++++-------- .../models/clap/processing_clap.py | 13 +- 3 files changed, 414 insertions(+), 146 deletions(-) diff --git a/src/transformers/feature_extraction_sequence_utils.py b/src/transformers/feature_extraction_sequence_utils.py index c221cec6d832..36fa9747561c 100644 --- a/src/transformers/feature_extraction_sequence_utils.py +++ b/src/transformers/feature_extraction_sequence_utils.py @@ -18,6 +18,10 @@ from typing import Dict, List, Optional, Union import numpy as np +from numpy.fft import fft +import math + +import warnings from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin from .utils import PaddingStrategy, TensorType, is_tf_tensor, is_torch_tensor, logging, to_numpy @@ -364,3 +368,241 @@ def _get_padding_strategies(self, padding=False, max_length=None): ) return padding_strategy + + @staticmethod + def hz_to_mel(freq: float, mel_scale: str = "htk") -> float: + r"""Convert Hz to Mels. + + Args: + freqs (float): + Frequencies in Hz + mel_scale (str, *optional*): + Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + + Returns: + mels (float): Frequency in Mels + """ + + if mel_scale not in ["slaney", "htk"]: + raise ValueError('mel_scale should be one of "htk" or "slaney".') + + if mel_scale == "htk": + return 2595.0 * math.log10(1.0 + (freq / 700.0)) + + # Fill in the linear part + f_min = 0.0 + f_sp = 200.0 / 3 + + mels = (freq - f_min) / f_sp + + # Fill in the log-scale part + min_log_hz = 1000.0 + min_log_mel = (min_log_hz - f_min) / f_sp + logstep = math.log(6.4) / 27.0 + + if freq >= min_log_hz: + mels = min_log_mel + math.log(freq / min_log_hz) / logstep + + return mels + + @staticmethod + def mel_to_hz(mels: np.array, mel_scale: str = "htk") -> np.array: + """Convert mel bin numbers to frequencies. + + Args: + mels (np.array): Mel frequencies + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. 
(Default: ``htk``) + + Returns: + freqs (np.array): Mels converted in Hz + """ + + if mel_scale not in ["slaney", "htk"]: + raise ValueError('mel_scale should be one of "htk" or "slaney".') + + if mel_scale == "htk": + return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) + + # Fill in the linear scale + f_min = 0.0 + f_sp = 200.0 / 3 + freqs = f_min + f_sp * mels + + # And now the nonlinear scale + min_log_hz = 1000.0 + min_log_mel = (min_log_hz - f_min) / f_sp + logstep = math.log(6.4) / 27.0 + + log_t = mels >= min_log_mel + freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel)) + + return freqs + + @staticmethod + def create_triangular_filterbank( + all_freqs: np.array, + f_pts: np.array, + ) -> np.array: + """Create a triangular filter bank. + + Args: + all_freqs (np.array): STFT freq points of size (`n_freqs`). + f_pts (np.array): Filter mid points of size (`n_filter`). + + Returns: + fb (np.array): The filter bank of size (`n_freqs`, `n_filter`). + """ + # Adopted from Librosa + # calculate the difference between each filter mid point and each stft freq point in hertz + f_diff = f_pts[1:] - f_pts[:-1] # (n_filter + 1) + slopes = np.expand_dims(f_pts, 0) - np.expand_dims(all_freqs, 1) # (n_freqs, n_filter + 2) + # create overlapping triangles + zero = np.zeros(1) + down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1] # (n_freqs, n_filter) + up_slopes = slopes[:, 2:] / f_diff[1:] # (n_freqs, n_filter) + fb = np.maximum(zero, np.minimum(down_slopes, up_slopes)) + + return fb + + def get_mel_filter_banks( + self, + n_freqs: int, + f_min: float, + f_max: float, + n_mels: int, + sample_rate: int, + norm: Optional[str] = None, + mel_scale: str = "htk", + ) -> np.array: + r"""Create a frequency bin conversion matrix used to obtain the Mel Frequency Cepstral Coefficient. + This is called a `mel filter bank`, and various implementation exist, which differ in the number of filters, + the shape of the filters, the way the filters are spaced, the bandwidth of + the filters, and the manner in which the spectrum is warped. The goal of these features is to approximate the non-linear human perception + of the variation in pitch with respect to the frequency. + This code is heavily inspired from the `torchaudio` implementation, refer to XXX for more details. + + + Note: + We will try to specify which variation correspond to which MFCCs from the litterature. The main features are: + - MFCC FB-20: introduced in 1980 by Davis and Mermelstein [4]; Davis and Mermelstein assume sampling frequency of 10 kHz; speech bandwidth [0, 4600] Hz + - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) described in Young, 1995 [5]; Young uses a filter bank of 24 filters for speech bandwidth [0, 8000] Hz (sampling rate ≥ 16 kHz) + - MFCC FB-40: from the Auditory Toolbox for MATLAB [6] written by Slaney in 1998; Slaney assumes sampling rate of 16 kHz, and speech bandwidth [133, 6854] Hz + - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris, 2004 [3]; Skowronski and Harris assume sampling rate of 12.5 kHz and speech bandwidth [0, 6250] Hz + + + Args: + n_freqs (int): Number of frequencies to highlight/apply + f_min (float): Minimum frequency (Hz) + f_max (float): Maximum frequency (Hz) + n_mels (int): Number of mel filterbanks + sample_rate (int): Sample rate of the audio waveform + norm (str or None, optional): If "slaney", divide the triangular mel weights by the width of the mel band + (area normalization). 
(Default: ``None``) + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + + Returns: + Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_mels``) + meaning number of frequencies to highlight/apply to x the number of filterbanks. + Each column is a filterbank so that assuming there is a matrix A of + size (..., ``n_freqs``), the applied result would be + ``A * melscale_fbanks(A.size(-1), ...)``. + + """ + + if norm is not None and norm != "slaney": + raise ValueError('norm must be one of None or "slaney"') + + # freq bins + all_freqs = np.linspace(0, sample_rate // 2, n_freqs) + + # calculate mel freq bins + m_min = self.hz_to_mel(f_min, mel_scale=mel_scale) + m_max = self.hz_to_mel(f_max, mel_scale=mel_scale) + + m_pts = np.linspace(m_min, m_max, n_mels + 2) + f_pts = self.mel_to_hz(m_pts, mel_scale=mel_scale) + + # create filterbank + filterbank = self.create_triangular_filterbank(all_freqs, f_pts) + + if norm is not None and norm == "slaney": + # Slaney-style mel is scaled to be approx constant energy per channel + enorm = 2.0 / (f_pts[2 : n_mels + 2] - f_pts[:n_mels]) + filterbank *= np.expand_dims(enorm,0) + + if (filterbank.max(axis=0) == 0.0).any(): + warnings.warn( + "At least one mel filterbank has all zero values. " + f"The value for `n_mels` ({n_mels}) may be set too high. " + f"Or, the value for `n_freqs` ({n_freqs}) may be set too low." + ) + + return filterbank + + def _stft(self, frames, window): + """ + Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same + results as `torch.stft`. + """ + frame_size = frames.shape[1] + fft_size = self.n_fft + + if fft_size is None: + fft_size = frame_size + + if fft_size < frame_size: + raise ValueError("FFT size must greater or equal the frame size") + # number of FFT bins to store + num_fft_bins = (fft_size >> 1) + 1 + + data = np.empty((len(frames), num_fft_bins), dtype=np.complex64) + fft_signal = np.zeros(fft_size) + + for f, frame in enumerate(frames): + if window is not None: + np.multiply(frame, window, out=fft_signal[:frame_size]) + else: + fft_signal[:frame_size] = frame + # TODO can we use fftn on the other dimensions? + data[f] = fft(fft_signal, axis=0)[:num_fft_bins] + return data.T + + + def _fram_wave(self, waveform, center=True): + """ + Transform a raw waveform into a list of smaller waveforms. The window length defines how much of the signal is + contain in each frame (smalle waveform), while the hope length defines the step between the beginning of each + new frame. + + Centering is done by reflecting the waveform which is first centered around `frame_idx * hop_length`. 
+ """ + frames = [] + for i in range(0, waveform.shape[0] + 1, self.hop_length): + half_window = (self.n_fft - 1) // 2 + 1 + if center: + start = i - half_window if i > half_window else 0 + end = i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0] + + frame = waveform[start:end] + + # TODO can all of this be automatically replaced with np.pad(audio,self.n_fft // 2, self.n_fft // 2), mode=self.pad_mode) + # as we have an array of frames + + if start == 0: + padd_width = (-i + half_window, 0) + frame = np.pad(frame, pad_width=padd_width, mode="reflect") + + elif end == waveform.shape[0]: + padd_width = (0, (i - waveform.shape[0] + half_window)) + frame = np.pad(frame, pad_width=padd_width, mode="reflect") + + else: + frame = waveform[i : i + self.n_fft] + frame_width = frame.shape[0] + if frame_width < waveform.shape[0]: + frame = np.lib.pad( + frame, pad_width=(0, self.n_fft - frame_width), mode="constant", constant_values=0 + ) + + frames.append(frame) + return np.stack(frames, 0) \ No newline at end of file diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index cf31ef053127..6605439893ec 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -17,6 +17,7 @@ from typing import List, Optional, Union import numpy as np +import torchvision from numpy.fft import fft from ...feature_extraction_sequence_utils import SequenceFeatureExtractor @@ -64,6 +65,11 @@ def __init__( n_fft=400, padding_value=0.0, return_attention_mask=False, # pad inputs to max length with silence token (zero) and no attention mask + norm = None, + f_min:float =0, + f_max:float =14000, + top_db:int = None, + mel_scale: str = "htk", **kwargs ): super().__init__( @@ -79,139 +85,147 @@ def __init__( self.n_samples = chunk_length * sampling_rate self.nb_max_frames = self.n_samples // hop_length self.sampling_rate = sampling_rate - self.mel_filters = self.get_mel_filters(sampling_rate, n_fft, n_mels=feature_size) - - def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32): - # Initialize the weights - n_mels = int(n_mels) - weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) - - # Center freqs of each FFT bin - fftfreqs = np.fft.rfftfreq(n=n_fft, d=1.0 / sr) - - # 'Center freqs' of mel bands - uniformly spaced between limits - min_mel = 0.0 - max_mel = 45.245640471924965 - - mels = np.linspace(min_mel, max_mel, n_mels + 2) - - mels = np.asanyarray(mels) - - # Fill in the linear scale - f_min = 0.0 - f_sp = 200.0 / 3 - freqs = f_min + f_sp * mels - - # And now the nonlinear scale - min_log_hz = 1000.0 # beginning of log region (Hz) - min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) - logstep = np.log(6.4) / 27.0 # step size for log region - - # If we have vector data, vectorize - log_t = mels >= min_log_mel - freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel)) - - mel_f = freqs - - fdiff = np.diff(mel_f) - ramps = np.subtract.outer(mel_f, fftfreqs) - - for i in range(n_mels): - # lower and upper slopes for all bins - lower = -ramps[i] / fdiff[i] - upper = ramps[i + 2] / fdiff[i + 1] - - # .. 
then intersect them with each other and zero - weights[i] = np.maximum(0, np.minimum(lower, upper)) - - # Slaney-style mel is scaled to be approx constant energy per channel - enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels]) - weights *= enorm[:, np.newaxis] - - return weights - - def fram_wave(self, waveform, center=True): - """ - Transform a raw waveform into a list of smaller waveforms. The window length defines how much of the signal is - contain in each frame (smalle waveform), while the hope length defines the step between the beginning of each - new frame. - - Centering is done by reflecting the waveform which is first centered around `frame_idx * hop_length`. + self.f_min = f_min # should be in super and would initialized them + self.f_max = f_max # should be in super and would initialized them + self.norm = norm # should be in super and would initialized them + self.mel_filters = self.get_mel_filter_banks(n_freqs = int(1+ n_fft//2), n_mels = feature_size, f_min = f_min, f_max = f_max, sample_rate = sampling_rate, norm = "htk", mel_scale = "htk") + self.mel_filters_slaney = self.get_mel_filter_banks(n_freqs = int(1+ n_fft//2), n_mels = feature_size, f_min = f_min, f_max = f_max, sample_rate = sampling_rate, norm = "slaney", mel_scale = "slaney") + self.top_db = top_db + + def _power_to_db(self, mel_spectrogram, a_min=1e-10, ref=1.0): + """ + Power to db, this function is the numpy implementation of + librosa.power_to_lb """ - frames = [] - for i in range(0, waveform.shape[0] + 1, self.hop_length): - half_window = (self.n_fft - 1) // 2 + 1 - if center: - start = i - half_window if i > half_window else 0 - end = i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0] - - frame = waveform[start:end] - - if start == 0: - padd_width = (-i + half_window, 0) - frame = np.pad(frame, pad_width=padd_width, mode="reflect") - - elif end == waveform.shape[0]: - padd_width = (0, (i - waveform.shape[0] + half_window)) - frame = np.pad(frame, pad_width=padd_width, mode="reflect") - - else: - frame = waveform[i : i + self.n_fft] - frame_width = frame.shape[0] - if frame_width < waveform.shape[0]: - frame = np.lib.pad( - frame, pad_width=(0, self.n_fft - frame_width), mode="constant", constant_values=0 - ) - - frames.append(frame) - return np.stack(frames, 0) - - def stft(self, frames, window): + log_spec = 10 * np.log10(np.clip(mel_spectrogram, a_min=a_min, a_max=None)) + log_spec -= 10.0 * np.log10(np.maximum(a_min, ref)) + if self.top_db is not None: + if self.top_db < 0: + raise ValueError("top_db must be non-negative") + log_spec = np.clip(log_spec, min=np.maximum(log_spec) - self.top_db, max=np.inf) + return log_spec + + def _np_extract_fbank_features(self, waveform: np.array, mel_filters:Optional[np.array]) -> np.ndarray: """ - Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same - results as `torch.stft`. + Compute the log-Mel spectrogram of the provided audio, gives similar results whisper's original torch + implementation with 1e-5 tolerance. 
""" - frame_size = frames.shape[1] - fft_size = self.n_fft - - if fft_size is None: - fft_size = frame_size + window = np.hanning(self.n_fft + 1)[:-1] - if fft_size < frame_size: - raise ValueError("FFT size must greater or equal the frame size") - # number of FFT bins to store - num_fft_bins = (fft_size >> 1) + 1 + frames = self._fram_wave(waveform) + stft = self._stft(frames, window=window) + + # if the imaginary parts are taken : (real, imag) = stftl; real ** 2 + imag ** 2 + magnitudes = np.abs(stft) ** 2 + mel_spec = np.matmul(magnitudes, self.mel_filters) - data = np.empty((len(frames), num_fft_bins), dtype=np.complex64) - fft_signal = np.zeros(fft_size) + return self._power_to_db(mel_spec) - for f, frame in enumerate(frames): - if window is not None: - np.multiply(frame, window, out=fft_signal[:frame_size]) - else: - fft_signal[:frame_size] = frame - data[f] = fft(fft_signal, axis=0)[:num_fft_bins] - return data.T - - def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray: + @staticmethod + # Copied from transformers.models.wav2vec2.feature_extraction_wav2vec2.Wav2Vec2FeatureExtractor.zero_mean_unit_var_norm + def zero_mean_unit_var_norm( + input_values: List[np.ndarray], attention_mask: List[np.ndarray], padding_value: float = 0.0 + ) -> List[np.ndarray]: """ - Compute the log-Mel spectrogram of the provided audio, gives similar results whisper's original torch - implementation with 1e-5 tolerance. + Every array in the list is normalized to have zero mean and unit variance """ - window = np.hanning(self.n_fft + 1)[:-1] + if attention_mask is not None: + attention_mask = np.array(attention_mask, np.int32) + normed_input_values = [] - frames = self.fram_wave(waveform) - stft = self.stft(frames, window=window) - magnitudes = np.abs(stft[:, :-1]) ** 2 + for vector, length in zip(input_values, attention_mask.sum(-1)): + normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7) + if length < normed_slice.shape[0]: + normed_slice[length:] = padding_value - filters = self.mel_filters - mel_spec = filters @ magnitudes + normed_input_values.append(normed_slice) + else: + normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values] + + return normed_input_values + + def _random_mel_fusion(self, mel, total_frames, chunk_frames): + ranges = np.array_split(list(range(0, total_frames - chunk_frames + 1)), 3) + if len(ranges[1]) == 0: + # if the audio is too short, we just use the first chunk + ranges[1] = [0] + if len(ranges[2]) == 0: + # if the audio is too short, we just use the first chunk + ranges[2] = [0] + # randomly choose index for each part + idx_front = np.random.choice(ranges[0]) + idx_middle = np.random.choice(ranges[1]) + idx_back = np.random.choice(ranges[2]) + # select mel + mel_chunk_front = mel[idx_front : idx_front + chunk_frames, :] + mel_chunk_middle = mel[idx_middle : idx_middle + chunk_frames, :] + mel_chunk_back = mel[idx_back : idx_back + chunk_frames, :] + + # shrink the mel TODO add this as a numpy function + mel_shrink = torchvision.transforms.Resize(size=[chunk_frames, 64])(mel[None])[0] + # logging.info(f"mel_shrink.shape: {mel_shrink.shape}") + + # stack + mel_fusion = np.stack([mel_chunk_front, mel_chunk_middle, mel_chunk_back, mel_shrink], dim=0) + return mel_fusion + + def _get_audio_features( + self, waveform: np.array, max_length, padding, pad_to_multiple_of, truncation, filling + ) -> np.array: + """ + Possible cases : + - wave > max_length + - rand_trun + - fusion + - wave < max_length + - 
repeat + - fusion - log_spec = np.log10(np.clip(mel_spec, a_min=1e-10, a_max=None)) - log_spec = np.maximum(log_spec, log_spec.max() - 8.0) - log_spec = (log_spec + 4.0) / 4.0 + """ + if len(waveform) > max_length: + if truncation == "rand_trunc": + longer = True + elif truncation == "fusion": + mel = self._np_extract_fbank_features(audio_data) + chunk_frames = max_length // self.hop_size + 1 # the +1 related to how the spectrogram is computed + total_frames = mel.shape[0] + if chunk_frames == total_frames: + # there is a corner case where the audio length is larger than max_length but smaller than max_length+hop_size. + # In this case, we just use the whole audio. + input_mel = np.stack([mel, mel, mel, mel], dim=0) + longer = False + else: + input_mel = self._random_mel_fusion(mel, total_frames, chunk_frames) + longer = True - return log_spec + else: + raise NotImplementedError(f"data_truncating {truncation} not implemented") + # random crop to max_length (for compatibility) -> this should be handled by self.pad + overflow = len(audio_data) - max_length + idx = np.random.randint(0, overflow + 1) + audio_data = audio_data[idx : idx + max_length] + else: + longer = False + # only use repeat as a new possible value for padding. you repeat the audio before applying the usual max_length padding + if len(audio_data) < max_length and padding == "repeatpad": # do nothing if equal + n_repeat = int(max_length / len(audio_data)) + audio_data = audio_data.repeat(n_repeat) + else: + audio_data = self.pad( + audio_data, + padding=padding, + max_length=max_length if max_length else self.n_samples, + truncation=truncation, + pad_to_multiple_of=pad_to_multiple_of, + ) + if truncation == "fusion": + mel = self._np_extract_fbank_features(audio_data, self.mel_filters_slaney) + input_mel = np.stack([mel, mel, mel, mel], dim=0) + else: + input_mel = self._np_extract_fbank_features( + audio_data, self.mel_filters_slaney + ) + return input_mel, longer def __call__( self, @@ -237,7 +251,7 @@ def __call__( pad_to_multiple_of (`int`, *optional*, defaults to None): If set will pad the sequence to a multiple of the provided value. - This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + This is especially useful to enable the use of np.array Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. return_attention_mask (`bool`, *optional*): Whether to return the attention mask. If left to the default, will return the attention mask according @@ -255,7 +269,7 @@ def __call__( If set, will return tensors instead of list of python integers. Acceptable values are: - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'pt'`: Return PyTorch `torch.np.array` objects. - `'np'`: Return Numpy `np.ndarray` objects. sampling_rate (`int`, *optional*): The sampling rate at which the `raw_speech` input was sampled. 
It is strongly recommended to pass @@ -294,28 +308,39 @@ def __call__( if not is_batched: raw_speech = [np.asarray([raw_speech]).T] - batched_speech = BatchFeature({"input_features": raw_speech}) - # convert into correct format for padding + padded_inputs = [ + self._get_audio_features( + waveform, + truncation, + pad_to_multiple_of, + padding, + max_length if max_length else self.max_length, + ) + for waveform in input_features[0] + ] + + input_mel = [] + is_longer = [] + for mel, longer in input_features: + input_mel.append(mel) + is_longer.append(longer) + + if self.enable_fusion and is_longer.sum() == 0: + # if no audio is longer than 10s, then randomly select one audio to be longer + rand_idx = np.random.randint(0, len(input_features)) + input_mel[rand_idx] = True + + if isinstance(input_features[0]["mel"], List): + padded_inputs["input_features"] = [np.asarray(mel, dtype=np.float32) for feature in input_mel] + else: + padded_inputs["input_features"] = input_features - padded_inputs = self.pad( - batched_speech, - padding=padding, - max_length=max_length if max_length else self.n_samples, - truncation=truncation, - pad_to_multiple_of=pad_to_multiple_of, - ) # make sure list is in array format input_features = padded_inputs.get("input_features").transpose(2, 0, 1) - input_features = [self._np_extract_fbank_features(waveform) for waveform in input_features[0]] - - if isinstance(input_features[0], List): - padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features] - else: - padded_inputs["input_features"] = input_features - if return_tensors is not None: padded_inputs = padded_inputs.convert_to_tensors(return_tensors) + is_longer = is_longer.convert_to_tensors(return_tensors) - return padded_inputs + return padded_inputs, is_longer diff --git a/src/transformers/models/clap/processing_clap.py b/src/transformers/models/clap/processing_clap.py index 79b1d95063d1..12a0d72dd279 100644 --- a/src/transformers/models/clap/processing_clap.py +++ b/src/transformers/models/clap/processing_clap.py @@ -61,8 +61,8 @@ def __call__(self, text=None, audios=None, return_tensors=None, **kwargs): Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `text` and `kwargs` arguments to CLAPTokenizerFast's [`~CLAPTokenizerFast.__call__`] if `text` is not `None` to encode the text. To prepare the audio(s), this method forwards the `audios` and `kwrags` arguments to - CLAPFeatureExtractor's [`~CLAPFeatureExtractor.__call__`] if `audios` is not `None`. Please refer to the doctsring - of the above two methods for more information. + CLAPFeatureExtractor's [`~CLAPFeatureExtractor.__call__`] if `audios` is not `None`. Please refer to the + doctsring of the above two methods for more information. Args: text (`str`, `List[str]`, `List[List[str]]`): @@ -70,9 +70,9 @@ def __call__(self, text=None, audios=None, return_tensors=None, **kwargs): (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). audios (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`): - The audio or batch of audios to be prepared. Each audio can be NumPy array or PyTorch - tensor. In case of a NumPy array/PyTorch tensor, each audio should be of shape (C, T), where C is a - number of channels, and T the sample length of the audio. + The audio or batch of audios to be prepared. 
Each audio can be NumPy array or PyTorch tensor. In case + of a NumPy array/PyTorch tensor, each audio should be of shape (C, T), where C is a number of channels, + and T the sample length of the audio. return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors of a particular framework. Acceptable values are: @@ -132,7 +132,8 @@ def model_input_names(self): @property def feature_extractor_class(self): warnings.warn( - "`feature_extractor_class` is deprecated and will be removed in v5. Use `feature_extractor_class` instead.", + "`feature_extractor_class` is deprecated and will be removed in v5. Use `feature_extractor_class`" + " instead.", FutureWarning, ) return self.feature_extractor_class From 45e7ce9c22d8c87c5c5bd35bc50afd3c258bfcd7 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 2 Feb 2023 09:47:17 +0000 Subject: [PATCH 013/197] fixup --- .../feature_extraction_sequence_utils.py | 42 ++++----- .../models/clap/feature_extraction_clap.py | 94 ++++++++++--------- 2 files changed, 72 insertions(+), 64 deletions(-) diff --git a/src/transformers/feature_extraction_sequence_utils.py b/src/transformers/feature_extraction_sequence_utils.py index 36fa9747561c..818fc8ee41f6 100644 --- a/src/transformers/feature_extraction_sequence_utils.py +++ b/src/transformers/feature_extraction_sequence_utils.py @@ -15,13 +15,12 @@ """ Sequence feature extraction class for common feature extractors to preprocess sequences. """ +import math +import warnings from typing import Dict, List, Optional, Union import numpy as np from numpy.fft import fft -import math - -import warnings from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin from .utils import PaddingStrategy, TensorType, is_tf_tensor, is_torch_tensor, logging, to_numpy @@ -368,15 +367,15 @@ def _get_padding_strategies(self, padding=False, max_length=None): ) return padding_strategy - + @staticmethod def hz_to_mel(freq: float, mel_scale: str = "htk") -> float: r"""Convert Hz to Mels. Args: - freqs (float): + freqs (float): Frequencies in Hz - mel_scale (str, *optional*): + mel_scale (str, *optional*): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) Returns: @@ -455,7 +454,7 @@ def create_triangular_filterbank( # Adopted from Librosa # calculate the difference between each filter mid point and each stft freq point in hertz f_diff = f_pts[1:] - f_pts[:-1] # (n_filter + 1) - slopes = np.expand_dims(f_pts, 0) - np.expand_dims(all_freqs, 1) # (n_freqs, n_filter + 2) + slopes = np.expand_dims(f_pts, 0) - np.expand_dims(all_freqs, 1) # (n_freqs, n_filter + 2) # create overlapping triangles zero = np.zeros(1) down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1] # (n_freqs, n_filter) @@ -474,22 +473,22 @@ def get_mel_filter_banks( norm: Optional[str] = None, mel_scale: str = "htk", ) -> np.array: - r"""Create a frequency bin conversion matrix used to obtain the Mel Frequency Cepstral Coefficient. - This is called a `mel filter bank`, and various implementation exist, which differ in the number of filters, + r"""Create a frequency bin conversion matrix used to obtain the Mel Frequency Cepstral Coefficient. + This is called a `mel filter bank`, and various implementation exist, which differ in the number of filters, the shape of the filters, the way the filters are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these features is to approximate the non-linear human perception - of the variation in pitch with respect to the frequency. 
+ of the variation in pitch with respect to the frequency. This code is heavily inspired from the `torchaudio` implementation, refer to XXX for more details. - + Note: - We will try to specify which variation correspond to which MFCCs from the litterature. The main features are: + We will try to specify which variation correspond to which MFCCs from the litterature. The main features are: - MFCC FB-20: introduced in 1980 by Davis and Mermelstein [4]; Davis and Mermelstein assume sampling frequency of 10 kHz; speech bandwidth [0, 4600] Hz - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) described in Young, 1995 [5]; Young uses a filter bank of 24 filters for speech bandwidth [0, 8000] Hz (sampling rate ≥ 16 kHz) - MFCC FB-40: from the Auditory Toolbox for MATLAB [6] written by Slaney in 1998; Slaney assumes sampling rate of 16 kHz, and speech bandwidth [133, 6854] Hz - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris, 2004 [3]; Skowronski and Harris assume sampling rate of 12.5 kHz and speech bandwidth [0, 6250] Hz - - + + Args: n_freqs (int): Number of frequencies to highlight/apply f_min (float): Minimum frequency (Hz) @@ -528,7 +527,7 @@ def get_mel_filter_banks( if norm is not None and norm == "slaney": # Slaney-style mel is scaled to be approx constant energy per channel enorm = 2.0 / (f_pts[2 : n_mels + 2] - f_pts[:n_mels]) - filterbank *= np.expand_dims(enorm,0) + filterbank *= np.expand_dims(enorm, 0) if (filterbank.max(axis=0) == 0.0).any(): warnings.warn( @@ -538,7 +537,7 @@ def get_mel_filter_banks( ) return filterbank - + def _stft(self, frames, window): """ Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same @@ -563,11 +562,10 @@ def _stft(self, frames, window): np.multiply(frame, window, out=fft_signal[:frame_size]) else: fft_signal[:frame_size] = frame - # TODO can we use fftn on the other dimensions? + # TODO can we use fftn on the other dimensions? data[f] = fft(fft_signal, axis=0)[:num_fft_bins] return data.T - - + def _fram_wave(self, waveform, center=True): """ Transform a raw waveform into a list of smaller waveforms. 
The window length defines how much of the signal is @@ -584,10 +582,10 @@ def _fram_wave(self, waveform, center=True): end = i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0] frame = waveform[start:end] - + # TODO can all of this be automatically replaced with np.pad(audio,self.n_fft // 2, self.n_fft // 2), mode=self.pad_mode) # as we have an array of frames - + if start == 0: padd_width = (-i + half_window, 0) frame = np.pad(frame, pad_width=padd_width, mode="reflect") @@ -605,4 +603,4 @@ def _fram_wave(self, waveform, center=True): ) frames.append(frame) - return np.stack(frames, 0) \ No newline at end of file + return np.stack(frames, 0) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 6605439893ec..d2e2624e3906 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -18,7 +18,6 @@ import numpy as np import torchvision -from numpy.fft import fft from ...feature_extraction_sequence_utils import SequenceFeatureExtractor from ...feature_extraction_utils import BatchFeature @@ -27,6 +26,7 @@ logger = logging.get_logger(__name__) + # Copied from transformers.models.whisper.feature_extraction_whisper.WhisperFeatureExtractor with Whisper->CLAP class CLAPFeatureExtractor(SequenceFeatureExtractor): r""" @@ -65,10 +65,10 @@ def __init__( n_fft=400, padding_value=0.0, return_attention_mask=False, # pad inputs to max length with silence token (zero) and no attention mask - norm = None, - f_min:float =0, - f_max:float =14000, - top_db:int = None, + norm=None, + f_min: float = 0, + f_max: float = 14000, + top_db: int = None, mel_scale: str = "htk", **kwargs ): @@ -85,16 +85,32 @@ def __init__( self.n_samples = chunk_length * sampling_rate self.nb_max_frames = self.n_samples // hop_length self.sampling_rate = sampling_rate - self.f_min = f_min # should be in super and would initialized them - self.f_max = f_max # should be in super and would initialized them - self.norm = norm # should be in super and would initialized them - self.mel_filters = self.get_mel_filter_banks(n_freqs = int(1+ n_fft//2), n_mels = feature_size, f_min = f_min, f_max = f_max, sample_rate = sampling_rate, norm = "htk", mel_scale = "htk") - self.mel_filters_slaney = self.get_mel_filter_banks(n_freqs = int(1+ n_fft//2), n_mels = feature_size, f_min = f_min, f_max = f_max, sample_rate = sampling_rate, norm = "slaney", mel_scale = "slaney") + self.f_min = f_min # should be in super and would initialized them + self.f_max = f_max # should be in super and would initialized them + self.norm = norm # should be in super and would initialized them + self.mel_filters = self.get_mel_filter_banks( + n_freqs=int(1 + n_fft // 2), + n_mels=feature_size, + f_min=f_min, + f_max=f_max, + sample_rate=sampling_rate, + norm="htk", + mel_scale="htk", + ) + self.mel_filters_slaney = self.get_mel_filter_banks( + n_freqs=int(1 + n_fft // 2), + n_mels=feature_size, + f_min=f_min, + f_max=f_max, + sample_rate=sampling_rate, + norm="slaney", + mel_scale="slaney", + ) self.top_db = top_db - + def _power_to_db(self, mel_spectrogram, a_min=1e-10, ref=1.0): - """ - Power to db, this function is the numpy implementation of + """ + Power to db, this function is the numpy implementation of librosa.power_to_lb """ log_spec = 10 * np.log10(np.clip(mel_spectrogram, a_min=a_min, a_max=None)) @@ -102,10 +118,10 @@ def _power_to_db(self, mel_spectrogram, a_min=1e-10, ref=1.0): if 
self.top_db is not None: if self.top_db < 0: raise ValueError("top_db must be non-negative") - log_spec = np.clip(log_spec, min=np.maximum(log_spec) - self.top_db, max=np.inf) + log_spec = np.clip(log_spec, min=np.maximum(log_spec) - self.top_db, max=np.inf) return log_spec - - def _np_extract_fbank_features(self, waveform: np.array, mel_filters:Optional[np.array]) -> np.ndarray: + + def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[np.array]) -> np.ndarray: """ Compute the log-Mel spectrogram of the provided audio, gives similar results whisper's original torch implementation with 1e-5 tolerance. @@ -114,7 +130,7 @@ def _np_extract_fbank_features(self, waveform: np.array, mel_filters:Optional[np frames = self._fram_wave(waveform) stft = self._stft(frames, window=window) - + # if the imaginary parts are taken : (real, imag) = stftl; real ** 2 + imag ** 2 magnitudes = np.abs(stft) ** 2 mel_spec = np.matmul(magnitudes, self.mel_filters) @@ -186,7 +202,7 @@ def _get_audio_features( if truncation == "rand_trunc": longer = True elif truncation == "fusion": - mel = self._np_extract_fbank_features(audio_data) + mel = self._np_extract_fbank_features(waveform) chunk_frames = max_length // self.hop_size + 1 # the +1 related to how the spectrogram is computed total_frames = mel.shape[0] if chunk_frames == total_frames: @@ -201,30 +217,28 @@ def _get_audio_features( else: raise NotImplementedError(f"data_truncating {truncation} not implemented") # random crop to max_length (for compatibility) -> this should be handled by self.pad - overflow = len(audio_data) - max_length + overflow = len(waveform) - max_length idx = np.random.randint(0, overflow + 1) - audio_data = audio_data[idx : idx + max_length] + waveform = waveform[idx : idx + max_length] else: longer = False # only use repeat as a new possible value for padding. 
you repeat the audio before applying the usual max_length padding - if len(audio_data) < max_length and padding == "repeatpad": # do nothing if equal - n_repeat = int(max_length / len(audio_data)) - audio_data = audio_data.repeat(n_repeat) + if len(waveform) < max_length and padding == "repeatpad": # do nothing if equal + n_repeat = int(max_length / len(waveform)) + waveform = waveform.repeat(n_repeat) else: - audio_data = self.pad( - audio_data, + waveform = self.pad( + waveform, padding=padding, max_length=max_length if max_length else self.n_samples, truncation=truncation, pad_to_multiple_of=pad_to_multiple_of, ) if truncation == "fusion": - mel = self._np_extract_fbank_features(audio_data, self.mel_filters_slaney) + mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney) input_mel = np.stack([mel, mel, mel, mel], dim=0) else: - input_mel = self._np_extract_fbank_features( - audio_data, self.mel_filters_slaney - ) + input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney) return input_mel, longer def __call__( @@ -317,30 +331,26 @@ def __call__( padding, max_length if max_length else self.max_length, ) - for waveform in input_features[0] + for waveform in raw_speech ] input_mel = [] is_longer = [] - for mel, longer in input_features: + for mel, longer in padded_inputs: input_mel.append(mel) is_longer.append(longer) if self.enable_fusion and is_longer.sum() == 0: # if no audio is longer than 10s, then randomly select one audio to be longer - rand_idx = np.random.randint(0, len(input_features)) - input_mel[rand_idx] = True - - if isinstance(input_features[0]["mel"], List): - padded_inputs["input_features"] = [np.asarray(mel, dtype=np.float32) for feature in input_mel] - else: - padded_inputs["input_features"] = input_features + rand_idx = np.random.randint(0, len(input_mel)) + is_longer[rand_idx] = True - # make sure list is in array format - input_features = padded_inputs.get("input_features").transpose(2, 0, 1) + if isinstance(input_mel[0], List): + input_mel = [np.asarray(mel, dtype=np.float32) for feature in input_mel] if return_tensors is not None: - padded_inputs = padded_inputs.convert_to_tensors(return_tensors) + input_mel = input_mel.convert_to_tensors(return_tensors) is_longer = is_longer.convert_to_tensors(return_tensors) - return padded_inputs, is_longer + input_features = {"input_features": input_mel, "is_longer": is_longer} + return input_features From b5c483fbb650ee1391473a1f2b7dea9a05058b7d Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 2 Feb 2023 09:49:13 +0000 Subject: [PATCH 014/197] remove whisper copioed from --- .../feature_extraction_sequence_utils.py | 31 ++++++++++--------- .../models/clap/feature_extraction_clap.py | 9 +++--- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/src/transformers/feature_extraction_sequence_utils.py b/src/transformers/feature_extraction_sequence_utils.py index 818fc8ee41f6..26a4c6856bd6 100644 --- a/src/transformers/feature_extraction_sequence_utils.py +++ b/src/transformers/feature_extraction_sequence_utils.py @@ -475,18 +475,23 @@ def get_mel_filter_banks( ) -> np.array: r"""Create a frequency bin conversion matrix used to obtain the Mel Frequency Cepstral Coefficient. This is called a `mel filter bank`, and various implementation exist, which differ in the number of filters, - the shape of the filters, the way the filters are spaced, the bandwidth of - the filters, and the manner in which the spectrum is warped. 
The goal of these features is to approximate the non-linear human perception - of the variation in pitch with respect to the frequency. - This code is heavily inspired from the `torchaudio` implementation, refer to XXX for more details. + the shape of the filters, the way the filters are spaced, the bandwidth of the filters, and the manner in which + the spectrum is warped. The goal of these features is to approximate the non-linear human perception of the + variation in pitch with respect to the frequency. This code is heavily inspired from the `torchaudio` + implementation, refer to XXX for more details. Note: - We will try to specify which variation correspond to which MFCCs from the litterature. The main features are: - - MFCC FB-20: introduced in 1980 by Davis and Mermelstein [4]; Davis and Mermelstein assume sampling frequency of 10 kHz; speech bandwidth [0, 4600] Hz - - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) described in Young, 1995 [5]; Young uses a filter bank of 24 filters for speech bandwidth [0, 8000] Hz (sampling rate ≥ 16 kHz) - - MFCC FB-40: from the Auditory Toolbox for MATLAB [6] written by Slaney in 1998; Slaney assumes sampling rate of 16 kHz, and speech bandwidth [133, 6854] Hz - - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris, 2004 [3]; Skowronski and Harris assume sampling rate of 12.5 kHz and speech bandwidth [0, 6250] Hz + We will try to specify which variation correspond to which MFCCs from the litterature. The main features + are: + - MFCC FB-20: introduced in 1980 by Davis and Mermelstein [4]; Davis and Mermelstein assume sampling + frequency of 10 kHz; speech bandwidth [0, 4600] Hz + - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) described in Young, 1995 [5]; Young uses a + filter bank of 24 filters for speech bandwidth [0, 8000] Hz (sampling rate ≥ 16 kHz) + - MFCC FB-40: from the Auditory Toolbox for MATLAB [6] written by Slaney in 1998; Slaney assumes + sampling rate of 16 kHz, and speech bandwidth [133, 6854] Hz + - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris, 2004 [3]; Skowronski and + Harris assume sampling rate of 12.5 kHz and speech bandwidth [0, 6250] Hz Args: @@ -500,11 +505,9 @@ def get_mel_filter_banks( mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) Returns: - Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_mels``) - meaning number of frequencies to highlight/apply to x the number of filterbanks. - Each column is a filterbank so that assuming there is a matrix A of - size (..., ``n_freqs``), the applied result would be - ``A * melscale_fbanks(A.size(-1), ...)``. + Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_mels``) meaning number of frequencies + to highlight/apply to x the number of filterbanks. Each column is a filterbank so that assuming there is a + matrix A of size (..., ``n_freqs``), the applied result would be ``A * melscale_fbanks(A.size(-1), ...)``. 
""" diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index d2e2624e3906..162d342a38d3 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -27,7 +27,6 @@ logger = logging.get_logger(__name__) -# Copied from transformers.models.whisper.feature_extraction_whisper.WhisperFeatureExtractor with Whisper->CLAP class CLAPFeatureExtractor(SequenceFeatureExtractor): r""" Constructs a CLAP feature extractor. @@ -110,8 +109,7 @@ def __init__( def _power_to_db(self, mel_spectrogram, a_min=1e-10, ref=1.0): """ - Power to db, this function is the numpy implementation of - librosa.power_to_lb + Power to db, this function is the numpy implementation of librosa.power_to_lb """ log_spec = 10 * np.log10(np.clip(mel_spectrogram, a_min=a_min, a_max=None)) log_spec -= 10.0 * np.log10(np.maximum(a_min, ref)) @@ -265,8 +263,9 @@ def __call__( pad_to_multiple_of (`int`, *optional*, defaults to None): If set will pad the sequence to a multiple of the provided value. - This is especially useful to enable the use of np.array Cores on NVIDIA hardware with compute capability - `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. + This is especially useful to enable the use of np.array Cores on NVIDIA hardware with compute + capability `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of + 128. return_attention_mask (`bool`, *optional*): Whether to return the attention mask. If left to the default, will return the attention mask according to the specific feature_extractor's default. From fc0d32365347147b516aece5c347ff92564111aa Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 2 Feb 2023 11:09:19 +0000 Subject: [PATCH 015/197] audio logits match --- .../models/clap/configuration_clap.py | 34 ++++---- src/transformers/models/clap/modeling_clap.py | 83 +++++++++++++------ 2 files changed, 74 insertions(+), 43 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 5d065d294798..1a45555cd260 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -326,7 +326,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], # get the vision config dict if we are loading from CLAPConfig if config_dict.get("model_type") == "clap": - config_dict = config_dict["vision_config"] + config_dict = config_dict["audio_config"] if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( @@ -350,7 +350,7 @@ class CLAPConfig(PretrainedConfig): Args: text_config (`dict`, *optional*): Dictionary of configuration options used to initialize [`CLAPTextConfig`]. - vision_config (`dict`, *optional*): + audio_config (`dict`, *optional*): Dictionary of configuration options used to initialize [`CLAPAudioConfig`]. projection_dim (`int`, *optional*, defaults to 512): Dimentionality of text and vision projection layers. 
@@ -380,7 +380,7 @@ class CLAPConfig(PretrainedConfig): >>> config_text = CLAPTextConfig() >>> config_vision = CLAPAudioConfig() - >>> config = CLAPConfig.from_text_vision_configs(config_text, config_vision) + >>> config = CLAPConfig.from_text_audio_configs(config_text, config_vision) ```""" model_type = "clap" @@ -389,7 +389,7 @@ class CLAPConfig(PretrainedConfig): def __init__( self, text_config=None, - vision_config=None, + audio_config=None, logit_scale_init_value=(1 / 0.07), fusion_num_hidden_layers=2, projection_dim=512, @@ -400,31 +400,31 @@ def __init__( # If `_config_dict` exist, we use them for the backward compatibility. text_config_dict = kwargs.pop("text_config_dict", None) - vision_config_dict = kwargs.pop("vision_config_dict", None) + audio_config_dict = kwargs.pop("audio_config_dict", None) if text_config_dict is not None: text_config = text_config_dict - if vision_config_dict is not None: - vision_config = vision_config_dict + if audio_config_dict is not None: + audio_config = audio_config_dict if text_config is None: text_config = {} logger.info("text_config is None. Initializing the CLAPTextConfig with default values.") - if vision_config is None: - vision_config = {} - logger.info("vision_config is None. initializing the CLAPAudioConfig with default values.") + if audio_config is None: + audio_config = {} + logger.info("audio_config is None. initializing the CLAPAudioConfig with default values.") self.text_config = CLAPTextConfig(**text_config) - self.vision_config = CLAPAudioConfig(**vision_config) + self.audio_config = CLAPAudioConfig(**audio_config) self.text_config.fusion_num_hidden_layers = fusion_num_hidden_layers - self.vision_config.fusion_num_hidden_layers = fusion_num_hidden_layers + self.audio_config.fusion_num_hidden_layers = fusion_num_hidden_layers self.text_config.projection_dim = projection_dim - self.vision_config.projection_dim = projection_dim + self.audio_config.projection_dim = projection_dim self.text_config.projection_hidden_act = projection_hidden_act - self.vision_config.projection_hidden_act = projection_hidden_act + self.audio_config.projection_hidden_act = projection_hidden_act self.projection_dim = projection_dim self.projection_hidden_act = projection_hidden_act @@ -434,7 +434,7 @@ def __init__( self.initializer_factor = 1.0 @classmethod - def from_text_vision_configs(cls, text_config: CLAPTextConfig, vision_config: CLAPAudioConfig, **kwargs): + def from_text_audio_configs(cls, text_config: CLAPTextConfig, audio_config: CLAPAudioConfig, **kwargs): r""" Instantiate a [`CLAPConfig`] (or a derived class) from clap text model configuration and clap vision model configuration. 
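The renamed classmethod can be exercised as in the sketch below, assuming default sub-configurations; the serialized dictionary now carries an `audio_config` entry where `vision_config` used to be:

```python
from transformers import CLAPAudioConfig, CLAPConfig, CLAPTextConfig

# Compose the full configuration from its two sub-configurations.
text_config = CLAPTextConfig()
audio_config = CLAPAudioConfig()
config = CLAPConfig.from_text_audio_configs(text_config, audio_config)

# Round-trip through a plain dict to check that the audio sub-config is serialized
# under the renamed key.
assert "audio_config" in config.to_dict()
```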
@@ -443,7 +443,7 @@ def from_text_vision_configs(cls, text_config: CLAPTextConfig, vision_config: CL [`CLAPConfig`]: An instance of a configuration object """ - return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs) + return cls(text_config=text_config.to_dict(), audio_config=audio_config.to_dict(), **kwargs) def to_dict(self): """ @@ -454,6 +454,6 @@ def to_dict(self): """ output = copy.deepcopy(self.__dict__) output["text_config"] = self.text_config.to_dict() - output["vision_config"] = self.vision_config.to_dict() + output["audio_config"] = self.audio_config.to_dict() output["model_type"] = self.__class__.model_type return output diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index a5743c791a38..1ad554bc8d27 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -135,6 +135,27 @@ class CLAPTextModelOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None +@dataclass +class CLAPAudioModelOutput(ModelOutput): + """ + Base class for text model's outputs that also contains a pooling of the last hidden states. + + Args: + framewise_output (`torch.FloatTensor` of shape `(batch_size, num_frames, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + clipwise_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + fine_grained_embedding (`torch.FloatTensor` of shape `(batch_size, num_frames, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + embedding (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + """ + framewise_output: torch.FloatTensor = None + clipwise_output: torch.FloatTensor = None + fine_grained_embedding: torch.FloatTensor = None + embedding: torch.FloatTensor = None + + @dataclass # Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->CLAP class CLAPOutput(ModelOutput): @@ -278,7 +299,8 @@ def __init__(self, config: CLAPAudioConfig): self.grid_size = (img_size[0] // patch_stride[0], img_size[1] // patch_stride[1]) self.num_patches = self.grid_size[0] * self.grid_size[1] self.flatten = config.flatten_patch_embeds - self.enable_fusion = config.enable_patch_fusion + self.enable_patch_fusion = config.enable_patch_fusion + self.enable_fusion = config.enable_fusion self.fusion_type = config.fusion_type padding = ((patch_size[0] - patch_stride[0]) // 2, (patch_size[1] - patch_stride[1]) // 2) @@ -293,9 +315,12 @@ def __init__(self, config: CLAPAudioConfig): padding=padding, ) + self.norm = nn.LayerNorm(config.patch_embeds_hidden_size) if config.enable_patch_layer_norm else nn.Identity() - if self.enable_fusion: + if self.enable_patch_fusion: self.fusion_model = CLAPAudioAFFBlock(config) + self.mel_conv2d = nn.Conv2d(config.patch_embed_input_channels, config.patch_embeds_hidden_size, kernel_size=(patch_size[0], patch_size[1]*3), stride=(patch_stride[0], patch_stride[1] * 3), padding=padding) + def forward(self, x, longer_idx=None): if self.enable_fusion: @@ -1669,29 +1694,29 @@ def __init__(self, config: CLAPConfig): f" {type(config.text_config)}." 
) - if not isinstance(config.vision_config, CLAPAudioConfig): + if not isinstance(config.audio_config, CLAPAudioConfig): raise ValueError( - "config.vision_config is expected to be of type CLAPAudioConfig but is of type" - f" {type(config.vision_config)}." + "config.audio_config is expected to be of type CLAPAudioConfig but is of type" + f" {type(config.audio_config)}." ) text_config = config.text_config - vision_config = config.vision_config + audio_config = config.audio_config self.logit_scale_a = nn.Parameter(torch.ones([]) * np.log(config.logit_scale_init_value)) self.logit_scale_t = nn.Parameter(torch.ones([]) * np.log(config.logit_scale_init_value)) self.projection_dim = config.projection_dim self.text_hidden_size = text_config.hidden_size - self.vision_hidden_size = vision_config.hidden_size + self.vision_hidden_size = audio_config.hidden_size self.text_model = CLAPTextModel(text_config) self.text_transform = CLAPFusionLayer(text_config) self.text_projection = CLAPProjectionLayer(text_config) - self.audio_model = CLAPSwinTransformer(config=vision_config) - self.audio_transform = CLAPFusionLayer(vision_config) - self.audio_projection = CLAPProjectionLayer(vision_config) + self.audio_model = CLAPSwinTransformer(config=audio_config) + self.audio_transform = CLAPFusionLayer(audio_config) + self.audio_projection = CLAPProjectionLayer(audio_config) # Initialize weights and apply final processing self.post_init() @@ -1767,13 +1792,10 @@ def get_audio_features( mel_fusion=mel_fusion, longer=longer, waveform=waveform, - # attention_mask=attention_mask, - # output_attentions=output_attentions, - # output_hidden_states=output_hidden_states, - # return_dict=return_dict, + return_dict=return_dict, ) - pooled_output = audio_outputs[1] + pooled_output = audio_outputs[-1] if not return_dict else audio_outputs.embedding audio_features = self.audio_projection(pooled_output) audio_features = F.normalize(audio_features, dim=-1) @@ -2196,14 +2218,8 @@ def forward_features(self, x, longer_idx=None): x = self.avgpool(x) x = torch.flatten(x, 1) - output_dict = { - "framewise_output": fpx, # already sigmoided - "clipwise_output": torch.sigmoid(x), - "fine_grained_embedding": fine_grained_latent_output, - "embedding": latent_output, - } + return (fpx, torch.sigmoid(x), fine_grained_latent_output, latent_output) - return output_dict def crop_wav(self, x, crop_size, spe_pos=None): time_steps = x.shape[2] @@ -2240,8 +2256,12 @@ def forward( longer=None, waveform=None, mixup_lambda=None, - device=None + device=None, + return_dict=False, ): # out_feat_keys: List[str] = None): + mel_fusion = mel_fusion[None, :].to(0) + waveform = waveform[None, :].to(0) + if self.enable_fusion and longer.sum() == 0: # if no audio is longer than 10s, then randomly select one audio to be longer @@ -2261,7 +2281,7 @@ def forward( x = do_mixup(x, mixup_lambda) x = self.reshape_wav2img(x) - output_dict = self.forward_features(x) + output = self.forward_features(x) else: longer_list = longer.to(device=device, non_blocking=True) x = mel_fusion.to(device=device, non_blocking=True) @@ -2277,6 +2297,17 @@ def forward( x = do_mixup(x, mixup_lambda) x = self.reshape_wav2img(x) - output_dict = self.forward_features(x, longer_idx=longer_list_idx) + output = self.forward_features(x, longer_idx=longer_list_idx) + + if not return_dict: + return output + + framewise_output, clipwise_output, fine_grained_embedding, output_embeddingss = output + + return CLAPAudioModelOutput( + framewise_output=framewise_output, + clipwise_output=clipwise_output, + 
fine_grained_embedding=fine_grained_embedding, + embedding=output_embeddingss, + ) - return output_dict From 8a2772345dd4763cfa513b96f9d6e924c71f1f1e Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 2 Feb 2023 11:27:19 +0000 Subject: [PATCH 016/197] add doc --- .../feature_extraction_sequence_utils.py | 85 ++++++++++++++----- 1 file changed, 62 insertions(+), 23 deletions(-) diff --git a/src/transformers/feature_extraction_sequence_utils.py b/src/transformers/feature_extraction_sequence_utils.py index 26a4c6856bd6..44906fb72cd4 100644 --- a/src/transformers/feature_extraction_sequence_utils.py +++ b/src/transformers/feature_extraction_sequence_utils.py @@ -473,7 +473,8 @@ def get_mel_filter_banks( norm: Optional[str] = None, mel_scale: str = "htk", ) -> np.array: - r"""Create a frequency bin conversion matrix used to obtain the Mel Frequency Cepstral Coefficient. + r""" + Create a frequency bin conversion matrix used to obtain the Mel Frequency Cepstral Coefficient. This is called a `mel filter bank`, and various implementation exist, which differ in the number of filters, the shape of the filters, the way the filters are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these features is to approximate the non-linear human perception of the @@ -495,14 +496,20 @@ def get_mel_filter_banks( Args: - n_freqs (int): Number of frequencies to highlight/apply - f_min (float): Minimum frequency (Hz) - f_max (float): Maximum frequency (Hz) - n_mels (int): Number of mel filterbanks - sample_rate (int): Sample rate of the audio waveform - norm (str or None, optional): If "slaney", divide the triangular mel weights by the width of the mel band - (area normalization). (Default: ``None``) - mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + n_freqs (int): + Number of frequencies to highlight/apply + f_min (float): + Minimum frequency (Hz) + f_max (float): + Maximum frequency (Hz) + n_mels (int): + Number of mel filterbanks + sample_rate (int): + Sample rate of the audio waveform + norm (str or None, optional): + If "slaney", divide the triangular mel weights by the width of the mel band (area normalization). (Default: ``None``) + mel_scale (str, optional): + Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) Returns: Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_mels``) meaning number of frequencies @@ -543,8 +550,17 @@ def get_mel_filter_banks( def _stft(self, frames, window): """ - Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same - results as `torch.stft`. + Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the + same results as `torch.stft`. + + Args: + frames (`np.array` of dimension `(num_frames, self.n_fft)`): + A framed audio signal obtained using `self._fram_wav`. + window (`np.array` of dimension `(self.n_freqs, self.n_mels)`: + A array reprensenting the function that will be used to reduces the amplitude of the + discontinuities at the boundaries of each frame when computing the FFT. Each frame will + be multiplied by the window. 
For more information on this phenomena, called *Spectral leakage*, + refer to [this tutorial]https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf """ frame_size = frames.shape[1] fft_size = self.n_fft @@ -565,30 +581,51 @@ def _stft(self, frames, window): np.multiply(frame, window, out=fft_signal[:frame_size]) else: fft_signal[:frame_size] = frame - # TODO can we use fftn on the other dimensions? data[f] = fft(fft_signal, axis=0)[:num_fft_bins] return data.T - def _fram_wave(self, waveform, center=True): + def _power_to_db(self, mel_spectrogram, a_min=1e-10, ref=1.0): """ - Transform a raw waveform into a list of smaller waveforms. The window length defines how much of the signal is - contain in each frame (smalle waveform), while the hope length defines the step between the beginning of each - new frame. + Convert a mel spectrogram from power to db, this function is the numpy implementation of librosa.power_to_lb. + """ + log_spec = 10 * np.log10(np.clip(mel_spectrogram, a_min=a_min, a_max=None)) + log_spec -= 10.0 * np.log10(np.maximum(a_min, ref)) + if self.top_db is not None: + if self.top_db < 0: + raise ValueError("top_db must be non-negative") + log_spec = np.clip(log_spec, min=np.maximum(log_spec) - self.top_db, max=np.inf) + return log_spec + + def _fram_wave(self, waveform: np.array, center: bool = True): + """ + In order to compute the short time fourier transform, the waveform needs to be split in overlapping windowed + segments called `frames`. + + The window length (self.window_length) defines how much of the signal is contained in each frame, while the hop + length defines the step between the beginning of each new frame. - Centering is done by reflecting the waveform which is first centered around `frame_idx * hop_length`. + **This method does not support batching yet as we are mainly focus on inference. If you want this to be added + feel free to open an issue and ping @arthurzucker on Github** + + Args: + waveform (`np.array`) of shape (sample_length,): + The raw waveform which will be split into smaller chunks. + center (`bool`, defaults to `True`): + Whether or not to center each frame around the middle of the frame. Centering is done by reflecting the + waveform on the left and on the right. + + Return: + framed_waveform (`np.array` of shape (waveform.shape // self.hop_length , self.n_fft)): + The framed waveforms that can be fed `np.fft`. 
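# A rough NumPy sketch of the framing described above: centered frames of length `n_fft` taken every
# `hop_length` samples. `toy_fram_wave` is a hypothetical helper that pads the whole signal once with
# `np.pad` instead of padding frame by frame, so it approximates rather than reproduces `_fram_wave`.
import numpy as np


def toy_fram_wave(waveform, n_fft=400, hop_length=160):
    half = n_fft // 2
    padded = np.pad(waveform, (half, half), mode="reflect")  # center each frame on i * hop_length
    n_frames = 1 + waveform.shape[0] // hop_length
    frames = np.stack([padded[i * hop_length : i * hop_length + n_fft] for i in range(n_frames)])
    return frames  # shape: (n_frames, n_fft)


frames = toy_fram_wave(np.random.randn(16_000))
print(frames.shape)  # (101, 400)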
""" + frames = [] for i in range(0, waveform.shape[0] + 1, self.hop_length): half_window = (self.n_fft - 1) // 2 + 1 if center: start = i - half_window if i > half_window else 0 end = i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0] - frame = waveform[start:end] - - # TODO can all of this be automatically replaced with np.pad(audio,self.n_fft // 2, self.n_fft // 2), mode=self.pad_mode) - # as we have an array of frames - if start == 0: padd_width = (-i + half_window, 0) frame = np.pad(frame, pad_width=padd_width, mode="reflect") @@ -606,4 +643,6 @@ def _fram_wave(self, waveform, center=True): ) frames.append(frame) - return np.stack(frames, 0) + framed_waveform = np.stack(frames, 0) + + return framed_waveform From 27f133f04d037f97ce4e736c61bcfd90e3bc5e09 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 2 Feb 2023 11:28:27 +0000 Subject: [PATCH 017/197] correct filters mel and add maxlength --- .../models/clap/configuration_clap.py | 9 +++------ .../models/clap/feature_extraction_clap.py | 19 +++++-------------- 2 files changed, 8 insertions(+), 20 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 5d065d294798..80dea55991b8 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -254,16 +254,13 @@ def __init__( aff_block_r=4, enable_patch_fusion=False, spectrogram_window_size=1024, - spectrogram_window='hann', - spectrogram_center=True, - spectrogram_pad_mode='reflect', spectrogram_freeze_parameters=True, spectrogram_ref=1.0, spectrogram_amin=1e-10, spectrogram_top_db=None, - spectrogram_time_drop_width=64, - spectrogram_time_stripes_num=2, - spectrogram_freq_drop_width=8, + spectrogram_time_drop_width=64, + spectrogram_time_stripes_num=2, + spectrogram_freq_drop_width=8, spectrogram_freq_stripes_num=2, **kwargs ): diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 162d342a38d3..67711bd0e7b7 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -68,7 +68,7 @@ def __init__( f_min: float = 0, f_max: float = 14000, top_db: int = None, - mel_scale: str = "htk", + max_length: int = 48000, **kwargs ): super().__init__( @@ -78,6 +78,7 @@ def __init__( return_attention_mask=return_attention_mask, **kwargs, ) + self.max_length = max_length self.n_fft = n_fft self.hop_length = hop_length self.chunk_length = chunk_length @@ -93,7 +94,7 @@ def __init__( f_min=f_min, f_max=f_max, sample_rate=sampling_rate, - norm="htk", + norm=None, mel_scale="htk", ) self.mel_filters_slaney = self.get_mel_filter_banks( @@ -107,17 +108,7 @@ def __init__( ) self.top_db = top_db - def _power_to_db(self, mel_spectrogram, a_min=1e-10, ref=1.0): - """ - Power to db, this function is the numpy implementation of librosa.power_to_lb - """ - log_spec = 10 * np.log10(np.clip(mel_spectrogram, a_min=a_min, a_max=None)) - log_spec -= 10.0 * np.log10(np.maximum(a_min, ref)) - if self.top_db is not None: - if self.top_db < 0: - raise ValueError("top_db must be non-negative") - log_spec = np.clip(log_spec, min=np.maximum(log_spec) - self.top_db, max=np.inf) - return log_spec + def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[np.array]) -> np.ndarray: """ @@ -184,7 +175,7 @@ def _random_mel_fusion(self, mel, total_frames, chunk_frames): return mel_fusion def 
_get_audio_features( - self, waveform: np.array, max_length, padding, pad_to_multiple_of, truncation, filling + self, waveform: np.array, max_length, padding, pad_to_multiple_of, truncation ) -> np.array: """ Possible cases : From 5ddc2f3e3629bb3feffa57d2f024d41c5235bc48 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 2 Feb 2023 11:28:49 +0000 Subject: [PATCH 018/197] style --- src/transformers/models/clap/modeling_clap.py | 51 ++++++++----------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index a5743c791a38..b86f0e0e9d38 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -27,8 +27,8 @@ from torch import nn from torch.nn.init import _calculate_fan_in_and_fan_out -from torchlibrosa.stft import Spectrogram, LogmelFilterBank from torchlibrosa.augmentation import SpecAugmentation +from torchlibrosa.stft import LogmelFilterBank, Spectrogram from ...activations import ACT2FN from ...modeling_outputs import ( @@ -1755,8 +1755,7 @@ def get_audio_features( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> torch.FloatTensor: - r""" - """ + r""" """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -2118,33 +2117,33 @@ def __init__(self, config: CLAPAudioConfig): self.head = nn.Linear(self.num_classes, self.num_classes) self.spectrogram_extractor = Spectrogram( - n_fft=config.spectrogram_window_size, - hop_length=config.hop_size, - win_length=config.spectrogram_window_size, - window=config.spectrogram_window, - center=config.spectrogram_center, - pad_mode=config.spectrogram_pad_mode, - freeze_parameters=config.spectrogram_freeze_parameters + n_fft=config.spectrogram_window_size, + hop_length=config.hop_size, + win_length=config.spectrogram_window_size, + window=config.spectrogram_window, + center=config.spectrogram_center, + pad_mode=config.spectrogram_pad_mode, + freeze_parameters=config.spectrogram_freeze_parameters, ) # Logmel feature extractor self.logmel_extractor = LogmelFilterBank( - sr=config.sample_rate, - n_fft=config.spectrogram_window_size, - n_mels=config.mel_bins, - fmin=config.fmin, - fmax=config.fmax, - ref=config.spectrogram_ref, - amin=config.spectrogram_amin, - top_db=config.spectrogram_top_db, + sr=config.sample_rate, + n_fft=config.spectrogram_window_size, + n_mels=config.mel_bins, + fmin=config.fmin, + fmax=config.fmax, + ref=config.spectrogram_ref, + amin=config.spectrogram_amin, + top_db=config.spectrogram_top_db, freeze_parameters=config.spectrogram_freeze_parameters, ) # Spec augmenter self.spec_augmenter = SpecAugmentation( - time_drop_width=config.spectrogram_time_drop_width, - time_stripes_num=config.spectrogram_time_stripes_num, - freq_drop_width=config.spectrogram_freq_drop_width, + time_drop_width=config.spectrogram_time_drop_width, + time_stripes_num=config.spectrogram_time_stripes_num, + freq_drop_width=config.spectrogram_freq_drop_width, freq_stripes_num=config.spectrogram_freq_stripes_num, - ) + ) def _init_weights(self, m): pass @@ -2235,12 +2234,7 @@ def reshape_wav2img(self, x): return x def forward( - self, - mel_fusion=None, - longer=None, - waveform=None, - mixup_lambda=None, - device=None + self, mel_fusion=None, longer=None, waveform=None, mixup_lambda=None, 
device=None ): # out_feat_keys: List[str] = None): if self.enable_fusion and longer.sum() == 0: @@ -2270,7 +2264,6 @@ def forward( x = x.transpose(1, 3) longer_list_idx = torch.where(longer_list)[0] - if self.training: x = self.spec_augmenter(x) if self.training and mixup_lambda is not None: From 815c5cefebc43672d2d54beba744bbc0382b7427 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 2 Feb 2023 12:49:37 +0000 Subject: [PATCH 019/197] few fixes --- .../models/clap/configuration_clap.py | 6 +- src/transformers/models/clap/modeling_clap.py | 299 ++++++++---------- 2 files changed, 131 insertions(+), 174 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 7f470ff5e352..563986196a62 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -304,9 +304,9 @@ def __init__( self.aff_block_r = aff_block_r self.enable_patch_fusion = enable_patch_fusion self.spectrogram_window_size = spectrogram_window_size - self.spectrogram_window = spectrogram_window - self.spectrogram_center = spectrogram_center - self.spectrogram_pad_mode = spectrogram_pad_mode + # self.spectrogram_window = spectrogram_window + # self.spectrogram_center = spectrogram_center + # self.spectrogram_pad_mode = spectrogram_pad_mode self.spectrogram_freeze_parameters = spectrogram_freeze_parameters self.spectrogram_ref = spectrogram_ref self.spectrogram_amin = spectrogram_amin diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 33e193447037..34f1efa27119 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -37,7 +37,7 @@ BaseModelOutputWithPoolingAndCrossAttentions, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...pytorch_utils import apply_chunking_to_forward from ...utils import ( ModelOutput, add_start_docstrings, @@ -194,50 +194,43 @@ def to_tuple(self) -> Tuple[Any]: ) -# from PyTorch internals -def _ntuple(n): - def parse(x): - if isinstance(x, collections.abc.Iterable): - return x - return tuple(repeat(x, n)) - - return parse - - -to_1tuple = _ntuple(1) -to_2tuple = _ntuple(2) -to_3tuple = _ntuple(3) -to_4tuple = _ntuple(4) -to_ntuple = _ntuple - - -def drop_path(x, drop_prob: float = 0.0, training: bool = False): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). +class CLAPDropPath(nn.Module): + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the argument. 
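# A minimal standalone sketch of the per-sample stochastic depth described above: during training the
# residual branch of each sample is zeroed with probability `drop_prob` and the survivors are rescaled
# by 1 / keep_prob. `toy_drop_path` is an illustrative function, not the module defined in this file.
import torch


def toy_drop_path(hidden_states, drop_prob=0.2, training=True):
    if drop_prob == 0.0 or not training:
        return hidden_states
    keep_prob = 1.0 - drop_prob
    # one Bernoulli draw per sample, broadcast over all remaining dimensions
    shape = (hidden_states.shape[0],) + (1,) * (hidden_states.ndim - 1)
    mask = torch.rand(shape, dtype=hidden_states.dtype, device=hidden_states.device).add_(keep_prob).floor_()
    return hidden_states / keep_prob * mask


out = toy_drop_path(torch.ones(8, 16, 32))
# on average ~20% of the 8 samples are zeroed, the remaining ones are scaled to 1 / 0.8 = 1.25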
""" - if drop_prob == 0.0 or not training: - return x - keep_prob = 1 - drop_prob - shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets - random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) - random_tensor.floor_() # binarize - output = x.div(keep_prob) * random_tensor - return output + def __init__(self, drop_prob=None): + super(CLAPDropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states): + if self.drop_prob == 0.0 or not self.training: + return hidden_states + + keep_prob = 1 - self.drop_prob + shape = (hidden_states.shape[0],) + (1,) * (hidden_states.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + + random_tensor = keep_prob + torch.rand(shape, dtype=hidden_states.dtype, device=hidden_states.device) + random_tensor.floor_() # binarize + output = hidden_states.div(keep_prob) * random_tensor + return output + +# Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/feature_fusion.py#L133 class CLAPAudioAFFBlock(nn.Module): r""" - TODO: add docstring + AFF Block from CLAP, since in CLAP we are always in 2D mode, it is not needed to + implement the 1D version. """ - def __init__(self, config: CLAPAudioConfig): super(CLAPAudioAFFBlock, self).__init__() channels = config.patch_embeds_hidden_size - r = config.aff_block_r - inter_channels = int(channels // r) + downsize_ratio = config.aff_block_r + inter_channels = int(channels // downsize_ratio) self.local_att = nn.Sequential( nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0), @@ -257,31 +250,14 @@ def __init__(self, config: CLAPAudioConfig): self.sigmoid = nn.Sigmoid() - def forward(self, x, residual): - flag = False - xa = x + residual - if xa.size(0) == 1: - xa = torch.cat([xa, xa], dim=0) - flag = True - xl = self.local_att(xa) - xg = self.global_att(xa) - xlg = xl + xg - wei = self.sigmoid(xlg) - xo = 2 * x * wei + 2 * residual * (1 - wei) - if flag: - xo = xo[0].unsqueeze(0) - return xo - - -class DropPath(nn.Module): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + def forward(self, hidden_states, residual): + attention_input = hidden_states + residual - def __init__(self, drop_prob=None): - super(DropPath, self).__init__() - self.drop_prob = drop_prob + fused_layer_output = self.local_att(attention_input) + self.global_att(attention_input) + fused_layer_output = self.sigmoid(fused_layer_output) - def forward(self, x): - return drop_path(x, self.drop_prob, self.training) + output = 2 * hidden_states * fused_layer_output + 2 * residual * (1 - fused_layer_output) + return output class CLAPAudioPatchEmbed(nn.Module): @@ -289,15 +265,16 @@ class CLAPAudioPatchEmbed(nn.Module): def __init__(self, config: CLAPAudioConfig): super().__init__() - img_size = to_2tuple(config.spec_size) - patch_size = to_2tuple(config.patch_size) - patch_stride = to_2tuple(config.patch_stride) + img_size = (config.spec_size, config.spec_size) if isinstance(config.spec_size, int) else config.spec_size + patch_size = (config.patch_size, config.patch_size) if isinstance(config.patch_size, int) else config.patch_size + patch_stride = (config.patch_stride, config.patch_stride) if isinstance(config.patch_stride, int) else config.patch_stride self.img_size = img_size self.patch_stride = patch_stride self.grid_size = (img_size[0] // patch_stride[0], img_size[1] // patch_stride[1]) self.num_patches = self.grid_size[0] * 
self.grid_size[1] + self.flatten = config.flatten_patch_embeds self.enable_patch_fusion = config.enable_patch_fusion self.enable_fusion = config.enable_fusion @@ -315,7 +292,6 @@ def __init__(self, config: CLAPAudioConfig): padding=padding, ) - self.norm = nn.LayerNorm(config.patch_embeds_hidden_size) if config.enable_patch_layer_norm else nn.Identity() if self.enable_patch_fusion: self.fusion_model = CLAPAudioAFFBlock(config) @@ -376,13 +352,14 @@ def __init__(self, in_features, hidden_features=None, out_features=None, config= self.fc2 = nn.Linear(hidden_features, out_features) self.drop = nn.Dropout(config.swin_drop_rate) - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x + def forward(self, hidden_states): + hidden_states = self.fc1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.drop(hidden_states) + + hidden_states = self.fc2(hidden_states) + hidden_states = self.drop(hidden_states) + return hidden_states def _no_grad_trunc_normal_(tensor, mean, std, a, b): @@ -493,7 +470,7 @@ def __init__(self, config, hidden_dim, num_heads): super().__init__() self.hidden_dim = hidden_dim - self.window_size = to_2tuple(config.window_size) # Wh, Ww + self.window_size = (config.window_size, config.window_size) if isinstance(config.window_size, int) else config.window_size self.num_heads = num_heads head_dim = self.hidden_dim // num_heads self.scale = head_dim**-0.5 @@ -593,7 +570,7 @@ def __init__( num_heads=self.num_heads, ) - self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.drop_path = CLAPDropPath(drop_path) if drop_path > 0.0 else nn.Identity() if self.norm_before_mlp == "ln": self.norm2 = nn.LayerNorm(self.hidden_dim) elif self.norm_before_mlp == "bn": @@ -1118,24 +1095,6 @@ def __init__(self, config, position_embedding_type=None): self.output = CLAPTextSelfOutput(config) self.pruned_heads = set() - def prune_heads(self, heads): - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices( - heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads - ) - - # Prune linear layers - self.self.query = prune_linear_layer(self.self.query, index) - self.self.key = prune_linear_layer(self.self.key, index) - self.self.value = prune_linear_layer(self.self.value, index) - self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) - - # Update hyper params and store pruned heads - self.self.num_attention_heads = self.self.num_attention_heads - len(heads) - self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads - self.pruned_heads = self.pruned_heads.union(heads) - def forward( self, hidden_states: torch.Tensor, @@ -1471,13 +1430,6 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.embeddings.word_embeddings = value - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) # Copied from transformers.models.bert.modeling_bert.BertModel.forward def forward( @@ -2071,7 +2023,6 @@ def extra_repr(self): class CLAPSwinTransformer(nn.Module): def __init__(self, config: CLAPAudioConfig): super(CLAPSwinTransformer, self).__init__() - self.config = config self.spec_size = config.spec_size self.patch_stride = config.patch_stride @@ -2138,86 +2089,86 @@ def __init__(self, config: CLAPAudioConfig): ) self.head = nn.Linear(self.num_classes, self.num_classes) - self.spectrogram_extractor = Spectrogram( - n_fft=config.spectrogram_window_size, - hop_length=config.hop_size, - win_length=config.spectrogram_window_size, - window=config.spectrogram_window, - center=config.spectrogram_center, - pad_mode=config.spectrogram_pad_mode, - freeze_parameters=config.spectrogram_freeze_parameters, - ) - # Logmel feature extractor - self.logmel_extractor = LogmelFilterBank( - sr=config.sample_rate, - n_fft=config.spectrogram_window_size, - n_mels=config.mel_bins, - fmin=config.fmin, - fmax=config.fmax, - ref=config.spectrogram_ref, - amin=config.spectrogram_amin, - top_db=config.spectrogram_top_db, - freeze_parameters=config.spectrogram_freeze_parameters, - ) - # Spec augmenter - self.spec_augmenter = SpecAugmentation( - time_drop_width=config.spectrogram_time_drop_width, - time_stripes_num=config.spectrogram_time_stripes_num, - freq_drop_width=config.spectrogram_freq_drop_width, - freq_stripes_num=config.spectrogram_freq_stripes_num, - ) + # self.spectrogram_extractor = Spectrogram( + # n_fft=config.spectrogram_window_size, + # hop_length=config.hop_size, + # win_length=config.spectrogram_window_size, + # window=config.spectrogram_window, + # center=config.spectrogram_center, + # pad_mode=config.spectrogram_pad_mode, + # freeze_parameters=config.spectrogram_freeze_parameters, + # ) + # # Logmel feature extractor + # self.logmel_extractor = LogmelFilterBank( + # sr=config.sample_rate, + # n_fft=config.spectrogram_window_size, + # n_mels=config.mel_bins, + # fmin=config.fmin, + # fmax=config.fmax, + # ref=config.spectrogram_ref, + # amin=config.spectrogram_amin, + # top_db=config.spectrogram_top_db, + # freeze_parameters=config.spectrogram_freeze_parameters, + # ) + # # Spec augmenter + # self.spec_augmenter = SpecAugmentation( + # time_drop_width=config.spectrogram_time_drop_width, + # time_stripes_num=config.spectrogram_time_stripes_num, + # freq_drop_width=config.spectrogram_freq_drop_width, + # freq_stripes_num=config.spectrogram_freq_stripes_num, + # ) + + def _forward_features( + self, + hidden_states, + longer_idx=None + ): + _, _, frames_num, _ = hidden_states.shape - def _init_weights(self, m): - pass - # if isinstance(m, nn.Linear): - # trunc_normal_(m.weight, std=.02) - # if isinstance(m, nn.Linear) and m.bias is not None: - # nn.init.constant_(m.bias, 0) - # elif isinstance(m, nn.LayerNorm): - # nn.init.constant_(m.bias, 0) - # nn.init.constant_(m.weight, 1.0) - - def forward_features(self, x, longer_idx=None): - # A deprecated optimization for using a hierarchical output from different blocks - - frames_num = x.shape[2] - x = self.patch_embed(x, longer_idx=longer_idx) + hidden_states = self.patch_embed(hidden_states, longer_idx=longer_idx) + if self.use_absolute_pos_embedding: - x = x + self.absolute_pos_embed - x = self.pos_drop(x) + hidden_states = hidden_states + 
self.absolute_pos_embed + + hidden_states = self.pos_drop(hidden_states) + for i, layer in enumerate(self.layers): - x, attn = layer(x) - # for x - x = self.norm(x) - B, N, C = x.shape - SF = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[0] - ST = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[1] - x = x.permute(0, 2, 1).contiguous().reshape(B, C, SF, ST) - B, C, F, T = x.shape + hidden_states, _ = layer(hidden_states) + + hidden_states = self.norm(hidden_states) + + batch_size, _, n_channels = hidden_states.shape + + freq_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[0] + temporal_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[1] + + hidden_states = hidden_states.permute(0, 2, 1).contiguous().reshape(batch_size, n_channels, freq_shape, temporal_shape) + + batch_size, n_channels, n_frequencies, n_temp = hidden_states.shape # group 2D CNN - c_freq_bin = F // self.freq_ratio - x = x.reshape(B, C, F // c_freq_bin, c_freq_bin, T) - x = x.permute(0, 1, 3, 2, 4).contiguous().reshape(B, C, c_freq_bin, -1) + c_freq_bin = n_frequencies // self.freq_ratio + hidden_states = hidden_states.reshape(batch_size, n_channels, n_frequencies // c_freq_bin, c_freq_bin, n_temp) + hidden_states = hidden_states.permute(0, 1, 3, 2, 4).contiguous().reshape(batch_size, n_channels, c_freq_bin, -1) # get latent_output - fine_grained_latent_output = torch.mean(x, dim=2) + fine_grained_latent_output = torch.mean(hidden_states, dim=2) fine_grained_latent_output = interpolate( fine_grained_latent_output.permute(0, 2, 1).contiguous(), 8 * self.patch_stride[1] ) - latent_output = self.avgpool(torch.flatten(x, 2)) + latent_output = self.avgpool(torch.flatten(hidden_states, 2)) latent_output = torch.flatten(latent_output, 1) # display the attention map, if needed - x = self.tscam_conv(x) - x = torch.flatten(x, 2) # B, C, T + hidden_states = self.tscam_conv(hidden_states) + hidden_states = torch.flatten(hidden_states, 2) # B, C, T - fpx = interpolate(torch.sigmoid(x).permute(0, 2, 1).contiguous(), 8 * self.patch_stride[1]) + framewise_output = interpolate(torch.sigmoid(hidden_states).permute(0, 2, 1).contiguous(), 8 * self.patch_stride[1]) - x = self.avgpool(x) - x = torch.flatten(x, 1) + hidden_states = self.avgpool(hidden_states) + hidden_states = torch.flatten(hidden_states, 1) - return (fpx, torch.sigmoid(x), fine_grained_latent_output, latent_output) + return (framewise_output, torch.sigmoid(hidden_states), fine_grained_latent_output, latent_output) def crop_wav(self, x, crop_size, spe_pos=None): @@ -2233,7 +2184,7 @@ def crop_wav(self, x, crop_size, spe_pos=None): # Reshape the wavform to a img size, if you want to use the pretrained swin transformer model def reshape_wav2img(self, x): - B, C, T, F = x.shape + _, _, T, F = x.shape target_T = int(self.spec_size * self.freq_ratio) target_F = self.spec_size // self.freq_ratio assert T <= target_T and F <= target_F, "the wav size should less than or equal to the swin input size" @@ -2257,7 +2208,8 @@ def forward( mixup_lambda=None, device=None, return_dict=False, - ): # out_feat_keys: List[str] = None): + ): + # TODO: remove this mel_fusion = mel_fusion[None, :].to(0) waveform = waveform[None, :].to(0) @@ -2267,10 +2219,12 @@ def forward( longer[torch.randint(0, longer.shape[0], (1,))] = True - x = mel_fusion.to(device=device, non_blocking=True) - x = x.transpose(1, 3) - x = self.bn0(x) - x = x.transpose(1, 3) + # TODO: remove .to(device) + mel_fusion = mel_fusion.to(device=device, 
non_blocking=True) + + mel_fusion = mel_fusion.transpose(1, 3) + hidden_states = self.bn0(mel_fusion) + hidden_states = hidden_states.transpose(1, 3) longer_list_idx = None if self.enable_fusion: @@ -2278,12 +2232,16 @@ def forward( longer_list_idx = torch.where(longer_list)[0] if self.training: - x = self.spec_augmenter(x) - if mixup_lambda is not None: - x = do_mixup(x, mixup_lambda) + raise ValueError( + "CLAP does not support training since we need to enable `SpectrogramAugmentation`", + " this will be addressed in a future release." + ) + # x = self.spec_augmenter(x) + # if mixup_lambda is not None: + # x = do_mixup(x, mixup_lambda) - x = self.reshape_wav2img(x) - output = self.forward_features(x, longer_idx=longer_list_idx) + hidden_states = self.reshape_wav2img(hidden_states) + output = self._forward_features(hidden_states, longer_idx=longer_list_idx) if not return_dict: return output @@ -2295,5 +2253,4 @@ def forward( clipwise_output=clipwise_output, fine_grained_embedding=fine_grained_embedding, embedding=output_embeddingss, - ) - + ) \ No newline at end of file From 95c400b5b2c14bf0de981d5fd984b0a3ad84bd44 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 2 Feb 2023 13:27:26 +0000 Subject: [PATCH 020/197] forward passes --- .../feature_extraction_sequence_utils.py | 8 +-- .../models/clap/feature_extraction_clap.py | 59 +++++++++++-------- .../models/clap/processing_clap.py | 1 - 3 files changed, 38 insertions(+), 30 deletions(-) diff --git a/src/transformers/feature_extraction_sequence_utils.py b/src/transformers/feature_extraction_sequence_utils.py index 44906fb72cd4..10da21c6f2fe 100644 --- a/src/transformers/feature_extraction_sequence_utils.py +++ b/src/transformers/feature_extraction_sequence_utils.py @@ -618,7 +618,7 @@ def _fram_wave(self, waveform: np.array, center: bool = True): framed_waveform (`np.array` of shape (waveform.shape // self.hop_length , self.n_fft)): The framed waveforms that can be fed `np.fft`. """ - + # TODO: test if stereo audio works??? frames = [] for i in range(0, waveform.shape[0] + 1, self.hop_length): half_window = (self.n_fft - 1) // 2 + 1 @@ -641,8 +641,8 @@ def _fram_wave(self, waveform: np.array, center: bool = True): frame = np.lib.pad( frame, pad_width=(0, self.n_fft - frame_width), mode="constant", constant_values=0 ) - frames.append(frame) - framed_waveform = np.stack(frames, 0) - return framed_waveform + frames = np.stack(frames, 0) + + return frames diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 67711bd0e7b7..4957ab41d4d1 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -53,7 +53,7 @@ class CLAPFeatureExtractor(SequenceFeatureExtractor): Padding value used to pad the audio. Should correspond to silences. 
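# End-to-end sketch of what the extractor computes for a single waveform: frame, window, FFT,
# mel projection, then power-to-dB. All names, shapes and constants below are illustrative
# (including the random stand-in for the real triangular mel filter bank), not the CLAP defaults.
import numpy as np


def toy_log_mel(waveform, n_fft=1024, hop_length=480, n_mels=64):
    window = np.hanning(n_fft + 1)[:-1]  # periodic Hann window
    n_frames = 1 + (len(waveform) - n_fft) // hop_length
    frames = np.stack([waveform[i * hop_length : i * hop_length + n_fft] for i in range(n_frames)])
    spectrum = np.fft.rfft(frames * window, axis=1)  # (n_frames, n_fft // 2 + 1)
    power = np.abs(spectrum) ** 2
    mel_filters = np.random.rand(n_fft // 2 + 1, n_mels)  # placeholder for a real mel filter bank
    mel_spectrogram = power @ mel_filters  # (n_frames, n_mels)
    return 10.0 * np.log10(np.clip(mel_spectrogram, 1e-10, None))  # power -> dB


log_mel = toy_log_mel(np.random.randn(48_000))
print(log_mel.shape)  # (98, 64)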
""" - model_input_names = ["input_features"] + model_input_names = ["input_features", "is_longer"] def __init__( self, @@ -69,6 +69,8 @@ def __init__( f_max: float = 14000, top_db: int = None, max_length: int = 48000, + truncation:str = "fusion", + padding:str = "repeatpad", **kwargs ): super().__init__( @@ -110,21 +112,25 @@ def __init__( - def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[np.array]) -> np.ndarray: + def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[np.array] = None) -> np.ndarray: """ Compute the log-Mel spectrogram of the provided audio, gives similar results whisper's original torch implementation with 1e-5 tolerance. """ window = np.hanning(self.n_fft + 1)[:-1] + # TODO why don't we take the last value? + # window = np.hanning(self.n_fft + 1)[:-1] + + frames = self._fram_wave(waveform) stft = self._stft(frames, window=window) # if the imaginary parts are taken : (real, imag) = stftl; real ** 2 + imag ** 2 magnitudes = np.abs(stft) ** 2 - mel_spec = np.matmul(magnitudes, self.mel_filters) - - return self._power_to_db(mel_spec) + mel_spec = np.matmul(mel_filters.T, magnitudes) + log_mel_spec = self._power_to_db(mel_spec) + return log_mel_spec.T @staticmethod # Copied from transformers.models.wav2vec2.feature_extraction_wav2vec2.Wav2Vec2FeatureExtractor.zero_mean_unit_var_norm @@ -166,16 +172,18 @@ def _random_mel_fusion(self, mel, total_frames, chunk_frames): mel_chunk_middle = mel[idx_middle : idx_middle + chunk_frames, :] mel_chunk_back = mel[idx_back : idx_back + chunk_frames, :] - # shrink the mel TODO add this as a numpy function - mel_shrink = torchvision.transforms.Resize(size=[chunk_frames, 64])(mel[None])[0] + # shrink the mel TODO add this as a numpy function, also no hard codes `64` + mel_shrink = np.resize(mel, [chunk_frames, 64]) # current flags are probalby wrong + import torch + mel_shrink = torchvision.transforms.Resize(size=[chunk_frames, 64])(torch.tensor(mel[None]))[0] # logging.info(f"mel_shrink.shape: {mel_shrink.shape}") # stack - mel_fusion = np.stack([mel_chunk_front, mel_chunk_middle, mel_chunk_back, mel_shrink], dim=0) + mel_fusion = np.stack([mel_chunk_front, mel_chunk_middle, mel_chunk_back, mel_shrink], axis=0) return mel_fusion def _get_audio_features( - self, waveform: np.array, max_length, padding, pad_to_multiple_of, truncation + self, waveform: np.array, max_length, truncation, padding, pad_to_multiple_of ) -> np.array: """ Possible cases : @@ -187,15 +195,15 @@ def _get_audio_features( - fusion """ - if len(waveform) > max_length: + if waveform.shape[0] > max_length: if truncation == "rand_trunc": longer = True elif truncation == "fusion": - mel = self._np_extract_fbank_features(waveform) - chunk_frames = max_length // self.hop_size + 1 # the +1 related to how the spectrogram is computed + mel = self._np_extract_fbank_features(waveform, self.mel_filters) + chunk_frames = max_length // self.hop_length + 1 # the +1 related to how the spectrogram is computed total_frames = mel.shape[0] if chunk_frames == total_frames: - # there is a corner case where the audio length is larger than max_length but smaller than max_length+hop_size. + # there is a corner case where the audio length is larger than max_length but smaller than max_length+hop_length. # In this case, we just use the whole audio. input_mel = np.stack([mel, mel, mel, mel], dim=0) longer = False @@ -212,7 +220,7 @@ def _get_audio_features( else: longer = False # only use repeat as a new possible value for padding. 
you repeat the audio before applying the usual max_length padding - if len(waveform) < max_length and padding == "repeatpad": # do nothing if equal + if waveform.shape[0] < max_length and padding == "repeatpad": # do nothing if equal n_repeat = int(max_length / len(waveform)) waveform = waveform.repeat(n_repeat) else: @@ -233,11 +241,11 @@ def _get_audio_features( def __call__( self, raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]], - truncation: bool = True, + truncation: str = "fusion", pad_to_multiple_of: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_attention_mask: Optional[bool] = None, - padding: Optional[str] = "max_length", + padding: Optional[str] = "repeatpad", max_length: Optional[int] = None, sampling_rate: Optional[int] = None, **kwargs @@ -302,7 +310,7 @@ def __call__( ) if is_batched: - raw_speech = [np.asarray([speech], dtype=np.float32).T for speech in raw_speech] + raw_speech = [np.asarray(speech, dtype=np.float32) for speech in raw_speech] elif not is_batched and not isinstance(raw_speech, np.ndarray): raw_speech = np.asarray(raw_speech, dtype=np.float32) elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64): @@ -310,16 +318,16 @@ def __call__( # always return batch if not is_batched: - raw_speech = [np.asarray([raw_speech]).T] + raw_speech = [np.asarray(raw_speech)] # convert into correct format for padding padded_inputs = [ self._get_audio_features( waveform, + max_length if max_length else self.max_length, truncation, - pad_to_multiple_of, padding, - max_length if max_length else self.max_length, + pad_to_multiple_of, ) for waveform in raw_speech ] @@ -330,7 +338,7 @@ def __call__( input_mel.append(mel) is_longer.append(longer) - if self.enable_fusion and is_longer.sum() == 0: + if truncation == "fusion" and sum(is_longer) == 0: # if no audio is longer than 10s, then randomly select one audio to be longer rand_idx = np.random.randint(0, len(input_mel)) is_longer[rand_idx] = True @@ -338,9 +346,10 @@ def __call__( if isinstance(input_mel[0], List): input_mel = [np.asarray(mel, dtype=np.float32) for feature in input_mel] + input_features = {"input_features": input_mel, "is_longer": is_longer} + input_features = BatchFeature(input_features) + if return_tensors is not None: - input_mel = input_mel.convert_to_tensors(return_tensors) - is_longer = is_longer.convert_to_tensors(return_tensors) + input_features = input_features.convert_to_tensors(return_tensors) - input_features = {"input_features": input_mel, "is_longer": is_longer} - return input_features + return input_features \ No newline at end of file diff --git a/src/transformers/models/clap/processing_clap.py b/src/transformers/models/clap/processing_clap.py index 12a0d72dd279..cf6eb4725cf2 100644 --- a/src/transformers/models/clap/processing_clap.py +++ b/src/transformers/models/clap/processing_clap.py @@ -35,7 +35,6 @@ class CLAPProcessor(ProcessorMixin): tokenizer ([`CLAPTokenizerFast`]): The tokenizer is a required input. 
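# Schematic sketch of the processor pattern this class follows: text goes to the tokenizer, audio to
# the feature extractor, and the two encodings are merged into one dictionary. The tiny class and the
# lambda stand-ins below are hypothetical, not the real CLAP tokenizer / feature extractor objects.
class ToyCombinedProcessor:
    def __init__(self, feature_extractor, tokenizer):
        self.feature_extractor = feature_extractor
        self.tokenizer = tokenizer

    def __call__(self, text=None, audios=None, **kwargs):
        if text is None and audios is None:
            raise ValueError("You have to specify either `text` or `audios`.")
        encoding = {}
        if text is not None:
            encoding.update(self.tokenizer(text, **kwargs))
        if audios is not None:
            encoding.update(self.feature_extractor(audios, **kwargs))
        return encoding


processor = ToyCombinedProcessor(
    feature_extractor=lambda audios, **kw: {"input_features": audios},
    tokenizer=lambda text, **kw: {"input_ids": [[0, 1, 2]]},
)
batch = processor(text=["a dog barking"], audios=[[0.0, 0.1, -0.1]])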
""" - attributes = ["feature_extractor", "tokenizer"] feature_extractor_class = "CLAPFeatureExtractor" tokenizer_class = ("CLAPTokenizer", "CLAPTokenizerFast") From a41ff1af00bf90b9c1ad68c87ee99b052e0ddc26 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 2 Feb 2023 13:27:55 +0000 Subject: [PATCH 021/197] fixup --- .../feature_extraction_sequence_utils.py | 14 +++++------ .../models/clap/feature_extraction_clap.py | 18 ++++++--------- src/transformers/models/clap/modeling_clap.py | 23 ++++++++++--------- 3 files changed, 26 insertions(+), 29 deletions(-) diff --git a/src/transformers/feature_extraction_sequence_utils.py b/src/transformers/feature_extraction_sequence_utils.py index 10da21c6f2fe..547c310f252a 100644 --- a/src/transformers/feature_extraction_sequence_utils.py +++ b/src/transformers/feature_extraction_sequence_utils.py @@ -500,15 +500,15 @@ def get_mel_filter_banks( Number of frequencies to highlight/apply f_min (float): Minimum frequency (Hz) - f_max (float): + f_max (float): Maximum frequency (Hz) - n_mels (int): + n_mels (int): Number of mel filterbanks - sample_rate (int): + sample_rate (int): Sample rate of the audio waveform - norm (str or None, optional): + norm (str or None, optional): If "slaney", divide the triangular mel weights by the width of the mel band (area normalization). (Default: ``None``) - mel_scale (str, optional): + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) Returns: @@ -558,8 +558,8 @@ def _stft(self, frames, window): A framed audio signal obtained using `self._fram_wav`. window (`np.array` of dimension `(self.n_freqs, self.n_mels)`: A array reprensenting the function that will be used to reduces the amplitude of the - discontinuities at the boundaries of each frame when computing the FFT. Each frame will - be multiplied by the window. For more information on this phenomena, called *Spectral leakage*, + discontinuities at the boundaries of each frame when computing the FFT. Each frame will + be multiplied by the window. For more information on this phenomena, called *Spectral leakage*, refer to [this tutorial]https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf """ frame_size = frames.shape[1] diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 4957ab41d4d1..fffce5c93efa 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -69,8 +69,8 @@ def __init__( f_max: float = 14000, top_db: int = None, max_length: int = 48000, - truncation:str = "fusion", - padding:str = "repeatpad", + truncation: str = "fusion", + padding: str = "repeatpad", **kwargs ): super().__init__( @@ -110,8 +110,6 @@ def __init__( ) self.top_db = top_db - - def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[np.array] = None) -> np.ndarray: """ Compute the log-Mel spectrogram of the provided audio, gives similar results whisper's original torch @@ -121,8 +119,7 @@ def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[n # TODO why don't we take the last value? 
# window = np.hanning(self.n_fft + 1)[:-1] - - + frames = self._fram_wave(waveform) stft = self._stft(frames, window=window) @@ -173,8 +170,9 @@ def _random_mel_fusion(self, mel, total_frames, chunk_frames): mel_chunk_back = mel[idx_back : idx_back + chunk_frames, :] # shrink the mel TODO add this as a numpy function, also no hard codes `64` - mel_shrink = np.resize(mel, [chunk_frames, 64]) # current flags are probalby wrong + mel_shrink = np.resize(mel, [chunk_frames, 64]) # current flags are probalby wrong import torch + mel_shrink = torchvision.transforms.Resize(size=[chunk_frames, 64])(torch.tensor(mel[None]))[0] # logging.info(f"mel_shrink.shape: {mel_shrink.shape}") @@ -182,9 +180,7 @@ def _random_mel_fusion(self, mel, total_frames, chunk_frames): mel_fusion = np.stack([mel_chunk_front, mel_chunk_middle, mel_chunk_back, mel_shrink], axis=0) return mel_fusion - def _get_audio_features( - self, waveform: np.array, max_length, truncation, padding, pad_to_multiple_of - ) -> np.array: + def _get_audio_features(self, waveform: np.array, max_length, truncation, padding, pad_to_multiple_of) -> np.array: """ Possible cases : - wave > max_length @@ -352,4 +348,4 @@ def __call__( if return_tensors is not None: input_features = input_features.convert_to_tensors(return_tensors) - return input_features \ No newline at end of file + return input_features diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 33e193447037..c9108738849b 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -150,6 +150,7 @@ class CLAPAudioModelOutput(ModelOutput): embedding (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. 
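# Sketch of the output convention used by this model: with `return_dict=False` the forward pass returns
# a plain tuple, otherwise a dataclass with named fields. `ToyAudioOutput` and `toy_forward` are
# stand-ins for illustration, not the real classes in this file.
from dataclasses import dataclass
from typing import Optional

import torch


@dataclass
class ToyAudioOutput:
    framewise_output: Optional[torch.FloatTensor] = None
    clipwise_output: Optional[torch.FloatTensor] = None
    fine_grained_embedding: Optional[torch.FloatTensor] = None
    embedding: Optional[torch.FloatTensor] = None


def toy_forward(return_dict=False):
    tensors = tuple(torch.zeros(2, 4) for _ in range(4))
    return ToyAudioOutput(*tensors) if return_dict else tensors


pooled = toy_forward(return_dict=True).embedding  # attribute access
pooled = toy_forward(return_dict=False)[-1]  # positional access, as in `pooled_output = audio_outputs[-1]`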
""" + framewise_output: torch.FloatTensor = None clipwise_output: torch.FloatTensor = None fine_grained_embedding: torch.FloatTensor = None @@ -315,12 +316,16 @@ def __init__(self, config: CLAPAudioConfig): padding=padding, ) - self.norm = nn.LayerNorm(config.patch_embeds_hidden_size) if config.enable_patch_layer_norm else nn.Identity() if self.enable_patch_fusion: self.fusion_model = CLAPAudioAFFBlock(config) - self.mel_conv2d = nn.Conv2d(config.patch_embed_input_channels, config.patch_embeds_hidden_size, kernel_size=(patch_size[0], patch_size[1]*3), stride=(patch_stride[0], patch_stride[1] * 3), padding=padding) - + self.mel_conv2d = nn.Conv2d( + config.patch_embed_input_channels, + config.patch_embeds_hidden_size, + kernel_size=(patch_size[0], patch_size[1] * 3), + stride=(patch_stride[0], patch_stride[1] * 3), + padding=padding, + ) def forward(self, x, longer_idx=None): if self.enable_fusion: @@ -2219,7 +2224,6 @@ def forward_features(self, x, longer_idx=None): return (fpx, torch.sigmoid(x), fine_grained_latent_output, latent_output) - def crop_wav(self, x, crop_size, spe_pos=None): time_steps = x.shape[2] tx = torch.zeros(x.shape[0], x.shape[1], crop_size, x.shape[3]).to(x.device) @@ -2250,33 +2254,31 @@ def reshape_wav2img(self, x): return x def forward( - self, + self, mel_fusion=None, longer=None, waveform=None, - mixup_lambda=None, + mixup_lambda=None, device=None, return_dict=False, ): # out_feat_keys: List[str] = None): mel_fusion = mel_fusion[None, :].to(0) waveform = waveform[None, :].to(0) - if self.enable_fusion and longer.sum() == 0: # if no audio is longer than 10s, then randomly select one audio to be longer longer[torch.randint(0, longer.shape[0], (1,))] = True - x = mel_fusion.to(device=device, non_blocking=True) x = x.transpose(1, 3) x = self.bn0(x) x = x.transpose(1, 3) - + longer_list_idx = None if self.enable_fusion: longer_list = longer.to(device=device, non_blocking=True) longer_list_idx = torch.where(longer_list)[0] - + if self.training: x = self.spec_augmenter(x) if mixup_lambda is not None: @@ -2296,4 +2298,3 @@ def forward( fine_grained_embedding=fine_grained_embedding, embedding=output_embeddingss, ) - From 275633c421a2026ebe3d47f1cfc99d7f9a36582b Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 2 Feb 2023 13:30:18 +0000 Subject: [PATCH 022/197] fixup --- .../feature_extraction_sequence_utils.py | 27 +++---- src/transformers/models/clap/modeling_clap.py | 75 ++++++++++--------- 2 files changed, 53 insertions(+), 49 deletions(-) diff --git a/src/transformers/feature_extraction_sequence_utils.py b/src/transformers/feature_extraction_sequence_utils.py index 547c310f252a..7bd9cc7a5c20 100644 --- a/src/transformers/feature_extraction_sequence_utils.py +++ b/src/transformers/feature_extraction_sequence_utils.py @@ -474,12 +474,12 @@ def get_mel_filter_banks( mel_scale: str = "htk", ) -> np.array: r""" - Create a frequency bin conversion matrix used to obtain the Mel Frequency Cepstral Coefficient. - This is called a `mel filter bank`, and various implementation exist, which differ in the number of filters, - the shape of the filters, the way the filters are spaced, the bandwidth of the filters, and the manner in which - the spectrum is warped. The goal of these features is to approximate the non-linear human perception of the - variation in pitch with respect to the frequency. This code is heavily inspired from the `torchaudio` - implementation, refer to XXX for more details. 
+ Create a frequency bin conversion matrix used to obtain the Mel Frequency Cepstral Coefficient. This is called + a `mel filter bank`, and various implementation exist, which differ in the number of filters, the shape of the + filters, the way the filters are spaced, the bandwidth of the filters, and the manner in which the spectrum is + warped. The goal of these features is to approximate the non-linear human perception of the variation in pitch + with respect to the frequency. This code is heavily inspired from the `torchaudio` implementation, refer to XXX + for more details. Note: @@ -507,7 +507,8 @@ def get_mel_filter_banks( sample_rate (int): Sample rate of the audio waveform norm (str or None, optional): - If "slaney", divide the triangular mel weights by the width of the mel band (area normalization). (Default: ``None``) + If "slaney", divide the triangular mel weights by the width of the mel band (area normalization). + (Default: ``None``) mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) @@ -550,17 +551,17 @@ def get_mel_filter_banks( def _stft(self, frames, window): """ - Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the - same results as `torch.stft`. + Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same + results as `torch.stft`. Args: frames (`np.array` of dimension `(num_frames, self.n_fft)`): A framed audio signal obtained using `self._fram_wav`. window (`np.array` of dimension `(self.n_freqs, self.n_mels)`: - A array reprensenting the function that will be used to reduces the amplitude of the - discontinuities at the boundaries of each frame when computing the FFT. Each frame will - be multiplied by the window. For more information on this phenomena, called *Spectral leakage*, - refer to [this tutorial]https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf + A array reprensenting the function that will be used to reduces the amplitude of the discontinuities at + the boundaries of each frame when computing the FFT. Each frame will be multiplied by the window. For + more information on this phenomena, called *Spectral leakage*, refer to [this + tutorial]https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf """ frame_size = frames.shape[1] fft_size = self.n_fft diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 5c6851686f2a..0c52b3c1851e 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -13,11 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch CLAP model.""" -import collections import math import random from dataclasses import dataclass -from itertools import repeat from typing import Any, List, Optional, Tuple, Union import numpy as np @@ -27,9 +25,6 @@ from torch import nn from torch.nn.init import _calculate_fan_in_and_fan_out -from torchlibrosa.augmentation import SpecAugmentation -from torchlibrosa.stft import LogmelFilterBank, Spectrogram - from ...activations import ACT2FN from ...modeling_outputs import ( BaseModelOutputWithPastAndCrossAttentions, @@ -197,12 +192,13 @@ def to_tuple(self) -> Tuple[Any]: class CLAPDropPath(nn.Module): """ - Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
- This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, the original name is - misleading as 'Drop Connect' is a different form of dropout in a separate paper... See discussion: + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is the same as the + DropConnect impl I created for EfficientNet, etc networks, however, the original name is misleading as 'Drop + Connect' is a different form of dropout in a separate paper... See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the argument. """ + def __init__(self, drop_prob=None): super(CLAPDropPath, self).__init__() self.drop_prob = drop_prob @@ -210,23 +206,24 @@ def __init__(self, drop_prob=None): def forward(self, hidden_states): if self.drop_prob == 0.0 or not self.training: return hidden_states - + keep_prob = 1 - self.drop_prob - shape = (hidden_states.shape[0],) + (1,) * (hidden_states.ndim - 1) # work with diff dim tensors, not just 2D ConvNets - + shape = (hidden_states.shape[0],) + (1,) * ( + hidden_states.ndim - 1 + ) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=hidden_states.dtype, device=hidden_states.device) random_tensor.floor_() # binarize output = hidden_states.div(keep_prob) * random_tensor return output - # Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/feature_fusion.py#L133 class CLAPAudioAFFBlock(nn.Module): r""" - AFF Block from CLAP, since in CLAP we are always in 2D mode, it is not needed to - implement the 1D version. + AFF Block from CLAP, since in CLAP we are always in 2D mode, it is not needed to implement the 1D version. 
""" + def __init__(self, config: CLAPAudioConfig): super(CLAPAudioAFFBlock, self).__init__() channels = config.patch_embeds_hidden_size @@ -267,8 +264,12 @@ class CLAPAudioPatchEmbed(nn.Module): def __init__(self, config: CLAPAudioConfig): super().__init__() img_size = (config.spec_size, config.spec_size) if isinstance(config.spec_size, int) else config.spec_size - patch_size = (config.patch_size, config.patch_size) if isinstance(config.patch_size, int) else config.patch_size - patch_stride = (config.patch_stride, config.patch_stride) if isinstance(config.patch_stride, int) else config.patch_stride + patch_size = ( + (config.patch_size, config.patch_size) if isinstance(config.patch_size, int) else config.patch_size + ) + patch_stride = ( + (config.patch_stride, config.patch_stride) if isinstance(config.patch_stride, int) else config.patch_stride + ) self.img_size = img_size self.patch_stride = patch_stride @@ -476,7 +477,9 @@ def __init__(self, config, hidden_dim, num_heads): super().__init__() self.hidden_dim = hidden_dim - self.window_size = (config.window_size, config.window_size) if isinstance(config.window_size, int) else config.window_size + self.window_size = ( + (config.window_size, config.window_size) if isinstance(config.window_size, int) else config.window_size + ) self.num_heads = num_heads head_dim = self.hidden_dim // num_heads self.scale = head_dim**-0.5 @@ -1436,7 +1439,6 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.embeddings.word_embeddings = value - # Copied from transformers.models.bert.modeling_bert.BertModel.forward def forward( self, @@ -2124,37 +2126,37 @@ def __init__(self, config: CLAPAudioConfig): # freq_stripes_num=config.spectrogram_freq_stripes_num, # ) - def _forward_features( - self, - hidden_states, - longer_idx=None - ): - _, _, frames_num, _ = hidden_states.shape + def _forward_features(self, hidden_states, longer_idx=None): + _, _, frames_num, _ = hidden_states.shape hidden_states = self.patch_embed(hidden_states, longer_idx=longer_idx) - + if self.use_absolute_pos_embedding: hidden_states = hidden_states + self.absolute_pos_embed - + hidden_states = self.pos_drop(hidden_states) - + for i, layer in enumerate(self.layers): hidden_states, _ = layer(hidden_states) - + hidden_states = self.norm(hidden_states) - + batch_size, _, n_channels = hidden_states.shape freq_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[0] temporal_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[1] - - hidden_states = hidden_states.permute(0, 2, 1).contiguous().reshape(batch_size, n_channels, freq_shape, temporal_shape) + + hidden_states = ( + hidden_states.permute(0, 2, 1).contiguous().reshape(batch_size, n_channels, freq_shape, temporal_shape) + ) batch_size, n_channels, n_frequencies, n_temp = hidden_states.shape # group 2D CNN c_freq_bin = n_frequencies // self.freq_ratio hidden_states = hidden_states.reshape(batch_size, n_channels, n_frequencies // c_freq_bin, c_freq_bin, n_temp) - hidden_states = hidden_states.permute(0, 1, 3, 2, 4).contiguous().reshape(batch_size, n_channels, c_freq_bin, -1) + hidden_states = ( + hidden_states.permute(0, 1, 3, 2, 4).contiguous().reshape(batch_size, n_channels, c_freq_bin, -1) + ) # get latent_output fine_grained_latent_output = torch.mean(hidden_states, dim=2) fine_grained_latent_output = interpolate( @@ -2169,7 +2171,9 @@ def _forward_features( hidden_states = self.tscam_conv(hidden_states) hidden_states = torch.flatten(hidden_states, 2) # B, C, T - 
framewise_output = interpolate(torch.sigmoid(hidden_states).permute(0, 2, 1).contiguous(), 8 * self.patch_stride[1]) + framewise_output = interpolate( + torch.sigmoid(hidden_states).permute(0, 2, 1).contiguous(), 8 * self.patch_stride[1] + ) hidden_states = self.avgpool(hidden_states) hidden_states = torch.flatten(hidden_states, 1) @@ -2222,14 +2226,13 @@ def forward( # if no audio is longer than 10s, then randomly select one audio to be longer longer[torch.randint(0, longer.shape[0], (1,))] = True - # TODO: remove .to(device) mel_fusion = mel_fusion.to(device=device, non_blocking=True) mel_fusion = mel_fusion.transpose(1, 3) hidden_states = self.bn0(mel_fusion) hidden_states = hidden_states.transpose(1, 3) - + longer_list_idx = None if self.enable_fusion: longer_list = longer.to(device=device, non_blocking=True) @@ -2238,7 +2241,7 @@ def forward( if self.training: raise ValueError( "CLAP does not support training since we need to enable `SpectrogramAugmentation`", - " this will be addressed in a future release." + " this will be addressed in a future release.", ) # x = self.spec_augmenter(x) # if mixup_lambda is not None: From 1b3a8201928c5b707c6438ffb18806a3dc6d87e0 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 2 Feb 2023 14:43:44 +0000 Subject: [PATCH 023/197] some clean up --- src/transformers/models/clap/modeling_clap.py | 63 ++++++++++--------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 0c52b3c1851e..24a6a9a95de9 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1732,7 +1732,7 @@ def get_text_features( @add_start_docstrings_to_model_forward(CLAP_VISION_INPUTS_DOCSTRING) def get_audio_features( self, - mel_fusion: Optional[torch.Tensor] = None, + input_features: Optional[torch.Tensor] = None, longer: Optional[torch.Tensor] = None, waveform: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, @@ -1748,7 +1748,7 @@ def get_audio_features( return_dict = return_dict if return_dict is not None else self.config.use_return_dict audio_outputs = self.audio_model( - mel_fusion=mel_fusion, + input_features=input_features, longer=longer, waveform=waveform, return_dict=return_dict, @@ -2180,9 +2180,9 @@ def _forward_features(self, hidden_states, longer_idx=None): return (framewise_output, torch.sigmoid(hidden_states), fine_grained_latent_output, latent_output) - def crop_wav(self, x, crop_size, spe_pos=None): - time_steps = x.shape[2] - tx = torch.zeros(x.shape[0], x.shape[1], crop_size, x.shape[3]).to(x.device) + def crop_wav(self, hidden_states, crop_size, spe_pos=None): + time_steps = hidden_states.shape[2] + tx = torch.zeros(hidden_states.shape[0], hidden_states.shape[1], crop_size, hidden_states.shape[3]).to(hidden_states.device) for i in range(len(x)): if spe_pos is None: crop_pos = random.randint(0, time_steps - crop_size - 1) @@ -2192,50 +2192,51 @@ def crop_wav(self, x, crop_size, spe_pos=None): return tx # Reshape the wavform to a img size, if you want to use the pretrained swin transformer model - def reshape_wav2img(self, x): - _, _, T, F = x.shape + def reshape_wav2img(self, hidden_states): + _, _, time_steps, freq_steps = hidden_states.shape + target_T = int(self.spec_size * self.freq_ratio) target_F = self.spec_size // self.freq_ratio - assert T <= target_T and F <= target_F, "the wav size should less than or equal to the swin input size" + + if time_steps > 
target_T or freq_steps > target_F: + raise ValueError( + "the wav size should less than or equal to the swin input size" + ) + # to avoid bicubic zero error - if T < target_T: - x = nn.functional.interpolate(x, (target_T, x.shape[3]), mode="bicubic", align_corners=True) - if F < target_F: - x = nn.functional.interpolate(x, (x.shape[2], target_F), mode="bicubic", align_corners=True) - x = x.permute(0, 1, 3, 2).contiguous() - x = x.reshape(x.shape[0], x.shape[1], x.shape[2], self.freq_ratio, x.shape[3] // self.freq_ratio) - # print(x.shape) - x = x.permute(0, 1, 3, 2, 4).contiguous() - x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3], x.shape[4]) - return x + if time_steps < target_T: + hidden_states = nn.functional.interpolate(hidden_states, (target_T, hidden_states.shape[3]), mode="bicubic", align_corners=True) + if freq_steps < target_F: + hidden_states = nn.functional.interpolate(hidden_states, (hidden_states.shape[2], target_F), mode="bicubic", align_corners=True) + + # hidden_states = hidden_states.contiguous().view(hidden_states.shape[0], hidden_states.shape[1], hidden_states.shape[-1] * self.freq_ratio, hidden_states.shape[2] // self.freq_ratio) + + hidden_states = hidden_states.permute(0, 1, 3, 2).contiguous() + hidden_states = hidden_states.reshape(hidden_states.shape[0], hidden_states.shape[1], hidden_states.shape[2], self.freq_ratio, hidden_states.shape[3] // self.freq_ratio) + + hidden_states = hidden_states.permute(0, 1, 3, 2, 4).contiguous() + hidden_states = hidden_states.reshape(hidden_states.shape[0], hidden_states.shape[1], hidden_states.shape[2] * hidden_states.shape[3], hidden_states.shape[4]) + + return hidden_states def forward( self, - mel_fusion=None, + input_features=None, longer=None, waveform=None, - mixup_lambda=None, - device=None, return_dict=False, ): - # TODO: remove this - mel_fusion = mel_fusion[None, :].to(0) - waveform = waveform[None, :].to(0) - if self.enable_fusion and longer.sum() == 0: # if no audio is longer than 10s, then randomly select one audio to be longer longer[torch.randint(0, longer.shape[0], (1,))] = True - # TODO: remove .to(device) - mel_fusion = mel_fusion.to(device=device, non_blocking=True) - - mel_fusion = mel_fusion.transpose(1, 3) - hidden_states = self.bn0(mel_fusion) + input_features = input_features.transpose(1, 3) + hidden_states = self.bn0(input_features) hidden_states = hidden_states.transpose(1, 3) longer_list_idx = None if self.enable_fusion: - longer_list = longer.to(device=device, non_blocking=True) + longer_list = longer.to(input_features.device) longer_list_idx = torch.where(longer_list)[0] if self.training: From 8fed2d0d58714e76da821a084acf4febfe190493 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 2 Feb 2023 15:26:36 +0000 Subject: [PATCH 024/197] remove mels form the dictionnary --- .../models/clap/feature_extraction_clap.py | 48 +++++++++++++------ 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index fffce5c93efa..d19cb495b8d7 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -14,7 +14,6 @@ # limitations under the License. """Feature extractor class for CLAP.""" -from typing import List, Optional, Union import numpy as np import torchvision @@ -23,7 +22,9 @@ from ...feature_extraction_utils import BatchFeature from ...utils import TensorType, logging - +from ... 
import __version__ +from typing import Any, Dict, List, Optional, Union +import copy logger = logging.get_logger(__name__) @@ -58,8 +59,8 @@ class CLAPFeatureExtractor(SequenceFeatureExtractor): def __init__( self, feature_size=80, - sampling_rate=16000, - hop_length=160, + sampling_rate=48_000, + hop_length=480, chunk_length=30, n_fft=400, padding_value=0.0, @@ -68,7 +69,7 @@ def __init__( f_min: float = 0, f_max: float = 14000, top_db: int = None, - max_length: int = 48000, + max_length: int = 480_000, truncation: str = "fusion", padding: str = "repeatpad", **kwargs @@ -110,6 +111,22 @@ def __init__( ) self.top_db = top_db + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes this instance to a Python dictionary. + + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. + """ + output = copy.deepcopy(self.__dict__) + output["feature_extractor_type"] = self.__class__.__name__ + if "mel_filters" in output: + del output["mel_filters"] + if "mel_filters_slaney" in output: + del output["mel_filters_slaney"] + return output + def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[np.array] = None) -> np.ndarray: """ Compute the log-Mel spectrogram of the provided audio, gives similar results whisper's original torch @@ -170,10 +187,10 @@ def _random_mel_fusion(self, mel, total_frames, chunk_frames): mel_chunk_back = mel[idx_back : idx_back + chunk_frames, :] # shrink the mel TODO add this as a numpy function, also no hard codes `64` - mel_shrink = np.resize(mel, [chunk_frames, 64]) # current flags are probalby wrong + mel_shrink = np.resize(mel, [chunk_frames, self.feature_size]) # current flags are probalby wrong import torch - mel_shrink = torchvision.transforms.Resize(size=[chunk_frames, 64])(torch.tensor(mel[None]))[0] + mel_shrink = torchvision.transforms.Resize(size=[chunk_frames, self.feature_size])(torch.tensor(mel[None]))[0] # logging.info(f"mel_shrink.shape: {mel_shrink.shape}") # stack @@ -189,11 +206,18 @@ def _get_audio_features(self, waveform: np.array, max_length, truncation, paddin - wave < max_length - repeat - fusion + + TODO the max length should be 10x the sampling rate of the provided audio. """ if waveform.shape[0] > max_length: if truncation == "rand_trunc": longer = True + # random crop to max_length (for compatibility) -> this should be handled by self.pad + overflow = len(waveform) - max_length + idx = np.random.randint(0, overflow + 1) + waveform = waveform[idx : idx + max_length] + input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney) elif truncation == "fusion": mel = self._np_extract_fbank_features(waveform, self.mel_filters) chunk_frames = max_length // self.hop_length + 1 # the +1 related to how the spectrogram is computed @@ -201,18 +225,14 @@ def _get_audio_features(self, waveform: np.array, max_length, truncation, paddin if chunk_frames == total_frames: # there is a corner case where the audio length is larger than max_length but smaller than max_length+hop_length. # In this case, we just use the whole audio. 
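# --- Illustrative sketch (not part of the patch): the idea behind the "fusion"
# --- truncation used just below. When the waveform is longer than `max_length`,
# --- the feature extractor keeps one shrunk "global" view of the full log-mel
# --- spectrogram plus three randomly placed local chunks, stacked as 4 channels.
# --- The helper name, the nearest-neighbour downsample, and the shapes here are
# --- assumptions for the example only, not the exact `_random_mel_fusion` code.
import numpy as np

def sketch_mel_fusion(mel: np.ndarray, chunk_frames: int) -> np.ndarray:
    # mel: (total_frames, n_mels) log-mel spectrogram of the full waveform,
    # with total_frames > chunk_frames (the corner case is handled separately).
    total_frames, n_mels = mel.shape
    third = max(total_frames - chunk_frames, 1) // 3
    starts = [
        np.random.randint(0, third + 1),                                 # front chunk
        np.random.randint(third, 2 * third + 1),                         # middle chunk
        np.random.randint(2 * third, total_frames - chunk_frames + 1),   # back chunk
    ]
    chunks = [mel[s : s + chunk_frames] for s in starts]
    # crude global view: subsample the full mel down to chunk_frames rows
    idx = np.linspace(0, total_frames - 1, chunk_frames).astype(int)
    global_view = mel[idx]
    return np.stack([global_view, *chunks], axis=0)  # (4, chunk_frames, n_mels)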
- input_mel = np.stack([mel, mel, mel, mel], dim=0) + input_mel = np.stack([mel, mel, mel, mel], axis=0) longer = False else: input_mel = self._random_mel_fusion(mel, total_frames, chunk_frames) longer = True - else: raise NotImplementedError(f"data_truncating {truncation} not implemented") - # random crop to max_length (for compatibility) -> this should be handled by self.pad - overflow = len(waveform) - max_length - idx = np.random.randint(0, overflow + 1) - waveform = waveform[idx : idx + max_length] + else: longer = False # only use repeat as a new possible value for padding. you repeat the audio before applying the usual max_length padding @@ -229,7 +249,7 @@ def _get_audio_features(self, waveform: np.array, max_length, truncation, paddin ) if truncation == "fusion": mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney) - input_mel = np.stack([mel, mel, mel, mel], dim=0) + input_mel = np.stack([mel, mel, mel, mel], axis=0) else: input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney) return input_mel, longer From 6b9051c37431c7397102dd042e063897d84fe618 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 2 Feb 2023 16:25:18 +0000 Subject: [PATCH 025/197] pad after the repeat --- .../models/clap/feature_extraction_clap.py | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index d19cb495b8d7..3fbe46e772b0 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -15,16 +15,18 @@ """Feature extractor class for CLAP.""" +import copy +from typing import Any, Dict, List, Optional, Union + import numpy as np import torchvision +from ... import __version__ from ...feature_extraction_sequence_utils import SequenceFeatureExtractor from ...feature_extraction_utils import BatchFeature from ...utils import TensorType, logging -from ... import __version__ -from typing import Any, Dict, List, Optional, Union -import copy + logger = logging.get_logger(__name__) @@ -111,7 +113,6 @@ def __init__( ) self.top_db = top_db - def to_dict(self) -> Dict[str, Any]: """ Serializes this instance to a Python dictionary. @@ -126,7 +127,7 @@ def to_dict(self) -> Dict[str, Any]: if "mel_filters_slaney" in output: del output["mel_filters_slaney"] return output - + def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[np.array] = None) -> np.ndarray: """ Compute the log-Mel spectrogram of the provided audio, gives similar results whisper's original torch @@ -206,7 +207,7 @@ def _get_audio_features(self, waveform: np.array, max_length, truncation, paddin - wave < max_length - repeat - fusion - + TODO the max length should be 10x the sampling rate of the provided audio. 
""" @@ -239,14 +240,14 @@ def _get_audio_features(self, waveform: np.array, max_length, truncation, paddin if waveform.shape[0] < max_length and padding == "repeatpad": # do nothing if equal n_repeat = int(max_length / len(waveform)) waveform = waveform.repeat(n_repeat) - else: - waveform = self.pad( - waveform, - padding=padding, - max_length=max_length if max_length else self.n_samples, - truncation=truncation, - pad_to_multiple_of=pad_to_multiple_of, - ) + + waveform = self.pad( + waveform, + padding=padding, + max_length=max_length if max_length else self.n_samples, + truncation=truncation, + pad_to_multiple_of=pad_to_multiple_of, + ) if truncation == "fusion": mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney) input_mel = np.stack([mel, mel, mel, mel], axis=0) From a468f8ddd7dfb0cf64dacc2a01c51ed4a10e4afa Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 2 Feb 2023 16:29:12 +0000 Subject: [PATCH 026/197] update padding when dsmaller --- .../models/clap/feature_extraction_clap.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 3fbe46e772b0..8d1c217a6490 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -240,19 +240,13 @@ def _get_audio_features(self, waveform: np.array, max_length, truncation, paddin if waveform.shape[0] < max_length and padding == "repeatpad": # do nothing if equal n_repeat = int(max_length / len(waveform)) waveform = waveform.repeat(n_repeat) - - waveform = self.pad( - waveform, - padding=padding, - max_length=max_length if max_length else self.n_samples, - truncation=truncation, - pad_to_multiple_of=pad_to_multiple_of, - ) + + max_length=max_length if max_length else self.n_samples, + waveform = np.pad(waveform,max_length - waveform.shape[0]) + input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney) if truncation == "fusion": - mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney) - input_mel = np.stack([mel, mel, mel, mel], axis=0) - else: - input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney) + input_mel = np.stack([input_mel, input_mel, input_mel, input_mel], axis=0) + return input_mel, longer def __call__( From 844540bbd2b911eafc3ff137ef4d2544004c8c02 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 2 Feb 2023 16:34:01 +0000 Subject: [PATCH 027/197] fix padding --- src/transformers/models/clap/feature_extraction_clap.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 8d1c217a6490..3a4b4c856d77 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -241,8 +241,8 @@ def _get_audio_features(self, waveform: np.array, max_length, truncation, paddin n_repeat = int(max_length / len(waveform)) waveform = waveform.repeat(n_repeat) - max_length=max_length if max_length else self.n_samples, - waveform = np.pad(waveform,max_length - waveform.shape[0]) + max_length = max_length if max_length else self.n_samples, + waveform = np.pad(waveform,(0,max_length - waveform.shape[0])) input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney) if truncation == "fusion": input_mel = np.stack([input_mel, 
input_mel, input_mel, input_mel], axis=0) From aeb340ecd84cb214d0450521fc2e1f66f277d12d Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 2 Feb 2023 16:39:08 +0000 Subject: [PATCH 028/197] style --- .../models/clap/feature_extraction_clap.py | 5 +-- src/transformers/models/clap/modeling_clap.py | 41 +++++++++++++------ 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 3a4b4c856d77..f3211a77ef28 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -241,12 +241,11 @@ def _get_audio_features(self, waveform: np.array, max_length, truncation, paddin n_repeat = int(max_length / len(waveform)) waveform = waveform.repeat(n_repeat) - max_length = max_length if max_length else self.n_samples, - waveform = np.pad(waveform,(0,max_length - waveform.shape[0])) + waveform = np.pad(waveform, (0, max_length - waveform.shape[0])) input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney) if truncation == "fusion": input_mel = np.stack([input_mel, input_mel, input_mel, input_mel], axis=0) - + return input_mel, longer def __call__( diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 24a6a9a95de9..3acb0a744dd1 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -2182,7 +2182,9 @@ def _forward_features(self, hidden_states, longer_idx=None): def crop_wav(self, hidden_states, crop_size, spe_pos=None): time_steps = hidden_states.shape[2] - tx = torch.zeros(hidden_states.shape[0], hidden_states.shape[1], crop_size, hidden_states.shape[3]).to(hidden_states.device) + tx = torch.zeros(hidden_states.shape[0], hidden_states.shape[1], crop_size, hidden_states.shape[3]).to( + hidden_states.device + ) for i in range(len(x)): if spe_pos is None: crop_pos = random.randint(0, time_steps - crop_size - 1) @@ -2197,26 +2199,39 @@ def reshape_wav2img(self, hidden_states): target_T = int(self.spec_size * self.freq_ratio) target_F = self.spec_size // self.freq_ratio - + if time_steps > target_T or freq_steps > target_F: - raise ValueError( - "the wav size should less than or equal to the swin input size" - ) - + raise ValueError("the wav size should less than or equal to the swin input size") + # to avoid bicubic zero error if time_steps < target_T: - hidden_states = nn.functional.interpolate(hidden_states, (target_T, hidden_states.shape[3]), mode="bicubic", align_corners=True) + hidden_states = nn.functional.interpolate( + hidden_states, (target_T, hidden_states.shape[3]), mode="bicubic", align_corners=True + ) if freq_steps < target_F: - hidden_states = nn.functional.interpolate(hidden_states, (hidden_states.shape[2], target_F), mode="bicubic", align_corners=True) - + hidden_states = nn.functional.interpolate( + hidden_states, (hidden_states.shape[2], target_F), mode="bicubic", align_corners=True + ) + # hidden_states = hidden_states.contiguous().view(hidden_states.shape[0], hidden_states.shape[1], hidden_states.shape[-1] * self.freq_ratio, hidden_states.shape[2] // self.freq_ratio) hidden_states = hidden_states.permute(0, 1, 3, 2).contiguous() - hidden_states = hidden_states.reshape(hidden_states.shape[0], hidden_states.shape[1], hidden_states.shape[2], self.freq_ratio, hidden_states.shape[3] // self.freq_ratio) - + hidden_states = hidden_states.reshape( + 
hidden_states.shape[0], + hidden_states.shape[1], + hidden_states.shape[2], + self.freq_ratio, + hidden_states.shape[3] // self.freq_ratio, + ) + hidden_states = hidden_states.permute(0, 1, 3, 2, 4).contiguous() - hidden_states = hidden_states.reshape(hidden_states.shape[0], hidden_states.shape[1], hidden_states.shape[2] * hidden_states.shape[3], hidden_states.shape[4]) - + hidden_states = hidden_states.reshape( + hidden_states.shape[0], + hidden_states.shape[1], + hidden_states.shape[2] * hidden_states.shape[3], + hidden_states.shape[4], + ) + return hidden_states def forward( From 5eaa51749d2392dec34c7ccf952f911c762d2c32 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 2 Feb 2023 17:02:07 +0000 Subject: [PATCH 029/197] use swin patch merging --- src/transformers/models/clap/modeling_clap.py | 73 ++++++++++++------- 1 file changed, 46 insertions(+), 27 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 3acb0a744dd1..2a708357625d 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1985,46 +1985,65 @@ def forward(self, x): if not self.training: attns.append(attn.unsqueeze(0)) if self.downsample is not None: - x = self.downsample(x) + x = self.downsample(x, self.input_resolution) if not self.training: attn = torch.cat(attns, dim=0) attn = torch.mean(attn, dim=0) return x, attn +# Copied from transformers.models.swin.modeling_swin.SwinPatchMerging with Swin->CLAPAudio class CLAPAudioPatchMerging(nn.Module): - def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): + """ + Patch Merging Layer. + Args: + input_resolution (`Tuple[int]`): + Resolution of input feature. + dim (`int`): + Number of input channels. + norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`): + Normalization layer class. + """ + + def __init__(self, input_resolution: Tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None: super().__init__() self.input_resolution = input_resolution self.dim = dim self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) self.norm = norm_layer(4 * dim) - def forward(self, x): - """ - x: B, H*W, C - """ - H, W = self.input_resolution - B, L, C = x.shape - assert L == H * W, "input feature has wrong size" - assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even." 
- - x = x.view(B, H, W, C) - - x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C - x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C - x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C - x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C - x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C - x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C - - x = self.norm(x) - x = self.reduction(x) - - return x - - def extra_repr(self): - return f"input_resolution={self.input_resolution}, dim={self.dim}" + def maybe_pad(self, input_feature, height, width): + should_pad = (height % 2 == 1) or (width % 2 == 1) + if should_pad: + pad_values = (0, 0, 0, width % 2, 0, height % 2) + input_feature = nn.functional.pad(input_feature, pad_values) + + return input_feature + + def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int]) -> torch.Tensor: + height, width = input_dimensions + # `dim` is height * width + batch_size, dim, num_channels = input_feature.shape + + input_feature = input_feature.view(batch_size, height, width, num_channels) + # pad input to be disible by width and height, if needed + input_feature = self.maybe_pad(input_feature, height, width) + # [batch_size, height/2, width/2, num_channels] + input_feature_0 = input_feature[:, 0::2, 0::2, :] + # [batch_size, height/2, width/2, num_channels] + input_feature_1 = input_feature[:, 1::2, 0::2, :] + # [batch_size, height/2, width/2, num_channels] + input_feature_2 = input_feature[:, 0::2, 1::2, :] + # [batch_size, height/2, width/2, num_channels] + input_feature_3 = input_feature[:, 1::2, 1::2, :] + # batch_size height/2 width/2 4*num_channels + input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1) + input_feature = input_feature.view(batch_size, -1, 4 * num_channels) # batch_size height/2*width/2 4*C + + input_feature = self.norm(input_feature) + input_feature = self.reduction(input_feature) + + return input_feature # The Core of HTSAT From 1182287c8b102eecbdb059077f32335ced68bb39 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 2 Feb 2023 17:44:56 +0000 Subject: [PATCH 030/197] use copied from swin --- .../models/clap/configuration_clap.py | 8 +- src/transformers/models/clap/modeling_clap.py | 212 ++++++++++++------ 2 files changed, 153 insertions(+), 67 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 563986196a62..c33985037512 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -243,9 +243,9 @@ def __init__( patch_embeds_hidden_size=96, enable_patch_layer_norm=True, swin_drop_rate=0.0, - swin_attention_drop_rate=0.0, + attention_probs_dropout_prob=0.0, swin_drop_path_rate=0.1, - swin_qkv_bias=True, + qkv_bias=True, swin_norm_before_mlp="ln", swin_mlp_ratio=4.0, swin_use_checkpoint=False, @@ -292,9 +292,9 @@ def __init__( self.patch_embeds_hidden_size = patch_embeds_hidden_size self.enable_patch_layer_norm = enable_patch_layer_norm self.swin_drop_rate = swin_drop_rate - self.swin_attention_drop_rate = swin_attention_drop_rate + self.attention_probs_dropout_prob = attention_probs_dropout_prob self.swin_drop_path_rate = swin_drop_path_rate - self.swin_qkv_bias = swin_qkv_bias + self.qkv_bias = qkv_bias self.swin_norm_before_mlp = swin_norm_before_mlp self.swin_mlp_ratio = swin_mlp_ratio self.swin_use_checkpoint = swin_use_checkpoint diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 
2a708357625d..f20d6e19f230 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch CLAP model.""" +import collections import math import random from dataclasses import dataclass @@ -32,7 +33,7 @@ BaseModelOutputWithPoolingAndCrossAttentions, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import apply_chunking_to_forward +from ...pytorch_utils import apply_chunking_to_forward, meshgrid, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, add_start_docstrings, @@ -472,80 +473,158 @@ def window_reverse(windows, window_size, H, W): return x -class CLAPAudioWindowAttention(nn.Module): - def __init__(self, config, hidden_dim, num_heads): - +# Copied from transformers.models.swin_transformer.modeling_swin_transformer.SwinTransformerLayer with Swin->CLAPAudio +class CLAPAudioSelfAttention(nn.Module): + def __init__(self, config, dim, num_heads, window_size): super().__init__() - self.hidden_dim = hidden_dim + if dim % num_heads != 0: + raise ValueError( + f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})" + ) + + self.num_attention_heads = num_heads + self.attention_head_size = int(dim / num_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size self.window_size = ( - (config.window_size, config.window_size) if isinstance(config.window_size, int) else config.window_size + window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size) ) - self.num_heads = num_heads - head_dim = self.hidden_dim // num_heads - self.scale = head_dim**-0.5 - # define a parameter table of relative position bias self.relative_position_bias_table = nn.Parameter( torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads) - ) # 2*Wh-1 * 2*Ww-1, nH + ) # get pair-wise relative position index for each token inside the window coords_h = torch.arange(self.window_size[0]) coords_w = torch.arange(self.window_size[1]) - coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww - coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww - relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij")) + coords_flatten = torch.flatten(coords, 1) + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] + relative_coords = relative_coords.permute(1, 2, 0).contiguous() + relative_coords[:, :, 0] += self.window_size[0] - 1 relative_coords[:, :, 1] += self.window_size[1] - 1 relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 - relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index = relative_coords.sum(-1) self.register_buffer("relative_position_index", relative_position_index) - self.qkv = nn.Linear(self.hidden_dim, self.hidden_dim * 3, bias=config.swin_qkv_bias) - self.attn_drop = nn.Dropout(config.swin_attention_drop_rate) - self.proj = nn.Linear(self.hidden_dim, self.hidden_dim) - self.proj_drop = nn.Dropout(config.swin_drop_rate) + self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + self.key = 
nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) - trunc_normal_(self.relative_position_bias_table, std=0.02) - self.softmax = nn.Softmax(dim=-1) + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - def forward(self, x, mask=None): - """ - Args: - x: input features with shape of (num_windows*B, N, C) - mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None - """ - B_, N, C = x.shape - qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) - q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + batch_size, dim, num_channels = hidden_states.shape + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) - q = q * self.scale - attn = q @ k.transpose(-2, -1) + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)] + relative_position_bias = relative_position_bias.view( self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 - ) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww - attn = attn + relative_position_bias.unsqueeze(0) - - if mask is not None: - nW = mask.shape[0] - attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) - attn = attn.view(-1, self.num_heads, N, N) - attn = self.softmax(attn) - else: - attn = self.softmax(attn) + ) - attn = self.attn_drop(attn) + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() + attention_scores = attention_scores + relative_position_bias.unsqueeze(0) - x = (attn @ v).transpose(1, 2).reshape(B_, N, C) - x = self.proj(x) - x = self.proj_drop(x) - return x, attn + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in SwinModel forward() function) + mask_shape = attention_mask.shape[0] + attention_scores = attention_scores.view( + batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim + ) + attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0) + attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim) - def extra_repr(self): - return f"dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}" + # Normalize the attention scores to probabilities. 
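# --- Sketch (not part of the patch): how the relative position bias added to the
# --- attention scores above is indexed. For a window of size (Wh, Ww) there are
# --- (2*Wh-1) * (2*Ww-1) possible relative offsets; the index below maps every
# --- token pair inside a window to one row of the learned bias table. The
# --- function name and default window size are assumptions for illustration.
import torch

def sketch_relative_position_index(window_size=(8, 8)) -> torch.Tensor:
    coords_h = torch.arange(window_size[0])
    coords_w = torch.arange(window_size[1])
    coords = torch.stack(torch.meshgrid(coords_h, coords_w, indexing="ij"))  # (2, Wh, Ww)
    coords = torch.flatten(coords, 1)                                        # (2, Wh*Ww)
    relative = coords[:, :, None] - coords[:, None, :]                       # (2, N, N)
    relative = relative.permute(1, 2, 0).contiguous()
    relative[:, :, 0] += window_size[0] - 1   # shift offsets so they start at 0
    relative[:, :, 1] += window_size[1] - 1
    relative[:, :, 0] *= 2 * window_size[1] - 1
    return relative.sum(-1)                   # (N, N) indices into the bias table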
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.swin.modeling_swin.SwinSelfOutput with Swin->CLAPAudio +class CLAPAudioSelfOutput(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(dim, dim) + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +# Copied from transformers.models.swin.modeling_swin.SwinAttention with Swin->CLAPAudio +class CLAPAudioAttention(nn.Module): + def __init__(self, config, dim, num_heads, window_size): + super().__init__() + self.self = CLAPAudioSelfAttention(config, dim, num_heads, window_size) + self.output = CLAPAudioSelfOutput(config, dim) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs class CLAPAudioSwinTransformerBlock(nn.Module): @@ -573,10 +652,11 @@ def __init__( assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" self.norm1 = nn.LayerNorm(self.hidden_dim) - self.attn = CLAPAudioWindowAttention( + self.attn = CLAPAudioAttention( config=config, - hidden_dim=self.hidden_dim, + dim=self.hidden_dim, num_heads=self.num_heads, + window_size=self.window_size, ) self.drop_path = CLAPDropPath(drop_path) if drop_path > 0.0 else nn.Identity() @@ -618,7 +698,7 @@ def __init__( self.register_buffer("attn_mask", attn_mask) - def forward(self, x): + def forward(self, x, output_attentions=False): # pdb.set_trace() H, W = self.input_resolution 
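# --- Sketch (not part of the patch): the window partition / reverse helpers the
# --- shifted-window blocks rely on. A (batch, height, width, channels) feature
# --- map is cut into non-overlapping windows of side `window_size`, attended
# --- over independently, then stitched back together. Function names are
# --- illustrative; shapes follow the standard Swin layout.
import torch

def sketch_window_partition(x: torch.Tensor, window_size: int) -> torch.Tensor:
    batch_size, height, width, channels = x.shape
    x = x.view(batch_size, height // window_size, window_size, width // window_size, window_size, channels)
    # (num_windows * batch_size, window_size, window_size, channels)
    return x.permute(0, 1, 3, 2, 4, 5).reshape(-1, window_size, window_size, channels)

def sketch_window_reverse(windows: torch.Tensor, window_size: int, height: int, width: int) -> torch.Tensor:
    channels = windows.shape[-1]
    x = windows.view(-1, height // window_size, width // window_size, window_size, window_size, channels)
    # back to (batch_size, height, width, channels)
    return x.permute(0, 1, 3, 2, 4, 5).reshape(-1, height, width, channels)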
# print("H: ", H) @@ -642,7 +722,8 @@ def forward(self, x): x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C # W-MSA/SW-MSA - attn_windows, attn = self.attn(x_windows, mask=self.attn_mask) # nW*B, window_size*window_size, C + attention_outputs = self.attn(x_windows, attention_mask=self.attn_mask, output_attentions=output_attentions) # nW*B, window_size*window_size, C + attn_windows = attention_outputs[0] # merge windows attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) @@ -659,6 +740,11 @@ def forward(self, x): x = shortcut + self.drop_path(x) x = x + self.drop_path(self.mlp(self.norm2(x))) + if output_attentions: + attn = attention_outputs[1] + else: + attn = None + return x, attn def extra_repr(self): @@ -1975,18 +2061,18 @@ def __init__(self, config, idx_layer=0, patches_resolution=0): else: self.downsample = None - def forward(self, x): + def forward(self, x, output_attentions=False): attns = [] for blk in self.blocks: if self.use_checkpoint: x = checkpoint.checkpoint(blk, x) else: - x, attn = blk(x) - if not self.training: + x, attn = blk(x, output_attentions=output_attentions) + if not self.training and output_attentions: attns.append(attn.unsqueeze(0)) if self.downsample is not None: x = self.downsample(x, self.input_resolution) - if not self.training: + if not self.training and output_attentions: attn = torch.cat(attns, dim=0) attn = torch.mean(attn, dim=0) return x, attn @@ -2064,10 +2150,10 @@ def __init__(self, config: CLAPAudioConfig): self.num_features = int(self.hidden_size * 2 ** (self.num_layers - 1)) self.drop_rate = config.swin_drop_rate - self.attn_drop_rate = config.swin_attention_drop_rate + self.attn_drop_rate = config.attention_probs_dropout_prob self.drop_path_rate = config.swin_drop_path_rate - self.qkv_bias = config.swin_qkv_bias + self.qkv_bias = config.qkv_bias self.patch_norm = nn.LayerNorm if config.enable_patch_layer_norm else None self.norm_layer = nn.LayerNorm if self.patch_norm else None From 1a4caaca8d30f32babb5c9ff0599b09fe1a6905a Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 2 Feb 2023 18:23:22 +0000 Subject: [PATCH 031/197] processor with any tokenizer --- .../models/clap/feature_extraction_clap.py | 2 +- .../models/clap/processing_clap.py | 42 ++++--------------- 2 files changed, 8 insertions(+), 36 deletions(-) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index f3211a77ef28..688fb49d9d88 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -241,7 +241,7 @@ def _get_audio_features(self, waveform: np.array, max_length, truncation, paddin n_repeat = int(max_length / len(waveform)) waveform = waveform.repeat(n_repeat) - waveform = np.pad(waveform, (0, max_length - waveform.shape[0])) + waveform = np.pad(waveform, (0, max_length - waveform.shape[0]), mode="constant", constant_values=0) input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney) if truncation == "fusion": input_mel = np.stack([input_mel, input_mel, input_mel, input_mel], axis=0) diff --git a/src/transformers/models/clap/processing_clap.py b/src/transformers/models/clap/processing_clap.py index cf6eb4725cf2..b1984acb1863 100644 --- a/src/transformers/models/clap/processing_clap.py +++ b/src/transformers/models/clap/processing_clap.py @@ -36,23 +36,9 @@ class CLAPProcessor(ProcessorMixin): The tokenizer is a required 
input. """ feature_extractor_class = "CLAPFeatureExtractor" - tokenizer_class = ("CLAPTokenizer", "CLAPTokenizerFast") - - def __init__(self, feature_extractor=None, tokenizer=None, **kwargs): - if "feature_extractor" in kwargs: - warnings.warn( - "The `feature_extractor` argument is deprecated and will be removed in v5, use `feature_extractor`" - " instead.", - FutureWarning, - ) - feature_extractor = kwargs.pop("feature_extractor") - - feature_extractor = feature_extractor if feature_extractor is not None else feature_extractor - if feature_extractor is None: - raise ValueError("You need to specify an `feature_extractor`.") - if tokenizer is None: - raise ValueError("You need to specify a `tokenizer`.") + tokenizer_class = ("PreTrainedTokenizer", "PreTrainedTokenizerFast") + def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) def __call__(self, text=None, audios=None, return_tensors=None, **kwargs): @@ -90,6 +76,7 @@ def __call__(self, text=None, audios=None, return_tensors=None, **kwargs): `None`). - **audio_features** -- Audio features to be fed to a model. Returned when `audios` is not `None`. """ + sampling_rate = kwargs.pop("sampling_rate", None) if text is None and audios is None: raise ValueError("You have to specify either text or audios. Both cannot be none.") @@ -98,10 +85,12 @@ def __call__(self, text=None, audios=None, return_tensors=None, **kwargs): encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) if audios is not None: - audio_features = self.feature_extractor(audios, return_tensors=return_tensors, **kwargs) + audio_features = self.feature_extractor( + audios, sampling_rate=sampling_rate, return_tensors=return_tensors, **kwargs + ) if text is not None and audios is not None: - encoding["audio_features"] = audio_features.pixel_values + encoding["audio_features"] = audio_features.input_features return encoding elif text is not None: return encoding @@ -127,20 +116,3 @@ def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names feature_extractor_input_names = self.feature_extractor.model_input_names return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names)) - - @property - def feature_extractor_class(self): - warnings.warn( - "`feature_extractor_class` is deprecated and will be removed in v5. Use `feature_extractor_class`" - " instead.", - FutureWarning, - ) - return self.feature_extractor_class - - @property - def feature_extractor(self): - warnings.warn( - "`feature_extractor` is deprecated and will be removed in v5. 
Use `feature_extractor` instead.", - FutureWarning, - ) - return self.feature_extractor From e94c9dd9b0367d1295dd90836e68e66aea2af8b6 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 2 Feb 2023 18:33:49 +0000 Subject: [PATCH 032/197] more copied from --- .../models/clap/configuration_clap.py | 14 +- src/transformers/models/clap/modeling_clap.py | 461 ++++++++++++------ 2 files changed, 315 insertions(+), 160 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index c33985037512..74b057b424df 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -225,11 +225,12 @@ def __init__( mel_bins=64, clip_samples=480000, spec_size=256, - hidden_act="relu", + hidden_act="gelu", patch_size=4, patch_stride=(4, 4), num_classes=527, hidden_size=96, + embed_dim=96, projection_hidden_size=768, depths=[2, 2, 6, 2], num_heads=[4, 8, 16, 32], @@ -242,12 +243,12 @@ def __init__( flatten_patch_embeds=True, patch_embeds_hidden_size=96, enable_patch_layer_norm=True, - swin_drop_rate=0.0, + drop_path_rate=0.0, attention_probs_dropout_prob=0.0, swin_drop_path_rate=0.1, qkv_bias=True, swin_norm_before_mlp="ln", - swin_mlp_ratio=4.0, + mlp_ratio=4.0, swin_use_checkpoint=False, swin_absolute_positional_embedding=False, swin_hidden_act="gelu", @@ -262,6 +263,7 @@ def __init__( spectrogram_time_stripes_num=2, spectrogram_freq_drop_width=8, spectrogram_freq_stripes_num=2, + layer_norm_eps=1e-5, **kwargs ): super().__init__(**kwargs) @@ -278,6 +280,7 @@ def __init__( self.patch_stride = patch_stride self.num_classes = num_classes self.hidden_size = hidden_size + self.embed_dim = embed_dim self.depths = depths self.num_heads = num_heads self.window_size = window_size @@ -291,12 +294,12 @@ def __init__( self.flatten_patch_embeds = flatten_patch_embeds self.patch_embeds_hidden_size = patch_embeds_hidden_size self.enable_patch_layer_norm = enable_patch_layer_norm - self.swin_drop_rate = swin_drop_rate + self.drop_path_rate = drop_path_rate self.attention_probs_dropout_prob = attention_probs_dropout_prob self.swin_drop_path_rate = swin_drop_path_rate self.qkv_bias = qkv_bias self.swin_norm_before_mlp = swin_norm_before_mlp - self.swin_mlp_ratio = swin_mlp_ratio + self.mlp_ratio = mlp_ratio self.swin_use_checkpoint = swin_use_checkpoint self.swin_absolute_positional_embedding = swin_absolute_positional_embedding self.patch_embed_input_channels = patch_embed_input_channels @@ -315,6 +318,7 @@ def __init__( self.spectrogram_time_stripes_num = spectrogram_time_stripes_num self.spectrogram_freq_drop_width = spectrogram_freq_drop_width self.spectrogram_freq_stripes_num = spectrogram_freq_stripes_num + self.layer_norm_eps = layer_norm_eps @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index f20d6e19f230..c936de8030d2 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -348,28 +348,6 @@ def forward(self, x, longer_idx=None): return x -class CLAPAudioMLP(nn.Module): - """MLP as used in Vision Transformer, MLP-Mixer and related networks""" - - def __init__(self, in_features, hidden_features=None, out_features=None, config=None): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or 
in_features - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = ACT2FN[config.swin_hidden_act] - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop = nn.Dropout(config.swin_drop_rate) - - def forward(self, hidden_states): - hidden_states = self.fc1(hidden_states) - hidden_states = self.act(hidden_states) - hidden_states = self.drop(hidden_states) - - hidden_states = self.fc2(hidden_states) - hidden_states = self.drop(hidden_states) - return hidden_states - - def _no_grad_trunc_normal_(tensor, mean, std, a, b): # Cut & paste from PyTorch official master until it's in a few official releases - RW # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf @@ -626,132 +604,289 @@ def forward( outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs + +# Copied from transformers.models.swin.modeling_swin.SwinIntermediate with Swin->CLAPAudio +class CLAPAudioIntermediate(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(dim, int(config.mlp_ratio * dim)) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + -class CLAPAudioSwinTransformerBlock(nn.Module): - def __init__( - self, - config, - input_resolution, - shift_size=0, - drop_path=0.0, - idx_layer=0, - ): +# Copied from transformers.models.swin.modeling_swin.SwinOutput with Swin->CLAPAudio +class CLAPAudioOutput(nn.Module): + def __init__(self, config, dim): super().__init__() - self.hidden_dim = config.hidden_size * 2**idx_layer - self.input_resolution = input_resolution - self.num_heads = config.num_heads[idx_layer] + self.dense = nn.Linear(int(config.mlp_ratio * dim), dim) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + + +class SwinLayer(nn.Module): + def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward self.shift_size = shift_size self.window_size = config.window_size - self.mlp_ratio = config.swin_mlp_ratio - self.norm_before_mlp = config.swin_norm_before_mlp - - if min(self.input_resolution) <= self.window_size: + self.input_resolution = input_resolution + self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.attention = CLAPAudioAttention(config, dim, num_heads, window_size=self.window_size) + self.drop_path = CLAPDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.intermediate = CLAPAudioIntermediate(config, dim) + self.output = CLAPAudioOutput(config, dim) + + def set_shift_and_window_size(self, input_resolution): + if min(input_resolution) <= self.window_size: # if window size is larger than input resolution, we don't partition windows self.shift_size = 0 - self.window_size = min(self.input_resolution) - assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" - - self.norm1 = nn.LayerNorm(self.hidden_dim) - self.attn = CLAPAudioAttention( - config=config, 
- dim=self.hidden_dim, - num_heads=self.num_heads, - window_size=self.window_size, - ) - - self.drop_path = CLAPDropPath(drop_path) if drop_path > 0.0 else nn.Identity() - if self.norm_before_mlp == "ln": - self.norm2 = nn.LayerNorm(self.hidden_dim) - elif self.norm_before_mlp == "bn": - self.norm2 = lambda x: nn.BatchNorm1d(self.hidden_dim)(x.transpose(1, 2)).transpose(1, 2) - else: - raise NotImplementedError - mlp_hidden_dim = int(self.hidden_dim * self.mlp_ratio) - self.mlp = CLAPAudioMLP(in_features=self.hidden_dim, hidden_features=mlp_hidden_dim, config=config) + self.window_size = min(input_resolution) + def get_attn_mask(self, height, width, dtype): if self.shift_size > 0: # calculate attention mask for SW-MSA - H, W = self.input_resolution - img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1 - h_slices = ( + img_mask = torch.zeros((1, height, width, 1), dtype=dtype) + height_slices = ( slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None), ) - w_slices = ( + width_slices = ( slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None), ) - cnt = 0 - for h in h_slices: - for w in w_slices: - img_mask[:, h, w, :] = cnt - cnt += 1 + count = 0 + for height_slice in height_slices: + for width_slice in width_slices: + img_mask[:, height_slice, width_slice, :] = count + count += 1 - mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = window_partition(img_mask, self.window_size) mask_windows = mask_windows.view(-1, self.window_size * self.window_size) attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) else: attn_mask = None + return attn_mask - self.register_buffer("attn_mask", attn_mask) + def maybe_pad(self, hidden_states, height, width): + pad_right = (self.window_size - width % self.window_size) % self.window_size + pad_bottom = (self.window_size - height % self.window_size) % self.window_size + pad_values = (0, 0, 0, pad_right, 0, pad_bottom) + hidden_states = nn.functional.pad(hidden_states, pad_values) + return hidden_states, pad_values - def forward(self, x, output_attentions=False): - # pdb.set_trace() - H, W = self.input_resolution - # print("H: ", H) - # print("W: ", W) - # pdb.set_trace() - B, L, C = x.shape - # assert L == H * W, "input feature has wrong size" + def forward( + self, + hidden_states: torch.Tensor, + input_dimensions: Tuple[int, int], + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + always_partition: Optional[bool] = False, + ) -> Tuple[torch.Tensor, torch.Tensor]: + if not always_partition: + self.set_shift_and_window_size(input_dimensions) + else: + pass + height, width = input_dimensions + batch_size, _, channels = hidden_states.size() + shortcut = hidden_states + + hidden_states = self.layernorm_before(hidden_states) + + hidden_states = hidden_states.view(batch_size, height, width, channels) - shortcut = x - x = self.norm1(x) - x = x.view(B, H, W, C) + # pad hidden_states to multiples of window size + hidden_states, pad_values = self.maybe_pad(hidden_states, height, width) + _, height_pad, width_pad, _ = hidden_states.shape # cyclic shift if self.shift_size > 0: - shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 
2)) else: - shifted_x = x + shifted_hidden_states = hidden_states # partition windows - x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C - x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C + hidden_states_windows = window_partition(shifted_hidden_states, self.window_size) + hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels) + attn_mask = self.get_attn_mask(height_pad, width_pad, dtype=hidden_states.dtype) + if attn_mask is not None: + attn_mask = attn_mask.to(hidden_states_windows.device) + + attention_outputs = self.attention( + hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions + ) - # W-MSA/SW-MSA - attention_outputs = self.attn(x_windows, attention_mask=self.attn_mask, output_attentions=output_attentions) # nW*B, window_size*window_size, C - attn_windows = attention_outputs[0] + attention_output = attention_outputs[0] - # merge windows - attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) - shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C + attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels) + shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad) # reverse cyclic shift if self.shift_size > 0: - x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) - else: - x = shifted_x - x = x.view(B, H * W, C) - - # FFN - x = shortcut + self.drop_path(x) - x = x + self.drop_path(self.mlp(self.norm2(x))) - - if output_attentions: - attn = attention_outputs[1] + attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) else: - attn = None - - return x, attn - - def extra_repr(self): - return ( - f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " - f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}" - ) + attention_windows = shifted_windows + + was_padded = pad_values[3] > 0 or pad_values[5] > 0 + if was_padded: + attention_windows = attention_windows[:, :height, :width, :].contiguous() + + attention_windows = attention_windows.view(batch_size, height * width, channels) + + hidden_states = shortcut + self.drop_path(attention_windows) + + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + layer_output = hidden_states + self.output(layer_output) + + layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,) + return layer_outputs + +# class CLAPAudioSwinTransformerBlock(nn.Module): +# def __init__( +# self, +# config, +# input_resolution, +# shift_size=0, +# drop_path=0.0, +# idx_layer=0, +# ): +# super().__init__() +# self.hidden_dim = config.hidden_size * 2**idx_layer +# self.input_resolution = input_resolution +# self.num_heads = config.num_heads[idx_layer] +# self.shift_size = shift_size +# self.window_size = config.window_size +# self.mlp_ratio = config.mlp_ratio +# self.norm_before_mlp = config.swin_norm_before_mlp + +# if min(self.input_resolution) <= self.window_size: +# # if window size is larger than input resolution, we don't partition windows +# self.shift_size = 0 +# self.window_size = min(self.input_resolution) +# assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + +# self.norm1 = nn.LayerNorm(self.hidden_dim) +# 
self.attn = CLAPAudioAttention( +# config=config, +# dim=self.hidden_dim, +# num_heads=self.num_heads, +# window_size=self.window_size, +# ) + +# self.drop_path = CLAPDropPath(drop_path) if drop_path > 0.0 else nn.Identity() +# if self.norm_before_mlp == "ln": +# self.norm2 = nn.LayerNorm(self.hidden_dim) +# elif self.norm_before_mlp == "bn": +# self.norm2 = lambda x: nn.BatchNorm1d(self.hidden_dim)(x.transpose(1, 2)).transpose(1, 2) +# else: +# raise NotImplementedError +# mlp_hidden_dim = int(self.hidden_dim * self.mlp_ratio) +# # self.mlp = CLAPAudioMLP(in_features=self.hidden_dim, hidden_features=mlp_hidden_dim, config=config) +# self.intermediate = CLAPAudioIntermediate(config, self.hidden_dim) +# self.output = CLAPAudioOutput(config, self.hidden_dim) + +# if self.shift_size > 0: +# # calculate attention mask for SW-MSA +# H, W = self.input_resolution +# img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1 +# h_slices = ( +# slice(0, -self.window_size), +# slice(-self.window_size, -self.shift_size), +# slice(-self.shift_size, None), +# ) +# w_slices = ( +# slice(0, -self.window_size), +# slice(-self.window_size, -self.shift_size), +# slice(-self.shift_size, None), +# ) +# cnt = 0 +# for h in h_slices: +# for w in w_slices: +# img_mask[:, h, w, :] = cnt +# cnt += 1 + +# mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 +# mask_windows = mask_windows.view(-1, self.window_size * self.window_size) +# attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) +# attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) +# else: +# attn_mask = None + +# self.register_buffer("attn_mask", attn_mask) + +# def forward(self, x, output_attentions=False): +# # pdb.set_trace() +# H, W = self.input_resolution +# # print("H: ", H) +# # print("W: ", W) +# # pdb.set_trace() +# B, L, C = x.shape +# # assert L == H * W, "input feature has wrong size" + +# shortcut = x +# x = self.norm1(x) +# x = x.view(B, H, W, C) + +# # cyclic shift +# if self.shift_size > 0: +# shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) +# else: +# shifted_x = x + +# # partition windows +# x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C +# x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C + +# # W-MSA/SW-MSA +# attention_outputs = self.attn(x_windows, attention_mask=self.attn_mask, output_attentions=output_attentions) # nW*B, window_size*window_size, C +# attn_windows = attention_outputs[0] + +# # merge windows +# attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) +# shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C + +# # reverse cyclic shift +# if self.shift_size > 0: +# x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) +# else: +# x = shifted_x +# x = x.view(B, H * W, C) + +# # FFN +# x = shortcut + self.drop_path(x) +# layer_output = self.norm2(x) + +# layer_output = self.intermediate(layer_output) +# layer_output = x + self.output(layer_output) + +# if output_attentions: +# attn = attention_outputs[1] +# else: +# attn = None + +# return layer_output, attn + +# def extra_repr(self): +# return ( +# f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " +# f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}" +# ) # contrastive loss function, 
adapted from @@ -2018,38 +2153,22 @@ def forward( ) + +# Copied from transformers.models.swin.modeling_swin with Swin->CLAPAudio class CLAPAudioLayer(nn.Module): - def __init__(self, config, idx_layer=0, patches_resolution=0): + def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample): super().__init__() - - hidden_dim = config.hidden_size * 2**idx_layer - input_resolution = (patches_resolution[0] // (2**idx_layer), patches_resolution[1] // (2**idx_layer)) - depth = config.depths[idx_layer] - window_size = config.window_size - norm_layer = nn.LayerNorm if config.enable_patch_layer_norm else None - - use_checkpoint = config.swin_use_checkpoint - downsample = CLAPAudioPatchMerging if (idx_layer < len(config.depths) - 1) else None - - dpr = [ - x.item() for x in torch.linspace(0, config.swin_drop_path_rate, sum(config.depths)) - ] # stochastic depth decay rule - drop_path = dpr[sum(config.depths[:idx_layer]) : sum(config.depths[: idx_layer + 1])] - - # self.dim = dim + self.config = config + self.dim = dim self.input_resolution = input_resolution - self.depth = depth - self.use_checkpoint = use_checkpoint - - # build blocks self.blocks = nn.ModuleList( [ - CLAPAudioSwinTransformerBlock( - config, + SwinLayer( + config=config, + dim=dim, input_resolution=input_resolution, - shift_size=0 if (i % 2 == 0) else window_size // 2, - drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, - idx_layer=idx_layer, + num_heads=num_heads, + shift_size=0 if (i % 2 == 0) else config.window_size // 2, ) for i in range(depth) ] @@ -2057,25 +2176,44 @@ def __init__(self, config, idx_layer=0, patches_resolution=0): # patch merging layer if downsample is not None: - self.downsample = downsample(input_resolution, dim=hidden_dim, norm_layer=norm_layer) + self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm) else: self.downsample = None - def forward(self, x, output_attentions=False): - attns = [] - for blk in self.blocks: - if self.use_checkpoint: - x = checkpoint.checkpoint(blk, x) - else: - x, attn = blk(x, output_attentions=output_attentions) - if not self.training and output_attentions: - attns.append(attn.unsqueeze(0)) + self.pointing = False + + def forward( + self, + hidden_states: torch.Tensor, + input_dimensions: Tuple[int, int], + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + always_partition: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + height, width = input_dimensions + for i, layer_module in enumerate(self.blocks): + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition + ) + + hidden_states = layer_outputs[0] + + hidden_states_before_downsampling = hidden_states if self.downsample is not None: - x = self.downsample(x, self.input_resolution) - if not self.training and output_attentions: - attn = torch.cat(attns, dim=0) - attn = torch.mean(attn, dim=0) - return x, attn + height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2 + output_dimensions = (height, width, height_downsampled, width_downsampled) + hidden_states = self.downsample(hidden_states_before_downsampling, input_dimensions) + else: + output_dimensions = (height, width, height, width) + + stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions) + + if output_attentions: + stage_outputs += layer_outputs[1:] + return stage_outputs 
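The refactored CLAPAudioLayer.forward above chains the shifted-window pieces together: an optional cyclic shift, window_partition, windowed attention under an additive mask, window_reverse, and finally patch merging. The standalone sketch below is illustrative only and is not part of the patch; it re-implements window_partition / window_reverse in their usual Swin form and builds the SW-MSA mask the same way as the commented-out CLAPAudioSwinTransformerBlock shown earlier (label regions after the shift, then add -100.0 between positions from different regions), so the partition/reverse round-trip and the mask shape can be checked in isolation.

# Illustrative sketch only -- not part of this patch. Assumes the standard Swin
# definitions of window_partition / window_reverse; names here are local to the
# sketch and independent of modeling_clap.py.
import torch


def window_partition(x, window_size):
    # (batch, height, width, channels) -> (num_windows * batch, window_size, window_size, channels)
    batch_size, height, width, channels = x.shape
    x = x.view(batch_size, height // window_size, window_size, width // window_size, window_size, channels)
    return x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, channels)


def window_reverse(windows, window_size, height, width):
    # inverse of window_partition
    batch_size = windows.shape[0] // (height * width // window_size // window_size)
    x = windows.view(batch_size, height // window_size, width // window_size, window_size, window_size, -1)
    return x.permute(0, 1, 3, 2, 4, 5).contiguous().view(batch_size, height, width, -1)


def build_shifted_window_mask(height, width, window_size, shift_size):
    # Label each position with the region it belongs to after the cyclic shift,
    # then forbid attention (additive -100.0) between positions from different regions.
    img_mask = torch.zeros((1, height, width, 1))
    slices = (slice(0, -window_size), slice(-window_size, -shift_size), slice(-shift_size, None))
    count = 0
    for h in slices:
        for w in slices:
            img_mask[:, h, w, :] = count
            count += 1
    mask_windows = window_partition(img_mask, window_size).view(-1, window_size * window_size)
    attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
    return attn_mask.masked_fill(attn_mask != 0, -100.0).masked_fill(attn_mask == 0, 0.0)


if __name__ == "__main__":
    hidden = torch.randn(2, 8, 8, 96)           # (batch, height, width, channels)
    windows = window_partition(hidden, 4)        # (2 batches * 4 windows, 4, 4, 96)
    restored = window_reverse(windows, 4, 8, 8)
    print(torch.equal(hidden, restored))         # True: partition/reverse is a lossless round-trip
    mask = build_shifted_window_mask(8, 8, window_size=4, shift_size=2)
    print(mask.shape)                            # torch.Size([4, 16, 16]), values in {0.0, -100.0}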
# Copied from transformers.models.swin.modeling_swin.SwinPatchMerging with Swin->CLAPAudio @@ -2149,7 +2287,7 @@ def __init__(self, config: CLAPAudioConfig): self.num_layers = len(self.depths) self.num_features = int(self.hidden_size * 2 ** (self.num_layers - 1)) - self.drop_rate = config.swin_drop_rate + self.drop_rate = config.drop_path_rate self.attn_drop_rate = config.attention_probs_dropout_prob self.drop_path_rate = config.swin_drop_path_rate @@ -2158,7 +2296,7 @@ def __init__(self, config: CLAPAudioConfig): self.patch_norm = nn.LayerNorm if config.enable_patch_layer_norm else None self.norm_layer = nn.LayerNorm if self.patch_norm else None self.norm_before_mlp = config.swin_norm_before_mlp - self.mlp_ratio = config.swin_mlp_ratio + self.mlp_ratio = config.mlp_ratio self.use_checkpoint = config.swin_use_checkpoint @@ -2186,10 +2324,23 @@ def __init__(self, config: CLAPAudioConfig): self.pos_drop = nn.Dropout(p=self.drop_rate) + dpr = [ + x.item() for x in torch.linspace(0, config.swin_drop_path_rate, sum(config.depths)) + ] + # build layers self.layers = nn.ModuleList() for i_layer in range(self.num_layers): - layer = CLAPAudioLayer(config=config, patches_resolution=patches_resolution, idx_layer=i_layer) + layer = CLAPAudioLayer( + config=config, + dim=int(config.embed_dim * 2**i_layer), + input_resolution=(patches_resolution[0] // (2**i_layer), patches_resolution[1] // (2**i_layer)), + depth=config.depths[i_layer], + num_heads=config.num_heads[i_layer], + drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], + downsample=CLAPAudioPatchMerging if (i_layer < self.num_layers - 1) else None, + ) + self.layers.append(layer) self.norm = self.norm_layer(self.num_features) @@ -2242,7 +2393,7 @@ def _forward_features(self, hidden_states, longer_idx=None): hidden_states = self.pos_drop(hidden_states) for i, layer in enumerate(self.layers): - hidden_states, _ = layer(hidden_states) + hidden_states, _, _ = layer(hidden_states, layer.input_resolution) hidden_states = self.norm(hidden_states) From e9ff994d85cd82722996b3871544c16f20921ce0 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 2 Feb 2023 18:40:00 +0000 Subject: [PATCH 033/197] some clean up --- src/transformers/models/clap/modeling_clap.py | 231 +----------------- 1 file changed, 3 insertions(+), 228 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index c936de8030d2..00ab9aaaad87 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -394,33 +394,6 @@ def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): return _no_grad_trunc_normal_(tensor, mean, std, a, b) -def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"): - fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) - if mode == "fan_in": - denom = fan_in - elif mode == "fan_out": - denom = fan_out - elif mode == "fan_avg": - denom = (fan_in + fan_out) / 2 - - variance = scale / denom - - if distribution == "truncated_normal": - # constant is stddev of standard normal truncated to (-2, 2) - trunc_normal_(tensor, std=math.sqrt(variance) / 0.87962566103423978) - elif distribution == "normal": - tensor.normal_(std=math.sqrt(variance)) - elif distribution == "uniform": - bound = math.sqrt(3 * variance) - tensor.uniform_(-bound, bound) - else: - raise ValueError(f"invalid distribution {distribution}") - - -def lecun_normal_(tensor): - variance_scaling_(tensor, mode="fan_in", 
distribution="truncated_normal") - - def window_partition(x, window_size): """ Args: @@ -634,8 +607,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states - -class SwinLayer(nn.Module): +# Copied from transformers.models.swin.modeling_swin.SwinLayer with Swin->CLAPAudio +class CLAPAudioSwinLayer(nn.Module): def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward @@ -757,137 +730,6 @@ def forward( layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,) return layer_outputs -# class CLAPAudioSwinTransformerBlock(nn.Module): -# def __init__( -# self, -# config, -# input_resolution, -# shift_size=0, -# drop_path=0.0, -# idx_layer=0, -# ): -# super().__init__() -# self.hidden_dim = config.hidden_size * 2**idx_layer -# self.input_resolution = input_resolution -# self.num_heads = config.num_heads[idx_layer] -# self.shift_size = shift_size -# self.window_size = config.window_size -# self.mlp_ratio = config.mlp_ratio -# self.norm_before_mlp = config.swin_norm_before_mlp - -# if min(self.input_resolution) <= self.window_size: -# # if window size is larger than input resolution, we don't partition windows -# self.shift_size = 0 -# self.window_size = min(self.input_resolution) -# assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" - -# self.norm1 = nn.LayerNorm(self.hidden_dim) -# self.attn = CLAPAudioAttention( -# config=config, -# dim=self.hidden_dim, -# num_heads=self.num_heads, -# window_size=self.window_size, -# ) - -# self.drop_path = CLAPDropPath(drop_path) if drop_path > 0.0 else nn.Identity() -# if self.norm_before_mlp == "ln": -# self.norm2 = nn.LayerNorm(self.hidden_dim) -# elif self.norm_before_mlp == "bn": -# self.norm2 = lambda x: nn.BatchNorm1d(self.hidden_dim)(x.transpose(1, 2)).transpose(1, 2) -# else: -# raise NotImplementedError -# mlp_hidden_dim = int(self.hidden_dim * self.mlp_ratio) -# # self.mlp = CLAPAudioMLP(in_features=self.hidden_dim, hidden_features=mlp_hidden_dim, config=config) -# self.intermediate = CLAPAudioIntermediate(config, self.hidden_dim) -# self.output = CLAPAudioOutput(config, self.hidden_dim) - -# if self.shift_size > 0: -# # calculate attention mask for SW-MSA -# H, W = self.input_resolution -# img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1 -# h_slices = ( -# slice(0, -self.window_size), -# slice(-self.window_size, -self.shift_size), -# slice(-self.shift_size, None), -# ) -# w_slices = ( -# slice(0, -self.window_size), -# slice(-self.window_size, -self.shift_size), -# slice(-self.shift_size, None), -# ) -# cnt = 0 -# for h in h_slices: -# for w in w_slices: -# img_mask[:, h, w, :] = cnt -# cnt += 1 - -# mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 -# mask_windows = mask_windows.view(-1, self.window_size * self.window_size) -# attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) -# attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) -# else: -# attn_mask = None - -# self.register_buffer("attn_mask", attn_mask) - -# def forward(self, x, output_attentions=False): -# # pdb.set_trace() -# H, W = self.input_resolution -# # print("H: ", H) -# # print("W: ", W) -# # pdb.set_trace() -# B, L, C = x.shape -# # assert L == H * W, "input feature has wrong size" - -# shortcut = x -# x = self.norm1(x) -# x = x.view(B, H, W, C) - -# # cyclic shift -# if 
self.shift_size > 0: -# shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) -# else: -# shifted_x = x - -# # partition windows -# x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C -# x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C - -# # W-MSA/SW-MSA -# attention_outputs = self.attn(x_windows, attention_mask=self.attn_mask, output_attentions=output_attentions) # nW*B, window_size*window_size, C -# attn_windows = attention_outputs[0] - -# # merge windows -# attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) -# shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C - -# # reverse cyclic shift -# if self.shift_size > 0: -# x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) -# else: -# x = shifted_x -# x = x.view(B, H * W, C) - -# # FFN -# x = shortcut + self.drop_path(x) -# layer_output = self.norm2(x) - -# layer_output = self.intermediate(layer_output) -# layer_output = x + self.output(layer_output) - -# if output_attentions: -# attn = attention_outputs[1] -# else: -# attn = None - -# return layer_output, attn - -# def extra_repr(self): -# return ( -# f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " -# f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}" -# ) - # contrastive loss function, adapted from # https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/Clip.html @@ -1805,61 +1647,6 @@ class CLAPPreTrainedModel(PreTrainedModel): def _init_weights(self, module): pass - # """Initialize the weights""" - # factor = self.config.initializer_factor - # if isinstance(module, CLAPTextEmbeddings): - # module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) - # module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) - # elif isinstance(module, CLAPVisionEmbeddings): - # factor = self.config.initializer_factor - # nn.init.normal_(module.class_embedding, mean=0.0, std=module.hidden_size**-0.5 * factor) - # nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) - # nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor) - # elif isinstance(module, CLAPAttention): - # factor = self.config.initializer_factor - # in_proj_std = (module.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor - # out_proj_std = (module.hidden_size**-0.5) * factor - # nn.init.normal_(module.q_proj.weight, std=in_proj_std) - # nn.init.normal_(module.k_proj.weight, std=in_proj_std) - # nn.init.normal_(module.v_proj.weight, std=in_proj_std) - # nn.init.normal_(module.out_proj.weight, std=out_proj_std) - # elif isinstance(module, CLAPMLP): - # factor = self.config.initializer_factor - # in_proj_std = ( - # (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor - # ) - # fc_std = (2 * module.config.hidden_size) ** -0.5 * factor - # nn.init.normal_(module.fc1.weight, std=fc_std) - # nn.init.normal_(module.fc2.weight, std=in_proj_std) - # elif isinstance(module, CLAPModel): - # nn.init.normal_( - # module.text_projection.weight, - # std=module.text_hidden_size**-0.5 * self.config.initializer_factor, - # ) - # nn.init.normal_( - # module.visual_projection.weight, - # std=module.vision_hidden_size**-0.5 * self.config.initializer_factor, - # 
) - # elif isinstance(module, CLAPVisionModelWithProjection): - # nn.init.normal_( - # module.visual_projection.weight, - # std=self.config.hidden_size**-0.5 * self.config.initializer_factor, - # ) - # elif isinstance(module, CLAPTextModelWithProjection): - # nn.init.normal_( - # module.text_projection.weight, - # std=self.config.hidden_size**-0.5 * self.config.initializer_factor, - # ) - - # if isinstance(module, nn.LayerNorm): - # module.bias.data.zero_() - # module.weight.data.fill_(1.0) - # if isinstance(module, nn.Linear) and module.bias is not None: - # module.bias.data.zero_() - - # def _set_gradient_checkpointing(self, module, value=False): - # if isinstance(module, CLAPEncoder): - # module.gradient_checkpointing = value @add_start_docstrings(CLAP_START_DOCSTRING) @@ -2163,7 +1950,7 @@ def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, d self.input_resolution = input_resolution self.blocks = nn.ModuleList( [ - SwinLayer( + CLAPAudioSwinLayer( config=config, dim=dim, input_resolution=input_resolution, @@ -2436,18 +2223,6 @@ def _forward_features(self, hidden_states, longer_idx=None): return (framewise_output, torch.sigmoid(hidden_states), fine_grained_latent_output, latent_output) - def crop_wav(self, hidden_states, crop_size, spe_pos=None): - time_steps = hidden_states.shape[2] - tx = torch.zeros(hidden_states.shape[0], hidden_states.shape[1], crop_size, hidden_states.shape[3]).to( - hidden_states.device - ) - for i in range(len(x)): - if spe_pos is None: - crop_pos = random.randint(0, time_steps - crop_size - 1) - else: - crop_pos = spe_pos - tx[i][0] = x[i, 0, crop_pos : crop_pos + crop_size, :] - return tx # Reshape the wavform to a img size, if you want to use the pretrained swin transformer model def reshape_wav2img(self, hidden_states): From 7dec8bc33d6597b545d81be63f23bc316d8df325 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 3 Feb 2023 10:10:07 +0000 Subject: [PATCH 034/197] more refactor --- src/transformers/models/clap/modeling_clap.py | 356 +++++++++--------- 1 file changed, 180 insertions(+), 176 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 00ab9aaaad87..947c8c12474e 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -131,6 +131,37 @@ class CLAPTextModelOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None + +@dataclass +class SwinEncoderOutput(ModelOutput): + """ + Swin encoder's outputs, with potential hidden states and attentions. + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, sequence_length, hidden_size)`. + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. 
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, hidden_size, height, width)`. + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + + + @dataclass class CLAPAudioModelOutput(ModelOutput): """ @@ -1682,7 +1713,7 @@ def __init__(self, config: CLAPConfig): self.text_transform = CLAPFusionLayer(text_config) self.text_projection = CLAPProjectionLayer(text_config) - self.audio_model = CLAPSwinTransformer(config=audio_config) + self.audio_model = CLAPAudioEncoder(config=audio_config) self.audio_transform = CLAPFusionLayer(audio_config) self.audio_projection = CLAPProjectionLayer(audio_config) @@ -1742,7 +1773,6 @@ def get_audio_features( self, input_features: Optional[torch.Tensor] = None, longer: Optional[torch.Tensor] = None, - waveform: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, @@ -1758,7 +1788,6 @@ def get_audio_features( audio_outputs = self.audio_model( input_features=input_features, longer=longer, - waveform=waveform, return_dict=return_dict, ) @@ -2057,174 +2086,55 @@ def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int] return input_feature -# The Core of HTSAT -class CLAPSwinTransformer(nn.Module): - def __init__(self, config: CLAPAudioConfig): - super(CLAPSwinTransformer, self).__init__() - self.config = config - self.spec_size = config.spec_size - self.patch_stride = config.patch_stride - self.window_size = config.window_size - self.hidden_size = config.hidden_size - self.depths = config.depths - self.use_absolute_pos_embedding = config.swin_absolute_positional_embedding - self.in_chans = config.input_channels - self.num_classes = config.num_classes - self.num_heads = config.num_heads - self.num_layers = len(self.depths) - self.num_features = int(self.hidden_size * 2 ** (self.num_layers - 1)) - - self.drop_rate = config.drop_path_rate - self.attn_drop_rate = config.attention_probs_dropout_prob - self.drop_path_rate = config.swin_drop_path_rate - - self.qkv_bias = config.qkv_bias - - self.patch_norm = nn.LayerNorm if config.enable_patch_layer_norm else None - self.norm_layer = nn.LayerNorm if self.patch_norm else None - self.norm_before_mlp = config.swin_norm_before_mlp - self.mlp_ratio = config.mlp_ratio - - self.use_checkpoint = config.swin_use_checkpoint - self.enable_fusion = config.enable_fusion - self.fusion_type = config.fusion_type - - # process mel-spec ; used only once - self.freq_ratio = self.spec_size // self.config.mel_bins - - self.interpolate_ratio = 32 # Downsampled ratio - # Spectrogram extractor - self.bn0 = nn.BatchNorm2d(self.config.mel_bins) +class CLAPAudioEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.num_layers = len(config.depths) - # split spctrogram into non-overlapping patches + self.config = config 
self.patch_embed = CLAPAudioPatchEmbed(config) + self.enable_fusion = config.enable_fusion + grid_size = self.patch_embed.grid_size + self.patch_stride = self.patch_embed.patch_stride + self.spec_size = config.spec_size + self.freq_ratio = self.spec_size // config.mel_bins - num_patches = self.patch_embed.num_patches - patches_resolution = self.patch_embed.grid_size - self.patches_resolution = patches_resolution - - # absolute position embedding - if self.use_absolute_pos_embedding: - self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, self.hidden_size)) - trunc_normal_(self.absolute_pos_embed, std=0.02) - - self.pos_drop = nn.Dropout(p=self.drop_rate) + self.num_features = int(config.hidden_size * 2 ** (self.num_layers - 1)) + self.freq_ratio = config.spec_size // config.mel_bins - dpr = [ - x.item() for x in torch.linspace(0, config.swin_drop_path_rate, sum(config.depths)) - ] + dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))] - # build layers - self.layers = nn.ModuleList() - for i_layer in range(self.num_layers): - layer = CLAPAudioLayer( + self.layers = nn.ModuleList( + [ + CLAPAudioLayer( config=config, dim=int(config.embed_dim * 2**i_layer), - input_resolution=(patches_resolution[0] // (2**i_layer), patches_resolution[1] // (2**i_layer)), + input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)), depth=config.depths[i_layer], num_heads=config.num_heads[i_layer], drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], downsample=CLAPAudioPatchMerging if (i_layer < self.num_layers - 1) else None, ) - - self.layers.append(layer) - - self.norm = self.norm_layer(self.num_features) - self.avgpool = nn.AdaptiveAvgPool1d(1) - self.maxpool = nn.AdaptiveMaxPool1d(1) - - SF = self.spec_size // (2 ** (len(self.depths) - 1)) // self.patch_stride[0] // self.freq_ratio - self.tscam_conv = nn.Conv2d( - in_channels=self.num_features, out_channels=self.num_classes, kernel_size=(SF, 3), padding=(0, 1) - ) - self.head = nn.Linear(self.num_classes, self.num_classes) - - # self.spectrogram_extractor = Spectrogram( - # n_fft=config.spectrogram_window_size, - # hop_length=config.hop_size, - # win_length=config.spectrogram_window_size, - # window=config.spectrogram_window, - # center=config.spectrogram_center, - # pad_mode=config.spectrogram_pad_mode, - # freeze_parameters=config.spectrogram_freeze_parameters, - # ) - # # Logmel feature extractor - # self.logmel_extractor = LogmelFilterBank( - # sr=config.sample_rate, - # n_fft=config.spectrogram_window_size, - # n_mels=config.mel_bins, - # fmin=config.fmin, - # fmax=config.fmax, - # ref=config.spectrogram_ref, - # amin=config.spectrogram_amin, - # top_db=config.spectrogram_top_db, - # freeze_parameters=config.spectrogram_freeze_parameters, - # ) - # # Spec augmenter - # self.spec_augmenter = SpecAugmentation( - # time_drop_width=config.spectrogram_time_drop_width, - # time_stripes_num=config.spectrogram_time_stripes_num, - # freq_drop_width=config.spectrogram_freq_drop_width, - # freq_stripes_num=config.spectrogram_freq_stripes_num, - # ) - - def _forward_features(self, hidden_states, longer_idx=None): - _, _, frames_num, _ = hidden_states.shape - - hidden_states = self.patch_embed(hidden_states, longer_idx=longer_idx) - - if self.use_absolute_pos_embedding: - hidden_states = hidden_states + self.absolute_pos_embed - - hidden_states = self.pos_drop(hidden_states) - - for i, layer in enumerate(self.layers): - hidden_states, _, _ = layer(hidden_states, 
layer.input_resolution) - - hidden_states = self.norm(hidden_states) - - batch_size, _, n_channels = hidden_states.shape - - freq_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[0] - temporal_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[1] - - hidden_states = ( - hidden_states.permute(0, 2, 1).contiguous().reshape(batch_size, n_channels, freq_shape, temporal_shape) + for i_layer in range(self.num_layers) + ] ) - batch_size, n_channels, n_frequencies, n_temp = hidden_states.shape - # group 2D CNN - c_freq_bin = n_frequencies // self.freq_ratio - hidden_states = hidden_states.reshape(batch_size, n_channels, n_frequencies // c_freq_bin, c_freq_bin, n_temp) - hidden_states = ( - hidden_states.permute(0, 1, 3, 2, 4).contiguous().reshape(batch_size, n_channels, c_freq_bin, -1) - ) - # get latent_output - fine_grained_latent_output = torch.mean(hidden_states, dim=2) - fine_grained_latent_output = interpolate( - fine_grained_latent_output.permute(0, 2, 1).contiguous(), 8 * self.patch_stride[1] - ) + self.gradient_checkpointing = False - latent_output = self.avgpool(torch.flatten(hidden_states, 2)) - latent_output = torch.flatten(latent_output, 1) + self.bn0 = nn.BatchNorm2d(config.mel_bins) + self.norm = nn.LayerNorm(self.num_features) + self.depths = config.depths - # display the attention map, if needed + self.avgpool = nn.AdaptiveAvgPool1d(1) - hidden_states = self.tscam_conv(hidden_states) - hidden_states = torch.flatten(hidden_states, 2) # B, C, T - framewise_output = interpolate( - torch.sigmoid(hidden_states).permute(0, 2, 1).contiguous(), 8 * self.patch_stride[1] + SF = config.spec_size // (2 ** (len(config.depths) - 1)) // self.patch_embed.patch_stride[0] // self.freq_ratio + self.tscam_conv = nn.Conv2d( + in_channels=self.num_features, out_channels=config.num_classes, kernel_size=(SF, 3), padding=(0, 1) ) + self.head = nn.Linear(config.num_classes, config.num_classes) - hidden_states = self.avgpool(hidden_states) - hidden_states = torch.flatten(hidden_states, 1) - - return (framewise_output, torch.sigmoid(hidden_states), fine_grained_latent_output, latent_output) - - - # Reshape the wavform to a img size, if you want to use the pretrained swin transformer model def reshape_wav2img(self, hidden_states): _, _, time_steps, freq_steps = hidden_states.shape @@ -2267,14 +2177,15 @@ def reshape_wav2img(self, hidden_states): def forward( self, - input_features=None, - longer=None, - waveform=None, - return_dict=False, - ): - if self.enable_fusion and longer.sum() == 0: - # if no audio is longer than 10s, then randomly select one audio to be longer - longer[torch.randint(0, longer.shape[0], (1,))] = True + input_features, + head_mask: Optional[torch.FloatTensor] = None, + longer: Optional[torch.FloatTensor]=None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + output_hidden_states_before_downsampling: Optional[bool] = False, + always_partition: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple, SwinEncoderOutput]: input_features = input_features.transpose(1, 3) hidden_states = self.bn0(input_features) @@ -2283,28 +2194,121 @@ def forward( longer_list_idx = None if self.enable_fusion: longer_list = longer.to(input_features.device) - longer_list_idx = torch.where(longer_list)[0] - - if self.training: - raise ValueError( - "CLAP does not support training since we need to enable `SpectrogramAugmentation`", - " this will be addressed in a future release.", - ) - # x = 
self.spec_augmenter(x) - # if mixup_lambda is not None: - # x = do_mixup(x, mixup_lambda) + longer_list_idx = torch.where(longer_list == 0)[0] hidden_states = self.reshape_wav2img(hidden_states) - output = self._forward_features(hidden_states, longer_idx=longer_list_idx) - if not return_dict: - return output + _, _, frames_num, _ = hidden_states.shape + + + hidden_states = self.patch_embed(hidden_states, longer_list_idx) - framewise_output, clipwise_output, fine_grained_embedding, output_embeddingss = output + all_hidden_states = () if output_hidden_states else None + all_reshaped_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if output_hidden_states: + batch_size, _, hidden_size = hidden_states.shape + # rearrange b (h w) c -> b c h w + reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size) + reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + + for i, layer_module in enumerate(self.layers): + layer_head_mask = head_mask[i] if head_mask is not None else None + + input_dimensions = layer_module.input_resolution + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), hidden_states, input_dimensions, layer_head_mask + ) + else: + layer_outputs = layer_module( + hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition + ) + + hidden_states = layer_outputs[0] + + hidden_states_before_downsampling = layer_outputs[1] + output_dimensions = layer_outputs[2] + + input_dimensions = (output_dimensions[-2], output_dimensions[-1]) + + if output_hidden_states and output_hidden_states_before_downsampling: + batch_size, _, hidden_size = hidden_states_before_downsampling.shape + # rearrange b (h w) c -> b c h w + # here we use the original (not downsampled) height and width + reshaped_hidden_state = hidden_states_before_downsampling.view( + batch_size, *(output_dimensions[0], output_dimensions[1]), hidden_size + ) + reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states_before_downsampling,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + elif output_hidden_states and not output_hidden_states_before_downsampling: + batch_size, _, hidden_size = hidden_states.shape + # rearrange b (h w) c -> b c h w + reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size) + reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + + if output_attentions: + all_self_attentions += layer_outputs[3:] + + hidden_states = self.norm(hidden_states) + + batch_size, _, n_channels = hidden_states.shape + + freq_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[0] + temporal_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[1] + + hidden_states = ( + hidden_states.permute(0, 2, 1).contiguous().reshape(batch_size, n_channels, freq_shape, temporal_shape) + ) + + batch_size, n_channels, n_frequencies, n_temp = hidden_states.shape + # group 2D CNN + c_freq_bin = n_frequencies // self.freq_ratio + hidden_states = 
hidden_states.reshape(batch_size, n_channels, n_frequencies // c_freq_bin, c_freq_bin, n_temp) + hidden_states = ( + hidden_states.permute(0, 1, 3, 2, 4).contiguous().reshape(batch_size, n_channels, c_freq_bin, -1) + ) + # get latent_output + fine_grained_latent_output = torch.mean(hidden_states, dim=2) + fine_grained_latent_output = interpolate( + fine_grained_latent_output.permute(0, 2, 1).contiguous(), 8 * self.patch_stride[1] + ) + + latent_output = self.avgpool(torch.flatten(hidden_states, 2)) + latent_output = torch.flatten(latent_output, 1) + + # display the attention map, if needed + + hidden_states = self.tscam_conv(hidden_states) + hidden_states = torch.flatten(hidden_states, 2) # B, C, T + + framewise_output = interpolate( + torch.sigmoid(hidden_states).permute(0, 2, 1).contiguous(), 8 * self.patch_stride[1] + ) + + hidden_states = self.avgpool(hidden_states) + hidden_states = torch.flatten(hidden_states, 1) + + if not return_dict: + return (framewise_output, torch.sigmoid(hidden_states), fine_grained_latent_output, latent_output) return CLAPAudioModelOutput( framewise_output=framewise_output, - clipwise_output=clipwise_output, - fine_grained_embedding=fine_grained_embedding, - embedding=output_embeddingss, - ) + clipwise_output=torch.sigmoid(hidden_states), + fine_grained_embedding=fine_grained_latent_output, + embedding=latent_output, + ) \ No newline at end of file From c3529480e007398db3c9fd63a0e23a18b5910ea3 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 3 Feb 2023 16:27:45 +0000 Subject: [PATCH 035/197] fix mel when rand_trunc --- .../models/clap/feature_extraction_clap.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 688fb49d9d88..4091cf4e87df 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -135,9 +135,6 @@ def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[n """ window = np.hanning(self.n_fft + 1)[:-1] - # TODO why don't we take the last value? - # window = np.hanning(self.n_fft + 1)[:-1] - frames = self._fram_wave(waveform) stft = self._stft(frames, window=window) @@ -237,15 +234,18 @@ def _get_audio_features(self, waveform: np.array, max_length, truncation, paddin else: longer = False # only use repeat as a new possible value for padding. 
you repeat the audio before applying the usual max_length padding - if waveform.shape[0] < max_length and padding == "repeatpad": # do nothing if equal - n_repeat = int(max_length / len(waveform)) - waveform = waveform.repeat(n_repeat) - - waveform = np.pad(waveform, (0, max_length - waveform.shape[0]), mode="constant", constant_values=0) - input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney) + if waveform.shape[0] < max_length: + if padding == "repeatpad": + n_repeat = int(max_length / len(waveform)) + waveform = waveform.repeat(n_repeat) + waveform = np.pad(waveform, (0, max_length - waveform.shape[0]), mode="constant", constant_values=0) + if truncation == "fusion": + input_mel = self._np_extract_fbank_features(waveform, self.mel_filters) input_mel = np.stack([input_mel, input_mel, input_mel, input_mel], axis=0) - + else: + input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney) + return input_mel, longer def __call__( From f2d6ffe7b57ef5bd12a7f5467df60729f48dab24 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 3 Feb 2023 16:28:44 +0000 Subject: [PATCH 036/197] style --- .../models/clap/feature_extraction_clap.py | 4 +-- src/transformers/models/clap/modeling_clap.py | 26 ++++++++----------- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 4091cf4e87df..d9731e695bba 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -239,13 +239,13 @@ def _get_audio_features(self, waveform: np.array, max_length, truncation, paddin n_repeat = int(max_length / len(waveform)) waveform = waveform.repeat(n_repeat) waveform = np.pad(waveform, (0, max_length - waveform.shape[0]), mode="constant", constant_values=0) - + if truncation == "fusion": input_mel = self._np_extract_fbank_features(waveform, self.mel_filters) input_mel = np.stack([input_mel, input_mel, input_mel, input_mel], axis=0) else: input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney) - + return input_mel, longer def __call__( diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 00ab9aaaad87..4e5c68e19180 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -33,7 +33,7 @@ BaseModelOutputWithPoolingAndCrossAttentions, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import apply_chunking_to_forward, meshgrid, find_pruneable_heads_and_indices, prune_linear_layer +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, meshgrid, prune_linear_layer from ...utils import ( ModelOutput, add_start_docstrings, @@ -577,7 +577,7 @@ def forward( outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs - + # Copied from transformers.models.swin.modeling_swin.SwinIntermediate with Swin->CLAPAudio class CLAPAudioIntermediate(nn.Module): def __init__(self, config, dim): @@ -1940,7 +1940,6 @@ def forward( ) - # Copied from transformers.models.swin.modeling_swin with Swin->CLAPAudio class CLAPAudioLayer(nn.Module): def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample): @@ -2111,22 +2110,20 @@ def __init__(self, config: CLAPAudioConfig): self.pos_drop = nn.Dropout(p=self.drop_rate) - dpr = [ - x.item() for x in 
torch.linspace(0, config.swin_drop_path_rate, sum(config.depths)) - ] + dpr = [x.item() for x in torch.linspace(0, config.swin_drop_path_rate, sum(config.depths))] # build layers self.layers = nn.ModuleList() for i_layer in range(self.num_layers): layer = CLAPAudioLayer( - config=config, - dim=int(config.embed_dim * 2**i_layer), - input_resolution=(patches_resolution[0] // (2**i_layer), patches_resolution[1] // (2**i_layer)), - depth=config.depths[i_layer], - num_heads=config.num_heads[i_layer], - drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], - downsample=CLAPAudioPatchMerging if (i_layer < self.num_layers - 1) else None, - ) + config=config, + dim=int(config.embed_dim * 2**i_layer), + input_resolution=(patches_resolution[0] // (2**i_layer), patches_resolution[1] // (2**i_layer)), + depth=config.depths[i_layer], + num_heads=config.num_heads[i_layer], + drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], + downsample=CLAPAudioPatchMerging if (i_layer < self.num_layers - 1) else None, + ) self.layers.append(layer) @@ -2223,7 +2220,6 @@ def _forward_features(self, hidden_states, longer_idx=None): return (framewise_output, torch.sigmoid(hidden_states), fine_grained_latent_output, latent_output) - # Reshape the wavform to a img size, if you want to use the pretrained swin transformer model def reshape_wav2img(self, hidden_states): _, _, time_steps, freq_steps = hidden_states.shape From e71749c5b69b6f5b5725647ec6f3b4b0a0a782bb Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 3 Feb 2023 16:29:22 +0000 Subject: [PATCH 037/197] remove unused imports --- src/transformers/models/clap/feature_extraction_clap.py | 1 - src/transformers/models/clap/modeling_clap.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index d9731e695bba..4bde80242d2e 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -21,7 +21,6 @@ import numpy as np import torchvision -from ... 
import __version__ from ...feature_extraction_sequence_utils import SequenceFeatureExtractor from ...feature_extraction_utils import BatchFeature from ...utils import TensorType, logging diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 4e5c68e19180..854caf71e44d 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -15,16 +15,13 @@ """ PyTorch CLAP model.""" import collections import math -import random from dataclasses import dataclass from typing import Any, List, Optional, Tuple, Union import numpy as np import torch import torch.nn.functional as F -import torch.utils.checkpoint as checkpoint from torch import nn -from torch.nn.init import _calculate_fan_in_and_fan_out from ...activations import ACT2FN from ...modeling_outputs import ( From f1627c6a0ffcf212e718704bb327962b0f505873 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 3 Feb 2023 16:30:27 +0000 Subject: [PATCH 038/197] update processing --- src/transformers/models/clap/processing_clap.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/transformers/models/clap/processing_clap.py b/src/transformers/models/clap/processing_clap.py index b1984acb1863..d811f10eb6f9 100644 --- a/src/transformers/models/clap/processing_clap.py +++ b/src/transformers/models/clap/processing_clap.py @@ -16,8 +16,6 @@ audio/Text processor class for CLAP """ -import warnings - from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import BatchEncoding From 612e46f4848d08d7378ac6105bb12598d405fca4 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 3 Feb 2023 16:35:34 +0000 Subject: [PATCH 039/197] remove image processing tests --- .../models/clap/test_image_processing_clap.py | 305 ------------------ 1 file changed, 305 deletions(-) delete mode 100644 tests/models/clap/test_image_processing_clap.py diff --git a/tests/models/clap/test_image_processing_clap.py b/tests/models/clap/test_image_processing_clap.py deleted file mode 100644 index 93a43e5096e2..000000000000 --- a/tests/models/clap/test_image_processing_clap.py +++ /dev/null @@ -1,305 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import unittest - -import numpy as np - -from transformers.testing_utils import require_torch, require_vision -from transformers.utils import is_torch_available, is_vision_available - -from ...test_image_processing_common import ImageProcessingSavingTestMixin - - -if is_torch_available(): - import torch - -if is_vision_available(): - from PIL import Image - - from transformers import CLAPImageProcessor - - -class CLAPImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - image_size=18, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_center_crop=True, - crop_size=None, - do_normalize=True, - image_mean=[0.48145466, 0.4578275, 0.40821073], - image_std=[0.26862954, 0.26130258, 0.27577711], - do_convert_rgb=True, - ): - size = size if size is not None else {"shortest_edge": 20} - crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size - self.do_center_crop = do_center_crop - self.crop_size = crop_size - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - self.do_convert_rgb = do_convert_rgb - - def prepare_image_processor_dict(self): - return { - "do_resize": self.do_resize, - "size": self.size, - "do_center_crop": self.do_center_crop, - "crop_size": self.crop_size, - "do_normalize": self.do_normalize, - "image_mean": self.image_mean, - "image_std": self.image_std, - "do_convert_rgb": self.do_convert_rgb, - } - - def prepare_inputs(self, equal_resolution=False, numpify=False, torchify=False): - """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, - or a list of PyTorch tensors if one specifies torchify=True. 
- """ - - assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time" - - if equal_resolution: - image_inputs = [] - for i in range(self.batch_size): - image_inputs.append( - np.random.randint( - 255, size=(self.num_channels, self.max_resolution, self.max_resolution), dtype=np.uint8 - ) - ) - else: - image_inputs = [] - for i in range(self.batch_size): - width, height = np.random.choice(np.arange(self.min_resolution, self.max_resolution), 2) - image_inputs.append(np.random.randint(255, size=(self.num_channels, width, height), dtype=np.uint8)) - - if not numpify and not torchify: - # PIL expects the channel dimension as last dimension - image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] - - if torchify: - image_inputs = [torch.from_numpy(x) for x in image_inputs] - - return image_inputs - - -@require_torch -@require_vision -class CLAPImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - - image_processing_class = CLAPImageProcessor if is_vision_available() else None - - def setUp(self): - self.image_processor_tester = CLAPImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "do_center_crop")) - self.assertTrue(hasattr(image_processing, "center_crop")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - self.assertTrue(hasattr(image_processing, "do_convert_rgb")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"shortest_edge": 20}) - self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) - - image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84) - self.assertEqual(image_processor.size, {"shortest_edge": 42}) - self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) - - def test_batch_feature(self): - pass - - def test_call_pil(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PIL images - image_inputs = self.image_processor_tester.prepare_inputs(equal_resolution=False) - for image in image_inputs: - self.assertIsInstance(image, Image.Image) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values - self.assertEqual( - encoded_images.shape, - ( - 1, - self.image_processor_tester.num_channels, - self.image_processor_tester.crop_size["height"], - self.image_processor_tester.crop_size["width"], - ), - ) - - # Test batched - encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values - self.assertEqual( - encoded_images.shape, - ( - self.image_processor_tester.batch_size, - self.image_processor_tester.num_channels, - self.image_processor_tester.crop_size["height"], - self.image_processor_tester.crop_size["width"], - ), - ) - - def test_call_numpy(self): - # Initialize image_processing - image_processing = 
self.image_processing_class(**self.image_processor_dict) - # create random numpy tensors - image_inputs = self.image_processor_tester.prepare_inputs(equal_resolution=False, numpify=True) - for image in image_inputs: - self.assertIsInstance(image, np.ndarray) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values - self.assertEqual( - encoded_images.shape, - ( - 1, - self.image_processor_tester.num_channels, - self.image_processor_tester.crop_size["height"], - self.image_processor_tester.crop_size["width"], - ), - ) - - # Test batched - encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values - self.assertEqual( - encoded_images.shape, - ( - self.image_processor_tester.batch_size, - self.image_processor_tester.num_channels, - self.image_processor_tester.crop_size["height"], - self.image_processor_tester.crop_size["width"], - ), - ) - - def test_call_pytorch(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PyTorch tensors - image_inputs = self.image_processor_tester.prepare_inputs(equal_resolution=False, torchify=True) - for image in image_inputs: - self.assertIsInstance(image, torch.Tensor) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values - self.assertEqual( - encoded_images.shape, - ( - 1, - self.image_processor_tester.num_channels, - self.image_processor_tester.crop_size["height"], - self.image_processor_tester.crop_size["width"], - ), - ) - - # Test batched - encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values - self.assertEqual( - encoded_images.shape, - ( - self.image_processor_tester.batch_size, - self.image_processor_tester.num_channels, - self.image_processor_tester.crop_size["height"], - self.image_processor_tester.crop_size["width"], - ), - ) - - -@require_torch -@require_vision -class CLAPImageProcessingTestFourChannels(ImageProcessingSavingTestMixin, unittest.TestCase): - - image_processing_class = CLAPImageProcessor if is_vision_available() else None - - def setUp(self): - self.image_processor_tester = CLAPImageProcessingTester(self, num_channels=4) - self.expected_encoded_image_num_channels = 3 - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "do_center_crop")) - self.assertTrue(hasattr(image_processing, "center_crop")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - self.assertTrue(hasattr(image_processing, "do_convert_rgb")) - - def test_batch_feature(self): - pass - - def test_call_pil_four_channels(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PIL images - image_inputs = self.image_processor_tester.prepare_inputs(equal_resolution=False) - for image in image_inputs: - self.assertIsInstance(image, Image.Image) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values - self.assertEqual( - 
encoded_images.shape, - ( - 1, - self.expected_encoded_image_num_channels, - self.image_processor_tester.crop_size["height"], - self.image_processor_tester.crop_size["width"], - ), - ) - - # Test batched - encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values - self.assertEqual( - encoded_images.shape, - ( - self.image_processor_tester.batch_size, - self.expected_encoded_image_num_channels, - self.image_processor_tester.crop_size["height"], - self.image_processor_tester.crop_size["width"], - ), - ) From c3042e2c7ec1272163ac76d3b67b8906d0c54a6b Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 3 Feb 2023 17:12:43 +0000 Subject: [PATCH 040/197] add testing fiel --- .../clap/test_feature_extraction_clap.py | 246 ++++++++++++++++++ 1 file changed, 246 insertions(+) create mode 100644 tests/models/clap/test_feature_extraction_clap.py diff --git a/tests/models/clap/test_feature_extraction_clap.py b/tests/models/clap/test_feature_extraction_clap.py new file mode 100644 index 000000000000..75b049b4cecd --- /dev/null +++ b/tests/models/clap/test_feature_extraction_clap.py @@ -0,0 +1,246 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import itertools +import os +import random +import tempfile +import unittest + +import numpy as np + +from transformers import is_speech_available +from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_torchaudio +from transformers.utils.import_utils import is_torch_available + +from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin + + +if is_speech_available(): + from transformers import CLAPFeatureExtractor + +if is_torch_available(): + import torch + +global_rng = random.Random() + +# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list +def floats_list(shape, scale=1.0, rng=None, name=None): + """Creates a random float32 tensor""" + if rng is None: + rng = global_rng + + values = [] + for batch_idx in range(shape[0]): + values.append([]) + for _ in range(shape[1]): + values[-1].append(rng.random() * scale) + + return values + +@require_torch +@require_torchaudio +# Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTester with Whisper->CLAP +class CLAPFeatureExtractionTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + min_seq_length=400, + max_seq_length=2000, + feature_size=10, + hop_length=160, + chunk_length=8, + padding_value=0.0, + sampling_rate=4_000, + return_attention_mask=False, + do_normalize=True, + ): + self.parent = parent + self.batch_size = batch_size + self.min_seq_length = min_seq_length + self.max_seq_length = max_seq_length + self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1) + self.padding_value = padding_value + self.sampling_rate = sampling_rate + self.return_attention_mask = return_attention_mask + self.do_normalize = do_normalize + 
self.feature_size = feature_size + self.chunk_length = chunk_length + self.hop_length = hop_length + + def prepare_feat_extract_dict(self): + return { + "feature_size": self.feature_size, + "hop_length": self.hop_length, + "chunk_length": self.chunk_length, + "padding_value": self.padding_value, + "sampling_rate": self.sampling_rate, + "return_attention_mask": self.return_attention_mask, + "do_normalize": self.do_normalize, + } + + def prepare_inputs_for_common(self, equal_length=False, numpify=False): + def _flatten(list_of_lists): + return list(itertools.chain(*list_of_lists)) + + if equal_length: + speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)] + else: + # make sure that inputs increase in size + speech_inputs = [ + floats_list((x, self.feature_size)) + for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff) + ] + if numpify: + speech_inputs = [np.asarray(x) for x in speech_inputs] + return speech_inputs + + +@require_torch +@require_torchaudio +# Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest with Whisper->CLAP +class CLAPFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): + + feature_extraction_class = CLAPFeatureExtractor if is_speech_available() else None + + def setUp(self): + self.feat_extract_tester = CLAPFeatureExtractionTester(self) + + def test_feat_extract_from_and_save_pretrained(self): + feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + saved_file = feat_extract_first.save_pretrained(tmpdirname)[0] + check_json_file_has_correct_format(saved_file) + feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname) + + dict_first = feat_extract_first.to_dict() + dict_second = feat_extract_second.to_dict() + mel_1 = dict_first.pop("mel_filters") + mel_2 = dict_second.pop("mel_filters") + self.assertTrue(np.allclose(mel_1, mel_2)) + self.assertEqual(dict_first, dict_second) + + def test_feat_extract_to_json_file(self): + feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + json_file_path = os.path.join(tmpdirname, "feat_extract.json") + feat_extract_first.to_json_file(json_file_path) + feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path) + + dict_first = feat_extract_first.to_dict() + dict_second = feat_extract_second.to_dict() + mel_1 = dict_first.pop("mel_filters") + mel_2 = dict_second.pop("mel_filters") + self.assertTrue(np.allclose(mel_1, mel_2)) + self.assertEqual(dict_first, dict_second) + + def test_call(self): + # Tests that all call wrap to encode_plus and batch_encode_plus + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + # create three inputs of length 800, 1000, and 1200 + speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] + np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] + + # Test feature size + input_features = feature_extractor(np_speech_inputs, padding="max_length", return_tensors="np").input_features + self.assertTrue(input_features.ndim == 3) + self.assertTrue(input_features.shape[-1] == feature_extractor.nb_max_frames) + self.assertTrue(input_features.shape[-2] == feature_extractor.feature_size) + + # Test not batched input + encoded_sequences_1 = feature_extractor(speech_inputs[0], 
return_tensors="np").input_features + encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features + self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3)) + + # Test batched + encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features + encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features + for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): + self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) + + # Test truncation required + speech_inputs = [floats_list((1, x))[0] for x in range(200, (feature_extractor.n_samples + 500), 200)] + np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] + + speech_inputs_truncated = [x[: feature_extractor.n_samples] for x in speech_inputs] + np_speech_inputs_truncated = [np.asarray(speech_input) for speech_input in speech_inputs_truncated] + + encoded_sequences_1 = feature_extractor(np_speech_inputs, return_tensors="np").input_features + encoded_sequences_2 = feature_extractor(np_speech_inputs_truncated, return_tensors="np").input_features + for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): + self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) + + def test_double_precision_pad(self): + import torch + + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + np_speech_inputs = np.random.rand(100, 32).astype(np.float64) + py_speech_inputs = np_speech_inputs.tolist() + + for inputs in [py_speech_inputs, np_speech_inputs]: + np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np") + self.assertTrue(np_processed.input_features.dtype == np.float32) + pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt") + self.assertTrue(pt_processed.input_features.dtype == torch.float32) + + def _load_datasamples(self, num_samples): + from datasets import load_dataset + + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + + return [x["array"] for x in speech_samples] + + def integration_test_rand_trunc(self): + # fmt: off + EXPECTED_INPUT_FEATURES = torch.tensor( + [ + 0.1193, -0.0946, -0.1098, -0.0196, 0.0225, -0.0690, -0.1736, 0.0951, + 0.0971, -0.0817, -0.0702, 0.0162, 0.0260, 0.0017, -0.0192, -0.1678, + 0.0709, -0.1867, -0.0655, -0.0274, -0.0234, -0.1884, -0.0516, -0.0554, + -0.0274, -0.1425, -0.1423, 0.0837, 0.0377, -0.0854 + ] + ) + # fmt: on + + input_speech = self._load_datasamples(1) + feaure_extractor = CLAPFeatureExtractor() + input_features = feaure_extractor(input_speech, return_tensors="pt").input_features + self.assertTrue(torch.allclose(input_features[0, 0, :30], EXPECTED_INPUT_FEATURES, atol=1e-4)) + + + def integration_test_fusion(self): + # fmt: off + EXPECTED_INPUT_FEATURES = torch.tensor( + [ + 0.1193, -0.0946, -0.1098, -0.0196, 0.0225, -0.0690, -0.1736, 0.0951, + 0.0971, -0.0817, -0.0702, 0.0162, 0.0260, 0.0017, -0.0192, -0.1678, + 0.0709, -0.1867, -0.0655, -0.0274, -0.0234, -0.1884, -0.0516, -0.0554, + -0.0274, -0.1425, -0.1423, 0.0837, 0.0377, -0.0854 + ] + ) + # fmt: on + + input_speech = self._load_datasamples(1) + feaure_extractor = CLAPFeatureExtractor() + input_features = feaure_extractor(input_speech, return_tensors="pt", truncation = 
"rand_trunc").input_features + self.assertTrue(torch.allclose(input_features[0, 0, :30], EXPECTED_INPUT_FEATURES, atol=1e-4)) + + # TODO test fusion with a longer audio \ No newline at end of file From 68cbadb6ebebacbd77247cc5b71b633bbced0fea Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 3 Feb 2023 17:20:34 +0000 Subject: [PATCH 041/197] fixmodeling issues --- src/transformers/models/clap/modeling_clap.py | 43 +++++++++++++------ 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 023eb76473dd..45745d53fa1c 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -128,7 +128,6 @@ class CLAPTextModelOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None - @dataclass class SwinEncoderOutput(ModelOutput): """ @@ -158,7 +157,6 @@ class SwinEncoderOutput(ModelOutput): reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - @dataclass class CLAPAudioModelOutput(ModelOutput): """ @@ -452,7 +450,7 @@ def window_reverse(windows, window_size, H, W): return x -# Copied from transformers.models.swin_transformer.modeling_swin_transformer.SwinTransformerLayer with Swin->CLAPAudio +# Copied from transformers.models.swin.modeling_swin.SwinSelfAttention with Swin->CLAPAudio class CLAPAudioSelfAttention(nn.Module): def __init__(self, config, dim, num_heads, window_size): super().__init__() @@ -524,7 +522,7 @@ def forward( attention_scores = attention_scores + relative_position_bias.unsqueeze(0) if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in SwinModel forward() function) + # Apply the attention mask is (precomputed for all layers in CLAPAudioModel forward() function) mask_shape = attention_mask.shape[0] attention_scores = attention_scores.view( batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim @@ -635,7 +633,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states -# Copied from transformers.models.swin.modeling_swin.SwinLayer with Swin->CLAPAudio +# Copied from transformers.models.swin.modeling_swin.SwinLayer with Swin->CLAPAudio, SwinDropPath->CLAPDropPath class CLAPAudioSwinLayer(nn.Module): def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): super().__init__() @@ -1195,6 +1193,24 @@ def __init__(self, config, position_embedding_type=None): self.output = CLAPTextSelfOutput(config) self.pruned_heads = set() + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + def forward( self, hidden_states: torch.Tensor, @@ -1661,7 +1677,6 @@ def forward( ) -# Copied from transformers.models.clip.modeling_clip.CLIPPreTrainedModel with CLIP->CLAP,clip->clap class 
CLAPPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -1981,10 +1996,10 @@ def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, d input_resolution=input_resolution, num_heads=num_heads, shift_size=0 if (i % 2 == 0) else config.window_size // 2, - ) +) for i in range(depth) - ] - ) +] + ) # patch merging layer if downsample is not None: @@ -2111,7 +2126,7 @@ def __init__(self, config): num_heads=config.num_heads[i_layer], drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], downsample=CLAPAudioPatchMerging if (i_layer < self.num_layers - 1) else None, - ) + ) for i_layer in range(self.num_layers) ] ) @@ -2196,7 +2211,7 @@ def forward( _, _, frames_num, _ = hidden_states.shape - + hidden_states = self.patch_embed(hidden_states, longer_list_idx) all_hidden_states = () if output_hidden_states else None @@ -2259,7 +2274,7 @@ def custom_forward(*inputs): if output_attentions: all_self_attentions += layer_outputs[3:] - + hidden_states = self.norm(hidden_states) batch_size, _, n_channels = hidden_states.shape @@ -2269,7 +2284,7 @@ def custom_forward(*inputs): hidden_states = ( hidden_states.permute(0, 2, 1).contiguous().reshape(batch_size, n_channels, freq_shape, temporal_shape) - ) +) batch_size, n_channels, n_frequencies, n_temp = hidden_states.shape # group 2D CNN @@ -2277,7 +2292,7 @@ def custom_forward(*inputs): hidden_states = hidden_states.reshape(batch_size, n_channels, n_frequencies // c_freq_bin, c_freq_bin, n_temp) hidden_states = ( hidden_states.permute(0, 1, 3, 2, 4).contiguous().reshape(batch_size, n_channels, c_freq_bin, -1) - ) + ) # get latent_output fine_grained_latent_output = torch.mean(hidden_states, dim=2) fine_grained_latent_output = interpolate( From c20c66ef87417f8dc04a039a95eb090e80030f08 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 3 Feb 2023 17:25:19 +0000 Subject: [PATCH 042/197] replace with `is_longer` --- src/transformers/models/clap/modeling_clap.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 45745d53fa1c..31f5194dc5ab 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -332,7 +332,7 @@ def __init__(self, config: CLAPAudioConfig): padding=padding, ) - def forward(self, x, longer_idx=None): + def forward(self, x, is_longer_idx=None): if self.enable_fusion: global_x = x[:, 0:1, :, :] @@ -343,9 +343,9 @@ def forward(self, x, longer_idx=None): ), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
global_x = self.proj(global_x) TW = global_x.size(-1) - if len(longer_idx) > 0: + if len(is_longer_idx) > 0: # local processing - local_x = x[longer_idx, 1:, :, :].contiguous() + local_x = x[is_longer_idx, 1:, :, :].contiguous() B, C, H, W = local_x.shape local_x = local_x.view(B * C, 1, H, W) local_x = self.mel_conv2d(local_x) @@ -359,7 +359,7 @@ def forward(self, x, longer_idx=None): else: local_x = local_x[:, :, :, :TW] - global_x[longer_idx] = self.fusion_model(global_x[longer_idx], local_x) + global_x[is_longer_idx] = self.fusion_model(global_x[is_longer_idx], local_x) x = global_x else: B, C, H, W = x.shape @@ -1784,7 +1784,7 @@ def get_text_features( def get_audio_features( self, input_features: Optional[torch.Tensor] = None, - longer: Optional[torch.Tensor] = None, + is_longer: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, @@ -1799,7 +1799,7 @@ def get_audio_features( audio_outputs = self.audio_model( input_features=input_features, - longer=longer, + is_longer=is_longer, return_dict=return_dict, ) @@ -2190,7 +2190,7 @@ def forward( self, input_features, head_mask: Optional[torch.FloatTensor] = None, - longer: Optional[torch.FloatTensor]=None, + is_longer: Optional[torch.FloatTensor]=None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, output_hidden_states_before_downsampling: Optional[bool] = False, @@ -2202,17 +2202,17 @@ def forward( hidden_states = self.bn0(input_features) hidden_states = hidden_states.transpose(1, 3) - longer_list_idx = None + is_longer_list_idx = None if self.enable_fusion: - longer_list = longer.to(input_features.device) - longer_list_idx = torch.where(longer_list == 0)[0] + is_longer_list = is_longer.to(input_features.device) + is_longer_list_idx = torch.where(is_longer_list == 0)[0] hidden_states = self.reshape_wav2img(hidden_states) _, _, frames_num, _ = hidden_states.shape - hidden_states = self.patch_embed(hidden_states, longer_list_idx) + hidden_states = self.patch_embed(hidden_states, is_longer_list_idx) all_hidden_states = () if output_hidden_states else None all_reshaped_hidden_states = () if output_hidden_states else None @@ -2284,7 +2284,7 @@ def custom_forward(*inputs): hidden_states = ( hidden_states.permute(0, 2, 1).contiguous().reshape(batch_size, n_channels, freq_shape, temporal_shape) -) + ) batch_size, n_channels, n_frequencies, n_temp = hidden_states.shape # group 2D CNN From 9c03ac787b7aca791bcffdaf74b14b6193b29d99 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Sat, 4 Feb 2023 18:10:43 +0000 Subject: [PATCH 043/197] clap in serialization --- docs/source/en/serialization.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index 2790a87ea6e0..85d3f6bafa21 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -64,7 +64,7 @@ Ready-made configurations include the following architectures: - BLOOM - CamemBERT - Chinese-CLIP -- clap +- CLAP - CLIP - CodeGen - Conditional DETR From 6717d728514f4654613d42d9fb462b8fed5b9d37 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 6 Feb 2023 09:20:42 +0000 Subject: [PATCH 044/197] more refactor --- src/transformers/models/clap/modeling_clap.py | 266 ++++++++---------- 1 file changed, 120 insertions(+), 146 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py 
b/src/transformers/models/clap/modeling_clap.py index 31f5194dc5ab..05e308ab425d 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -43,44 +43,89 @@ logger = logging.get_logger(__name__) -_CHECKPOINT_FOR_DOC = "laion-ai/base" +_CHECKPOINT_FOR_DOC = "laion-ai/clap-htst-unfused-base" CLAP_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "laion-ai/base", + "laion-ai/clap-htst-unfused-base", # See all clap models at https://huggingface.co/models?filter=clap ] -def do_mixup(x, mixup_lambda): +# Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/utils.py#L176 +def do_mixup(hidden_states, mixup_lambda): """ + MIXUP is a data augmentation method, proposed by Hongyi Zhang et al on 25 Oct. 2017. https://arxiv.org/abs/1710.09412 + Based on the mixing ratio sampled from the Beta distribution, it is a method of expanding data by mixing both input and output. + By using this, it is said that generalization performance improves because the decision boundary becomes smooth. + Args: - x: (batch_size , ...) - mixup_lambda: (batch_size,) - Returns: - out: (batch_size, ...) + hidden_states: (`torch.FloatTensor` of shape (batch_size, seq_length, hidden_size)) + Input hidden states + mixup_lambda: (`torch.FloatTensor`) + Mixing ratio sampled from the Beta distribution """ out = ( - x.transpose(0, -1) * mixup_lambda + torch.flip(x, dims=[0]).transpose(0, -1) * (1 - mixup_lambda) + hidden_states.transpose(0, -1) * mixup_lambda + torch.flip(hidden_states, dims=[0]).transpose(0, -1) * (1 - mixup_lambda) ).transpose(0, -1) return out -def interpolate(x, ratio): - """Interpolate data in time domain. This is used to compensate the +# Adapted from: https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/utils.py#L191 +def interpolate(hidden_states, ratio): + """ + Interpolate data in time domain. This is used to compensate the resolution reduction in downsampling of a CNN. Args: - x: (batch_size, time_steps, classes_num) - ratio: int, ratio to interpolate - Returns: - upsampled: (batch_size, time_steps * ratio, classes_num) + hidden_states: (`torch.FloatTensor` of shape (batch_size, time_steps, classes_num)) + Input hidden states + ratio: (`int`) + The ratio of the length of the output to the length of the input. """ - (batch_size, time_steps, classes_num) = x.shape - upsampled = x[:, :, None, :].repeat(1, 1, ratio, 1) + (batch_size, time_steps, classes_num) = hidden_states.shape + upsampled = hidden_states[:, :, None, :].repeat(1, 1, ratio, 1) upsampled = upsampled.reshape(batch_size, time_steps * ratio, classes_num) return upsampled +# Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/htsat.py#L249 +def window_partition(hidden_states, window_size): + """ + Returns the resized hidden states. 
The output shape should be `(batch_size * num_windows, window_size, window_size, num_channels)` + + Args: + hidden_states: (`torch.FloatTensor` of shape `(batch_size, height, width, num_channels)`) + Input hidden states + window_size: (`int`) + Window size + """ + batch_size, height, width, num_channels = hidden_states.shape + + hidden_states = hidden_states.view(batch_size, height // window_size, window_size, width // window_size, window_size, num_channels) + windows = hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels) + return windows + + +# Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/htsat.py#L263 +def window_reverse(windows, window_size, height, width): + """ + Args: + windows: (`torch.FloatTensor` of shape `(num_windows * batch_size, window_size, window_size, num_channels)`) + Input windows + window_size: (`int`) + Window size + height: (`int`) + Height of the resized image + width: (`int`) + Width of the resized image + """ + batch_size = int(windows.shape[0] / (height * width / window_size / window_size)) + + hidden_states = windows.view(batch_size, height // window_size, width // window_size, window_size, window_size, -1) + hidden_states = hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().view(batch_size, height, width, -1) + return hidden_states + + # Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): """ @@ -98,6 +143,19 @@ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_l return incremental_indices.long() + padding_idx +# contrastive loss function, adapted from +# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/Clip.html +def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: + return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) + + +# Copied from transformers.models.clip.modeling_clip.clip_loss with clip->clap +def clap_loss(similarity: torch.Tensor) -> torch.Tensor: + caption_loss = contrastive_loss(similarity) + image_loss = contrastive_loss(similarity.t()) + return (caption_loss + image_loss) / 2.0 + + @dataclass # Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->CLAP class CLAPTextModelOutput(ModelOutput): @@ -129,6 +187,7 @@ class CLAPTextModelOutput(ModelOutput): @dataclass +# Copied from transformers.models.swin.modeling_swin.SwinEncoderOutput class SwinEncoderOutput(ModelOutput): """ Swin encoder's outputs, with potential hidden states and attentions. @@ -160,7 +219,7 @@ class SwinEncoderOutput(ModelOutput): @dataclass class CLAPAudioModelOutput(ModelOutput): """ - Base class for text model's outputs that also contains a pooling of the last hidden states. + CLAPAudio model output to mimic the output of the original implementation. Args: framewise_output (`torch.FloatTensor` of shape `(batch_size, num_frames, hidden_size)`): @@ -217,13 +276,11 @@ def to_tuple(self) -> Tuple[Any]: ) +# Adapted from transformers.models.swin.modeling_swin.SwinDropPath class CLAPDropPath(nn.Module): """ - Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is the same as the - DropConnect impl I created for EfficientNet, etc networks, however, the original name is misleading as 'Drop - Connect' is a different form of dropout in a separate paper... 
See discussion: - https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the layer and - argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the argument. + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is a + slightly refactored version of the `SwinDropPath` implementation. """ def __init__(self, drop_prob=None): @@ -332,122 +389,52 @@ def __init__(self, config: CLAPAudioConfig): padding=padding, ) - def forward(self, x, is_longer_idx=None): + def forward(self, hidden_states, is_longer_idx=None): if self.enable_fusion: - global_x = x[:, 0:1, :, :] + global_hidden_states = hidden_states[:, 0:1, :, :] # global processing - B, C, H, W = global_x.shape - assert ( - H == self.img_size[0] and W == self.img_size[1] - ), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." - global_x = self.proj(global_x) - TW = global_x.size(-1) + batch_size, num_channels, height, width = global_hidden_states.shape + + if height != self.img_size[0] or width != self.img_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + ) + + global_hidden_states = self.proj(global_hidden_states) + output_width = global_hidden_states.size(-1) if len(is_longer_idx) > 0: # local processing - local_x = x[is_longer_idx, 1:, :, :].contiguous() - B, C, H, W = local_x.shape - local_x = local_x.view(B * C, 1, H, W) - local_x = self.mel_conv2d(local_x) - local_x = local_x.view(B, C, local_x.size(1), local_x.size(2), local_x.size(3)) - local_x = local_x.permute((0, 2, 3, 1, 4)).contiguous().flatten(3) - TB, TC, TH, _ = local_x.size() - if local_x.size(-1) < TW: - local_x = torch.cat( - [local_x, torch.zeros((TB, TC, TH, TW - local_x.size(-1)), device=global_x.device)], dim=-1 + local_hidden_states = hidden_states[is_longer_idx, 1:, :, :].contiguous() + batch_size, num_channels, height, width = local_hidden_states.shape + + local_hidden_states = local_hidden_states.view(batch_size * num_channels, 1, height, width) + local_hidden_states = self.mel_conv2d(local_hidden_states) + local_hidden_states = local_hidden_states.view(batch_size, num_channels, local_hidden_states.size(1), local_hidden_states.size(2), local_hidden_states.size(3)) + local_hidden_states = local_hidden_states.permute((0, 2, 3, 1, 4)).contiguous().flatten(3) + output_batch_size, output_num_channels, output_height, _ = local_hidden_states.size() + + if local_hidden_states.size(-1) < output_width: + local_hidden_states = torch.cat( + [local_hidden_states, torch.zeros((output_batch_size, output_num_channels, output_height, output_width - local_hidden_states.size(-1)), device=global_hidden_states.device)], dim=-1 ) else: - local_x = local_x[:, :, :, :TW] + local_hidden_states = local_hidden_states[:, :, :, :output_width] - global_x[is_longer_idx] = self.fusion_model(global_x[is_longer_idx], local_x) - x = global_x + global_hidden_states[is_longer_idx] = self.fusion_model(global_hidden_states[is_longer_idx], local_hidden_states) + hidden_states = global_hidden_states else: - B, C, H, W = x.shape - assert ( - H == self.img_size[0] and W == self.img_size[1] - ), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
- x = self.proj(x) + _, _, height, width = hidden_states.shape + if height != self.img_size[0] or width != self.img_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + ) + hidden_states = self.proj(hidden_states) if self.flatten: - x = x.flatten(2).transpose(1, 2) # BCHW -> BNC - x = self.norm(x) - return x - - -def _no_grad_trunc_normal_(tensor, mean, std, a, b): - # Cut & paste from PyTorch official master until it's in a few official releases - RW - # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf - def norm_cdf(x): - # Computes standard normal cumulative distribution function - return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 - - with torch.no_grad(): - # Values are generated by using a truncated uniform distribution and - # then using the inverse CDF for the normal distribution. - # Get upper and lower cdf values - l = norm_cdf((a - mean) / std) - u = norm_cdf((b - mean) / std) - - # Uniformly fill tensor with values from [l, u], then translate to - # [2l-1, 2u-1]. - tensor.uniform_(2 * l - 1, 2 * u - 1) - - # Use inverse cdf transform for normal distribution to get truncated - # standard normal - tensor.erfinv_() - - # Transform to proper mean, std - tensor.mul_(std * math.sqrt(2.0)) - tensor.add_(mean) - - # Clamp to ensure it's in the proper range - tensor.clamp_(min=a, max=b) - return tensor - - -def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): - # type: (Tensor, float, float, float, float) -> Tensor - r"""Fills the input Tensor with values drawn from a truncated - Args: - normal distribution. The values are effectively drawn from the normal distribution :math:`\mathcal{N}(\text{mean}, - \text{std}^2)` with values outside :math:`[a, b]` redrawn until they are within the bounds. The method used for - generating the random values works best when :math:`a \leq \text{mean} \leq b`. 
- tensor: an n-dimensional `torch.Tensor` mean: the mean of the normal distribution std: the standard deviation - of the normal distribution a: the minimum cutoff value b: the maximum cutoff value - Examples: - >>> w = torch.empty(3, 5) >>> nn.init.trunc_normal_(w) - """ - return _no_grad_trunc_normal_(tensor, mean, std, a, b) - - -def window_partition(x, window_size): - """ - Args: - x: (B, H, W, C) - window_size (int): window size - Returns: - windows: (num_windows*B, window_size, window_size, C) - """ - B, H, W, C = x.shape - x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) - windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) - return windows - - -def window_reverse(windows, window_size, H, W): - """ - Args: - windows: (num_windows*B, window_size, window_size, C) - window_size (int): Window size - H (int): Height of image - W (int): Width of image - Returns: - x: (B, H, W, C) - """ - B = int(windows.shape[0] / (H * W / window_size / window_size)) - x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) - x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) - return x + hidden_states = hidden_states.flatten(2).transpose(1, 2) + hidden_states = self.norm(hidden_states) + return hidden_states # Copied from transformers.models.swin.modeling_swin.SwinSelfAttention with Swin->CLAPAudio @@ -757,19 +744,6 @@ def forward( return layer_outputs -# contrastive loss function, adapted from -# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/Clip.html -def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: - return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) - - -# Copied from transformers.models.clip.modeling_clip.clip_loss with clip->clap -def clap_loss(similarity: torch.Tensor) -> torch.Tensor: - caption_loss = contrastive_loss(similarity) - image_loss = contrastive_loss(similarity.t()) - return (caption_loss + image_loss) / 2.0 - - @dataclass # Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->CLAP class CLAPVisionModelOutput(ModelOutput): @@ -1752,8 +1726,8 @@ def get_text_features( ```python >>> from transformers import AutoTokenizer, CLAPModel - >>> model = CLAPModel.from_pretrained("laion-ai/base") - >>> tokenizer = AutoTokenizer.from_pretrained("laion-ai/base") + >>> model = CLAPModel.from_pretrained("laion-ai/clap-htst-unfused-base") + >>> tokenizer = AutoTokenizer.from_pretrained("laion-ai/clap-htst-unfused-base") >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") >>> text_features = model.get_text_features(**inputs) @@ -1833,8 +1807,8 @@ def forward( >>> import requests >>> from transformers import AutoProcessor, CLAPModel - >>> model = CLAPModel.from_pretrained("laion-ai/base") - >>> processor = AutoProcessor.from_pretrained("laion-ai/base") + >>> model = CLAPModel.from_pretrained("laion-ai/clap-htst-unfused-base") + >>> processor = AutoProcessor.from_pretrained("laion-ai/clap-htst-unfused-base") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) @@ -1946,8 +1920,8 @@ def forward( ```python >>> from transformers import AutoTokenizer, CLAPTextModelWithProjection - >>> model = CLAPTextModelWithProjection.from_pretrained("laion-ai/base") - >>> tokenizer = AutoTokenizer.from_pretrained("laion-ai/base") + >>> model = 
CLAPTextModelWithProjection.from_pretrained("laion-ai/clap-htst-unfused-base") + >>> tokenizer = AutoTokenizer.from_pretrained("laion-ai/clap-htst-unfused-base") >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") From a38a0163f80a510c512a1bea97539dc83909ccfe Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 6 Feb 2023 09:36:03 +0000 Subject: [PATCH 045/197] `make fixup` --- src/transformers/models/clap/modeling_clap.py | 97 ++++++++++++------- .../clap/test_feature_extraction_clap.py | 9 +- 2 files changed, 68 insertions(+), 38 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 05e308ab425d..88d9e1452737 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -54,9 +54,10 @@ # Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/utils.py#L176 def do_mixup(hidden_states, mixup_lambda): """ - MIXUP is a data augmentation method, proposed by Hongyi Zhang et al on 25 Oct. 2017. https://arxiv.org/abs/1710.09412 - Based on the mixing ratio sampled from the Beta distribution, it is a method of expanding data by mixing both input and output. - By using this, it is said that generalization performance improves because the decision boundary becomes smooth. + MIXUP is a data augmentation method, proposed by Hongyi Zhang et al on 25 Oct. 2017. + https://arxiv.org/abs/1710.09412 Based on the mixing ratio sampled from the Beta distribution, it is a method of + expanding data by mixing both input and output. By using this, it is said that generalization performance improves + because the decision boundary becomes smooth. Args: hidden_states: (`torch.FloatTensor` of shape (batch_size, seq_length, hidden_size)) @@ -65,7 +66,8 @@ def do_mixup(hidden_states, mixup_lambda): Mixing ratio sampled from the Beta distribution """ out = ( - hidden_states.transpose(0, -1) * mixup_lambda + torch.flip(hidden_states, dims=[0]).transpose(0, -1) * (1 - mixup_lambda) + hidden_states.transpose(0, -1) * mixup_lambda + + torch.flip(hidden_states, dims=[0]).transpose(0, -1) * (1 - mixup_lambda) ).transpose(0, -1) return out @@ -73,8 +75,7 @@ def do_mixup(hidden_states, mixup_lambda): # Adapted from: https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/utils.py#L191 def interpolate(hidden_states, ratio): """ - Interpolate data in time domain. This is used to compensate the - resolution reduction in downsampling of a CNN. + Interpolate data in time domain. This is used to compensate the resolution reduction in downsampling of a CNN. Args: hidden_states: (`torch.FloatTensor` of shape (batch_size, time_steps, classes_num)) @@ -91,7 +92,8 @@ def interpolate(hidden_states, ratio): # Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/htsat.py#L249 def window_partition(hidden_states, window_size): """ - Returns the resized hidden states. The output shape should be `(batch_size * num_windows, window_size, window_size, num_channels)` + Returns the resized hidden states. 
The output shape should be `(batch_size * num_windows, window_size, window_size, + num_channels)` Args: hidden_states: (`torch.FloatTensor` of shape `(batch_size, height, width, num_channels)`) @@ -100,8 +102,10 @@ def window_partition(hidden_states, window_size): Window size """ batch_size, height, width, num_channels = hidden_states.shape - - hidden_states = hidden_states.view(batch_size, height // window_size, window_size, width // window_size, window_size, num_channels) + + hidden_states = hidden_states.view( + batch_size, height // window_size, window_size, width // window_size, window_size, num_channels + ) windows = hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels) return windows @@ -191,21 +195,25 @@ class CLAPTextModelOutput(ModelOutput): class SwinEncoderOutput(ModelOutput): """ Swin encoder's outputs, with potential hidden states and attentions. + Args: last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. + Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of shape `(batch_size, hidden_size, height, width)`. + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to include the spatial dimensions. """ @@ -279,8 +287,8 @@ def to_tuple(self) -> Tuple[Any]: # Adapted from transformers.models.swin.modeling_swin.SwinDropPath class CLAPDropPath(nn.Module): """ - Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is a - slightly refactored version of the `SwinDropPath` implementation. + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is a slightly + refactored version of the `SwinDropPath` implementation. """ def __init__(self, drop_prob=None): @@ -400,7 +408,7 @@ def forward(self, hidden_states, is_longer_idx=None): raise ValueError( f"Input image size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
) - + global_hidden_states = self.proj(global_hidden_states) output_width = global_hidden_states.size(-1) if len(is_longer_idx) > 0: @@ -410,18 +418,38 @@ def forward(self, hidden_states, is_longer_idx=None): local_hidden_states = local_hidden_states.view(batch_size * num_channels, 1, height, width) local_hidden_states = self.mel_conv2d(local_hidden_states) - local_hidden_states = local_hidden_states.view(batch_size, num_channels, local_hidden_states.size(1), local_hidden_states.size(2), local_hidden_states.size(3)) + local_hidden_states = local_hidden_states.view( + batch_size, + num_channels, + local_hidden_states.size(1), + local_hidden_states.size(2), + local_hidden_states.size(3), + ) local_hidden_states = local_hidden_states.permute((0, 2, 3, 1, 4)).contiguous().flatten(3) output_batch_size, output_num_channels, output_height, _ = local_hidden_states.size() - + if local_hidden_states.size(-1) < output_width: local_hidden_states = torch.cat( - [local_hidden_states, torch.zeros((output_batch_size, output_num_channels, output_height, output_width - local_hidden_states.size(-1)), device=global_hidden_states.device)], dim=-1 + [ + local_hidden_states, + torch.zeros( + ( + output_batch_size, + output_num_channels, + output_height, + output_width - local_hidden_states.size(-1), + ), + device=global_hidden_states.device, + ), + ], + dim=-1, ) else: local_hidden_states = local_hidden_states[:, :, :, :output_width] - global_hidden_states[is_longer_idx] = self.fusion_model(global_hidden_states[is_longer_idx], local_hidden_states) + global_hidden_states[is_longer_idx] = self.fusion_model( + global_hidden_states[is_longer_idx], local_hidden_states + ) hidden_states = global_hidden_states else: _, _, height, width = hidden_states.shape @@ -620,8 +648,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states -# Copied from transformers.models.swin.modeling_swin.SwinLayer with Swin->CLAPAudio, SwinDropPath->CLAPDropPath -class CLAPAudioSwinLayer(nn.Module): +# Copied from transformers.models.swin.modeling_swin.SwinLayer with SwinDropPath->CLAPDropPath, Swin->CLAPAudio +class CLAPAudioLayer(nn.Module): def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward @@ -1955,25 +1983,24 @@ def forward( ) -# Copied from transformers.models.swin.modeling_swin with Swin->CLAPAudio -class CLAPAudioLayer(nn.Module): +# Copied from transformers.models.swin.modeling_swin.SwinStage with Swin->CLAPAudio +class CLAPAudioStage(nn.Module): def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample): super().__init__() self.config = config self.dim = dim - self.input_resolution = input_resolution self.blocks = nn.ModuleList( [ - CLAPAudioSwinLayer( + CLAPAudioLayer( config=config, dim=dim, input_resolution=input_resolution, num_heads=num_heads, shift_size=0 if (i % 2 == 0) else config.window_size // 2, -) + ) for i in range(depth) -] - ) + ] + ) # patch merging layer if downsample is not None: @@ -2021,6 +2048,7 @@ def forward( class CLAPAudioPatchMerging(nn.Module): """ Patch Merging Layer. + Args: input_resolution (`Tuple[int]`): Resolution of input feature. 
@@ -2071,7 +2099,6 @@ def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int] return input_feature - class CLAPAudioEncoder(nn.Module): def __init__(self, config): super().__init__() @@ -2090,9 +2117,11 @@ def __init__(self, config): dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))] + self.input_resolutions = [(grid_size[0] // (2**i), grid_size[1] // (2**i)) for i in range(self.num_layers)] + self.layers = nn.ModuleList( [ - CLAPAudioLayer( + CLAPAudioStage( config=config, dim=int(config.embed_dim * 2**i_layer), input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)), @@ -2100,7 +2129,7 @@ def __init__(self, config): num_heads=config.num_heads[i_layer], drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], downsample=CLAPAudioPatchMerging if (i_layer < self.num_layers - 1) else None, - ) + ) for i_layer in range(self.num_layers) ] ) @@ -2113,7 +2142,6 @@ def __init__(self, config): self.avgpool = nn.AdaptiveAvgPool1d(1) - SF = config.spec_size // (2 ** (len(config.depths) - 1)) // self.patch_embed.patch_stride[0] // self.freq_ratio self.tscam_conv = nn.Conv2d( in_channels=self.num_features, out_channels=config.num_classes, kernel_size=(SF, 3), padding=(0, 1) @@ -2164,7 +2192,7 @@ def forward( self, input_features, head_mask: Optional[torch.FloatTensor] = None, - is_longer: Optional[torch.FloatTensor]=None, + is_longer: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, output_hidden_states_before_downsampling: Optional[bool] = False, @@ -2185,13 +2213,14 @@ def forward( _, _, frames_num, _ = hidden_states.shape - hidden_states = self.patch_embed(hidden_states, is_longer_list_idx) all_hidden_states = () if output_hidden_states else None all_reshaped_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None + input_dimensions = None + if output_hidden_states: batch_size, _, hidden_size = hidden_states.shape # rearrange b (h w) c -> b c h w @@ -2203,7 +2232,7 @@ def forward( for i, layer_module in enumerate(self.layers): layer_head_mask = head_mask[i] if head_mask is not None else None - input_dimensions = layer_module.input_resolution + input_dimensions = self.input_resolutions[i] if self.gradient_checkpointing and self.training: @@ -2266,7 +2295,7 @@ def custom_forward(*inputs): hidden_states = hidden_states.reshape(batch_size, n_channels, n_frequencies // c_freq_bin, c_freq_bin, n_temp) hidden_states = ( hidden_states.permute(0, 1, 3, 2, 4).contiguous().reshape(batch_size, n_channels, c_freq_bin, -1) - ) + ) # get latent_output fine_grained_latent_output = torch.mean(hidden_states, dim=2) fine_grained_latent_output = interpolate( @@ -2296,4 +2325,4 @@ def custom_forward(*inputs): clipwise_output=torch.sigmoid(hidden_states), fine_grained_embedding=fine_grained_latent_output, embedding=latent_output, - ) \ No newline at end of file + ) diff --git a/tests/models/clap/test_feature_extraction_clap.py b/tests/models/clap/test_feature_extraction_clap.py index 75b049b4cecd..aa4a396b72b8 100644 --- a/tests/models/clap/test_feature_extraction_clap.py +++ b/tests/models/clap/test_feature_extraction_clap.py @@ -37,6 +37,7 @@ global_rng = random.Random() + # Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list def floats_list(shape, scale=1.0, rng=None, name=None): """Creates a random float32 tensor""" @@ -51,6 +52,7 @@ def floats_list(shape, 
scale=1.0, rng=None, name=None): return values + @require_torch @require_torchaudio # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTester with Whisper->CLAP @@ -225,7 +227,6 @@ def integration_test_rand_trunc(self): input_features = feaure_extractor(input_speech, return_tensors="pt").input_features self.assertTrue(torch.allclose(input_features[0, 0, :30], EXPECTED_INPUT_FEATURES, atol=1e-4)) - def integration_test_fusion(self): # fmt: off EXPECTED_INPUT_FEATURES = torch.tensor( @@ -240,7 +241,7 @@ def integration_test_fusion(self): input_speech = self._load_datasamples(1) feaure_extractor = CLAPFeatureExtractor() - input_features = feaure_extractor(input_speech, return_tensors="pt", truncation = "rand_trunc").input_features + input_features = feaure_extractor(input_speech, return_tensors="pt", truncation="rand_trunc").input_features self.assertTrue(torch.allclose(input_features[0, 0, :30], EXPECTED_INPUT_FEATURES, atol=1e-4)) - - # TODO test fusion with a longer audio \ No newline at end of file + + # TODO test fusion with a longer audio From f958fa192d0f15f7c65b2f997f958ee3c4a1f96a Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 6 Feb 2023 09:46:04 +0000 Subject: [PATCH 046/197] make fixup --- docs/source/en/serialization.mdx | 1 - src/transformers/models/clap/modeling_clap.py | 56 +++---------------- 2 files changed, 7 insertions(+), 50 deletions(-) diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index 2790a87ea6e0..7079a91f40c3 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -64,7 +64,6 @@ Ready-made configurations include the following architectures: - BLOOM - CamemBERT - Chinese-CLIP -- clap - CLIP - CodeGen - Conditional DETR diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 88d9e1452737..bb9aa951012f 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1469,49 +1469,22 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return pooled_output -class CLAPTextPreTrainedModel(PreTrainedModel): +class CLAPPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. 
""" - config_class = CLAPTextConfig - base_model_prefix = "claptext" + config_class = CLAPConfig + base_model_prefix = "clap" supports_gradient_checkpointing = True - _no_split_modules = [] + _keys_to_ignore_on_load_missing = [r"position_ids", r"logit_scale_a", r"logit_scale_t"] - # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights def _init_weights(self, module): - """Initialize the weights""" - if isinstance(module, nn.Linear): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, CLAPTextEncoder): - module.gradient_checkpointing = value - - def update_keys_to_ignore(self, config, del_keys_to_ignore): - """Remove some keys from ignore list""" - if not config.tie_word_embeddings: - # must make a new list, or the class variable gets modified! - self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] - self._keys_to_ignore_on_load_missing = [ - k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore - ] + pass -class CLAPTextModel(CLAPTextPreTrainedModel): +class CLAPTextModel(CLAPPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of @@ -1679,21 +1652,6 @@ def forward( ) -class CLAPPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. 
- """ - - config_class = CLAPConfig - base_model_prefix = "clap" - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids", r"logit_scale_a", r"logit_scale_t"] - - def _init_weights(self, module): - pass - - @add_start_docstrings(CLAP_START_DOCSTRING) class CLAPModel(CLAPPreTrainedModel): config_class = CLAPConfig @@ -2124,7 +2082,7 @@ def __init__(self, config): CLAPAudioStage( config=config, dim=int(config.embed_dim * 2**i_layer), - input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)), + input_resolution=self.input_resolutions[i_layer], depth=config.depths[i_layer], num_heads=config.num_heads[i_layer], drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], From c218e3cbe8e80ae241a3dd4ba1a741848de7129f Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 6 Feb 2023 09:47:01 +0000 Subject: [PATCH 047/197] fix feature extractor --- .../models/clap/feature_extraction_clap.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 4bde80242d2e..9d9e1798ae73 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -234,9 +234,12 @@ def _get_audio_features(self, waveform: np.array, max_length, truncation, paddin longer = False # only use repeat as a new possible value for padding. you repeat the audio before applying the usual max_length padding if waveform.shape[0] < max_length: + if padding == "repeat": + n_repeat = int(max_length / len(waveform)) + waveform = np.stack(np.tile(waveform, n_repeat + 1))[:max_length] if padding == "repeatpad": n_repeat = int(max_length / len(waveform)) - waveform = waveform.repeat(n_repeat) + waveform = np.stack(np.tile(waveform, n_repeat)) waveform = np.pad(waveform, (0, max_length - waveform.shape[0]), mode="constant", constant_values=0) if truncation == "fusion": @@ -251,10 +254,10 @@ def __call__( self, raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]], truncation: str = "fusion", + padding: Optional[str] = "repeatpad", pad_to_multiple_of: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_attention_mask: Optional[bool] = None, - padding: Optional[str] = "repeatpad", max_length: Optional[int] = None, sampling_rate: Optional[int] = None, **kwargs @@ -319,11 +322,11 @@ def __call__( ) if is_batched: - raw_speech = [np.asarray(speech, dtype=np.float32) for speech in raw_speech] + raw_speech = [np.asarray(speech, dtype=np.float64) for speech in raw_speech] elif not is_batched and not isinstance(raw_speech, np.ndarray): - raw_speech = np.asarray(raw_speech, dtype=np.float32) + raw_speech = np.asarray(raw_speech, dtype=np.float64) elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64): - raw_speech = raw_speech.astype(np.float32) + raw_speech = raw_speech.astype(np.float64) # always return batch if not is_batched: @@ -353,7 +356,7 @@ def __call__( is_longer[rand_idx] = True if isinstance(input_mel[0], List): - input_mel = [np.asarray(mel, dtype=np.float32) for feature in input_mel] + input_mel = [np.asarray(mel, dtype=np.float64) for feature in input_mel] input_features = {"input_features": input_mel, "is_longer": is_longer} input_features = BatchFeature(input_features) From 65cfca45b3f51ef8ec0e247fd0272d4d821ef643 Mon Sep 17 00:00:00 2001 From: Arthur Zucker 
Date: Mon, 6 Feb 2023 09:47:30 +0000 Subject: [PATCH 048/197] update test feature extractor --- tests/models/clap/test_feature_extraction_clap.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/models/clap/test_feature_extraction_clap.py b/tests/models/clap/test_feature_extraction_clap.py index 75b049b4cecd..de48522dc19e 100644 --- a/tests/models/clap/test_feature_extraction_clap.py +++ b/tests/models/clap/test_feature_extraction_clap.py @@ -51,6 +51,7 @@ def floats_list(shape, scale=1.0, rng=None, name=None): return values + @require_torch @require_torchaudio # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTester with Whisper->CLAP @@ -225,7 +226,6 @@ def integration_test_rand_trunc(self): input_features = feaure_extractor(input_speech, return_tensors="pt").input_features self.assertTrue(torch.allclose(input_features[0, 0, :30], EXPECTED_INPUT_FEATURES, atol=1e-4)) - def integration_test_fusion(self): # fmt: off EXPECTED_INPUT_FEATURES = torch.tensor( @@ -240,7 +240,7 @@ def integration_test_fusion(self): input_speech = self._load_datasamples(1) feaure_extractor = CLAPFeatureExtractor() - input_features = feaure_extractor(input_speech, return_tensors="pt", truncation = "rand_trunc").input_features + input_features = feaure_extractor(input_speech, return_tensors="pt", truncation="rand_trunc").input_features self.assertTrue(torch.allclose(input_features[0, 0, :30], EXPECTED_INPUT_FEATURES, atol=1e-4)) - - # TODO test fusion with a longer audio \ No newline at end of file + + # TODO test fusion with a longer audio From 50613fed68ef37aba29ebb09596c958b0103b2e5 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 6 Feb 2023 10:02:23 +0000 Subject: [PATCH 049/197] `make fixup` --- docs/source/en/serialization.mdx | 1 - .../feature_extraction_sequence_utils.py | 22 +++++++++---------- utils/check_repo.py | 2 ++ 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index 85d3f6bafa21..7079a91f40c3 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -64,7 +64,6 @@ Ready-made configurations include the following architectures: - BLOOM - CamemBERT - Chinese-CLIP -- CLAP - CLIP - CodeGen - Conditional DETR diff --git a/src/transformers/feature_extraction_sequence_utils.py b/src/transformers/feature_extraction_sequence_utils.py index 7bd9cc7a5c20..89d19dd88c54 100644 --- a/src/transformers/feature_extraction_sequence_utils.py +++ b/src/transformers/feature_extraction_sequence_utils.py @@ -370,13 +370,13 @@ def _get_padding_strategies(self, padding=False, max_length=None): @staticmethod def hz_to_mel(freq: float, mel_scale: str = "htk") -> float: - r"""Convert Hz to Mels. + """Convert Hz to Mels. Args: freqs (float): Frequencies in Hz mel_scale (str, *optional*): - Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + Scale to use: `htk` or `slaney`. (Default: `htk`) Returns: mels (float): Frequency in Mels @@ -410,7 +410,7 @@ def mel_to_hz(mels: np.array, mel_scale: str = "htk") -> np.array: Args: mels (np.array): Mel frequencies - mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + mel_scale (str, optional): Scale to use: `htk` or `slaney`. 
(Default: `htk`) Returns: freqs (np.array): Mels converted in Hz @@ -473,12 +473,12 @@ def get_mel_filter_banks( norm: Optional[str] = None, mel_scale: str = "htk", ) -> np.array: - r""" + """ Create a frequency bin conversion matrix used to obtain the Mel Frequency Cepstral Coefficient. This is called - a `mel filter bank`, and various implementation exist, which differ in the number of filters, the shape of the + a *mel filter bank*, and various implementation exist, which differ in the number of filters, the shape of the filters, the way the filters are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these features is to approximate the non-linear human perception of the variation in pitch - with respect to the frequency. This code is heavily inspired from the `torchaudio` implementation, refer to XXX + with respect to the frequency. This code is heavily inspired from the *torchaudio* implementation, refer to XXX for more details. @@ -508,14 +508,14 @@ def get_mel_filter_banks( Sample rate of the audio waveform norm (str or None, optional): If "slaney", divide the triangular mel weights by the width of the mel band (area normalization). - (Default: ``None``) + (Default: `None`) mel_scale (str, optional): - Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + Scale to use: `htk` or `slaney`. (Default: `htk`) Returns: - Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_mels``) meaning number of frequencies - to highlight/apply to x the number of filterbanks. Each column is a filterbank so that assuming there is a - matrix A of size (..., ``n_freqs``), the applied result would be ``A * melscale_fbanks(A.size(-1), ...)``. + Tensor: Triangular filter banks (fb matrix) of size (`n_freqs`, `n_mels`) meaning number of frequencies to + highlight/apply to x the number of filterbanks. Each column is a filterbank so that assuming there is a + matrix A of size (..., `n_freqs`), the applied result would be `A * melscale_fbanks(A.size(-1), ...)`. """ diff --git a/utils/check_repo.py b/utils/check_repo.py index d8756fe59df7..d8115d629e61 100755 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -157,6 +157,8 @@ # should **not** be the rule. 
IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ # models to ignore for model xxx mapping + "CLAPTextModel", + "CLAPTextModelWithProjection", "GitVisionModel", "GraphormerModel", "GraphormerForGraphClassification", From b1f46dc222d0b3e48aa0431f0b7c6c599be39c55 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 6 Feb 2023 10:13:24 +0000 Subject: [PATCH 050/197] clean up config --- .../models/clap/configuration_clap.py | 31 ------------------- 1 file changed, 31 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 74b057b424df..a7b56610c942 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -245,24 +245,10 @@ def __init__( enable_patch_layer_norm=True, drop_path_rate=0.0, attention_probs_dropout_prob=0.0, - swin_drop_path_rate=0.1, qkv_bias=True, - swin_norm_before_mlp="ln", mlp_ratio=4.0, - swin_use_checkpoint=False, - swin_absolute_positional_embedding=False, - swin_hidden_act="gelu", aff_block_r=4, enable_patch_fusion=False, - spectrogram_window_size=1024, - spectrogram_freeze_parameters=True, - spectrogram_ref=1.0, - spectrogram_amin=1e-10, - spectrogram_top_db=None, - spectrogram_time_drop_width=64, - spectrogram_time_stripes_num=2, - spectrogram_freq_drop_width=8, - spectrogram_freq_stripes_num=2, layer_norm_eps=1e-5, **kwargs ): @@ -296,28 +282,11 @@ def __init__( self.enable_patch_layer_norm = enable_patch_layer_norm self.drop_path_rate = drop_path_rate self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.swin_drop_path_rate = swin_drop_path_rate self.qkv_bias = qkv_bias - self.swin_norm_before_mlp = swin_norm_before_mlp self.mlp_ratio = mlp_ratio - self.swin_use_checkpoint = swin_use_checkpoint - self.swin_absolute_positional_embedding = swin_absolute_positional_embedding self.patch_embed_input_channels = patch_embed_input_channels - self.swin_hidden_act = swin_hidden_act self.aff_block_r = aff_block_r self.enable_patch_fusion = enable_patch_fusion - self.spectrogram_window_size = spectrogram_window_size - # self.spectrogram_window = spectrogram_window - # self.spectrogram_center = spectrogram_center - # self.spectrogram_pad_mode = spectrogram_pad_mode - self.spectrogram_freeze_parameters = spectrogram_freeze_parameters - self.spectrogram_ref = spectrogram_ref - self.spectrogram_amin = spectrogram_amin - self.spectrogram_top_db = spectrogram_top_db - self.spectrogram_time_drop_width = spectrogram_time_drop_width - self.spectrogram_time_stripes_num = spectrogram_time_stripes_num - self.spectrogram_freq_drop_width = spectrogram_freq_drop_width - self.spectrogram_freq_stripes_num = spectrogram_freq_stripes_num self.layer_norm_eps = layer_norm_eps @classmethod From 51150572b9c2623ae6ac3109a618c12b58b1e068 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 6 Feb 2023 10:16:04 +0000 Subject: [PATCH 051/197] more clean up --- src/transformers/models/clap/configuration_clap.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index a7b56610c942..58bf92aeb02b 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -216,14 +216,8 @@ class CLAPAudioConfig(PretrainedConfig): def __init__( self, - sample_rate=48000, - audio_length=1024, window_size=8, - hop_size=1024, - fmin=50, - fmax=14000, mel_bins=64, - 
clip_samples=480000, spec_size=256, hidden_act="gelu", patch_size=4, @@ -253,14 +247,8 @@ def __init__( **kwargs ): super().__init__(**kwargs) - self.sample_rate = sample_rate - self.audio_length = audio_length self.window_size = window_size - self.hop_size = hop_size - self.fmin = fmin - self.fmax = fmax self.mel_bins = mel_bins - self.clip_samples = clip_samples self.spec_size = spec_size self.patch_size = patch_size self.patch_stride = patch_stride From 833c0612e6b588d8e284d99ae9dc4e6d2ceb26b0 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 6 Feb 2023 10:22:01 +0000 Subject: [PATCH 052/197] more cleanup --- src/transformers/models/clap/modeling_clap.py | 31 +------------------ 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index bb9aa951012f..2ee4d7cdcc6f 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -772,36 +772,6 @@ def forward( return layer_outputs -@dataclass -# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->CLAP -class CLAPVisionModelOutput(ModelOutput): - """ - Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. - - Args: - image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): - The image embeddings obtained by applying the projection layer to the pooler_output. - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - - image_embeds: Optional[torch.FloatTensor] = None - last_hidden_state: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - CLAP_START_DOCSTRING = r""" This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads @@ -950,6 +920,7 @@ def forward(self, hidden_states): return hidden_states +# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->CLAPText class CLAPTextEmbeddings(nn.Module): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. 
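A minimal sketch of what constructing the slimmed-down audio config looks like after the cleanup patches above (050–052). The parameter names and defaults are the ones kept in the `CLAPAudioConfig.__init__` signature shown in these diffs; the import path assumes the in-tree `models/clap` layout added earlier in this series, so treat this as illustrative of the patch state rather than the final public API:

```python
from transformers.models.clap.configuration_clap import CLAPAudioConfig

# Only the arguments kept in the diffs above remain as dedicated keywords; the
# removed spectrogram_* and swin_* options are no longer explicit config
# attributes set by __init__.
audio_config = CLAPAudioConfig(
    window_size=8,
    mel_bins=64,
    spec_size=256,
    drop_path_rate=0.0,
    qkv_bias=True,
    mlp_ratio=4.0,
    layer_norm_eps=1e-5,
)
assert audio_config.mel_bins == 64 and audio_config.spec_size == 256
```
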
From 8ee3051dbe3e5785adab13da6835c711eb5770d5 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 6 Feb 2023 13:22:34 +0000 Subject: [PATCH 053/197] update tests --- .../clap/test_feature_extraction_clap.py | 103 +++++++++++++++--- 1 file changed, 88 insertions(+), 15 deletions(-) diff --git a/tests/models/clap/test_feature_extraction_clap.py b/tests/models/clap/test_feature_extraction_clap.py index aa4a396b72b8..9771f9845b14 100644 --- a/tests/models/clap/test_feature_extraction_clap.py +++ b/tests/models/clap/test_feature_extraction_clap.py @@ -210,38 +210,111 @@ def _load_datasamples(self, num_samples): return [x["array"] for x in speech_samples] - def integration_test_rand_trunc(self): + def integration_test_fusion(self): # fmt: off EXPECTED_INPUT_FEATURES = torch.tensor( [ - 0.1193, -0.0946, -0.1098, -0.0196, 0.0225, -0.0690, -0.1736, 0.0951, - 0.0971, -0.0817, -0.0702, 0.0162, 0.0260, 0.0017, -0.0192, -0.1678, - 0.0709, -0.1867, -0.0655, -0.0274, -0.0234, -0.1884, -0.0516, -0.0554, - -0.0274, -0.1425, -0.1423, 0.0837, 0.0377, -0.0854 + [ + -30.2194, -22.4424, -18.6442, -17.2452, -22.7392, -32.2576, -36.1404, + -35.6120, -29.6229, -29.0454, -32.2157, -36.7664, -29.4436, -26.7825, + -31.1811, -38.3918, -38.8749, -43.4485, -47.6236, -38.7528, -31.8574, + -39.0591, -41.3190, -32.3319, -31.4699, -33.4502, -36.7412, -34.5265, + -35.1091, -40.4518, -42.7346, -44.5909, -44.9747, -45.8328, -47.0772, + -46.2723, -44.3613, -48.6253, -44.9551, -43.8700, -44.6104, -48.0146, + -42.7614, -47.3587, -47.4369, -45.5018, -47.0198, -42.8759, -47.5056, + -47.1567, -49.2621, -49.5643, -48.4330, -48.8495, -47.2512, -40.8439, + -48.1234, -49.1218, -48.7222, -50.2399, -46.8487, -41.9921, -50.4015, + -50.7827 + ], + [ + -89.0141, -89.1411, -88.8096, -88.5480, -88.3481, -88.2038, + -88.1105, -88.0647, -88.0636, -88.1051, -88.1877, -88.1110, + -87.8613, -88.6679, -88.2685, -88.9684, -88.7977, -89.6264, + -89.9299, -90.3184, -91.1446, -91.9265, -92.7267, -93.6099, + -94.6395, -95.3243, -95.5923, -95.5773, -95.0889, -94.3354, + -93.5746, -92.9287, -92.4525, -91.9798, -91.8852, -91.7500, + -91.7259, -91.7561, -91.7959, -91.7070, -91.6914, -91.5019, + -91.0640, -90.0807, -88.7102, -87.0826, -85.5956, -84.4441, + -83.8461, -83.8605, -84.6702, -86.3900, -89.3073, -93.2926, + -96.3813, -97.3529, -100.0000, -99.6942, -92.2851, -87.9588, + -85.7214, -84.6807, -84.1940, -84.2021 + ], + [ + -51.6882, -50.6852, -50.8198, -51.7428, -53.0325, -54.1619, -56.4903, + -59.0314, -60.7996, -60.5164, -59.9680, -60.5393, -62.5796, -65.4166, + -65.6149, -65.1409, -65.7226, -67.9057, -72.5089, -82.3530, -86.3189, + -83.4241, -79.1279, -79.3384, -82.7335, -79.8316, -80.2167, -74.3638, + -71.3930, -75.3849, -74.5381, -71.4504, -70.3791, -71.4547, -71.8820, + -67.3885, -69.5686, -71.9852, -71.0307, -73.0053, -80.8802, -72.9227, + -63.8526, -60.3260, -59.6012, -57.8316, -61.0603, -67.3403, -67.1709, + -60.4967, -60.5079, -68.3345, -67.5213, -70.6416, -79.6219, -78.2198, + -74.6851, -69.5718, -69.4968, -70.6882, -66.8175, -73.8558, -74.3855, + -72.9405 + ] ] ) # fmt: on - + MEL_BIN = [963, 963, 161] input_speech = self._load_datasamples(1) feaure_extractor = CLAPFeatureExtractor() - input_features = feaure_extractor(input_speech, return_tensors="pt").input_features - self.assertTrue(torch.allclose(input_features[0, 0, :30], EXPECTED_INPUT_FEATURES, atol=1e-4)) + for padding, EXPECTED_VALUES, idx_in_mel in zip( + ["repeat", "repeatpad", None], EXPECTED_INPUT_FEATURES, MEL_BIN + ): - def integration_test_fusion(self): + input_features = 
feaure_extractor(input_speech, return_tensors="pt", padding=padding).input_features + self.assertTrue(torch.allclose(input_features[0, idx_in_mel], EXPECTED_VALUES, atol=1e-4)) + + def integration_test_rand_trunc(self): + # TODO in this case we should set the seed and use a longer audio to properly see the random truncation # fmt: off EXPECTED_INPUT_FEATURES = torch.tensor( [ - 0.1193, -0.0946, -0.1098, -0.0196, 0.0225, -0.0690, -0.1736, 0.0951, - 0.0971, -0.0817, -0.0702, 0.0162, 0.0260, 0.0017, -0.0192, -0.1678, - 0.0709, -0.1867, -0.0655, -0.0274, -0.0234, -0.1884, -0.0516, -0.0554, - -0.0274, -0.1425, -0.1423, 0.0837, 0.0377, -0.0854 + [ + -42.3330, -36.2735, -35.9231, -43.5947, -48.4525, -46.5227, -42.6477, + -47.2740, -51.4336, -50.0846, -51.8711, -50.4232, -47.4736, -54.2275, + -53.3947, -55.4904, -54.8750, -54.5510, -55.4156, -57.4395, -51.7385, + -55.9118, -57.7800, -63.2064, -67.0651, -61.4379, -56.4268, -54.8667, + -52.3487, -56.4418, -57.1842, -55.1005, -55.6366, -59.4395, -56.8604, + -56.4949, -61.6573, -61.0826, -60.3250, -63.7876, -67.4882, -60.2323, + -54.6886, -50.5369, -47.7656, -45.8909, -49.1273, -57.4141, -58.3201, + -51.9862, -51.4897, -59.2561, -60.4730, -61.2203, -69.3174, -69.7464, + -65.5861, -58.9921, -59.5610, -61.0584, -58.1149, -64.4045, -66.2622, + -64.4610 + ], + [ + -41.2298, -38.4211, -39.8834, -45.9950, -47.3839, -43.9849, -46.0371, + -52.5490, -56.6912, -51.8794, -50.1284, -49.7506, -53.9422, -63.2854, + -56.5754, -55.0469, -55.3181, -55.8115, -56.0058, -57.9215, -58.7597, + -59.1994, -59.2141, -64.4198, -73.5138, -64.4647, -59.3351, -54.5626, + -54.7508, -65.0230, -60.0270, -54.7644, -56.0108, -60.1531, -57.6879, + -56.3766, -63.3395, -65.3032, -61.5202, -63.0677, -68.4217, -60.6868, + -54.4619, -50.8533, -47.7200, -45.9197, -49.0961, -57.7621, -59.0750, + -51.9122, -51.4332, -59.4132, -60.3415, -61.6558, -70.7049, -69.7905, + -66.9104, -59.0324, -59.6138, -61.2023, -58.2169, -65.3837, -66.4425, + -64.4142 + ], + [ + -51.6882, -50.6852, -50.8198, -51.7428, -53.0325, -54.1619, -56.4903, + -59.0314, -60.7996, -60.5164, -59.9680, -60.5393, -62.5796, -65.4166, + -65.6149, -65.1409, -65.7226, -67.9057, -72.5089, -82.3530, -86.3189, + -83.4241, -79.1279, -79.3384, -82.7335, -79.8316, -80.2167, -74.3638, + -71.3930, -75.3849, -74.5381, -71.4504, -70.3791, -71.4547, -71.8820, + -67.3885, -69.5686, -71.9852, -71.0307, -73.0053, -80.8802, -72.9227, + -63.8526, -60.3260, -59.6012, -57.8316, -61.0603, -67.3403, -67.1709, + -60.4967, -60.5079, -68.3345, -67.5213, -70.6416, -79.6219, -78.2198, + -74.6851, -69.5718, -69.4968, -70.6882, -66.8175, -73.8558, -74.3855, + -72.9405 + ] ] ) # fmt: on input_speech = self._load_datasamples(1) feaure_extractor = CLAPFeatureExtractor() - input_features = feaure_extractor(input_speech, return_tensors="pt", truncation="rand_trunc").input_features - self.assertTrue(torch.allclose(input_features[0, 0, :30], EXPECTED_INPUT_FEATURES, atol=1e-4)) + for padding, EXPECTED_VALUES in zip(["repeat", "repeatpad", None], EXPECTED_INPUT_FEATURES): + input_features = feaure_extractor( + input_speech, return_tensors="pt", truncation="rand_trunc", padding=padding + ).input_features + self.assertTrue(torch.allclose(input_features[0, 0, :30], EXPECTED_VALUES, atol=1e-4)) # TODO test fusion with a longer audio From e472482b9521455cffc4b9c9e41ace5893db569a Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 6 Feb 2023 14:28:24 +0000 Subject: [PATCH 054/197] refactor tests and inits --- docs/source/en/model_doc/clap.mdx | 13 +- 
src/transformers/__init__.py | 12 +- src/transformers/models/clap/__init__.py | 12 +- .../models/clap/configuration_clap.py | 8 +- src/transformers/models/clap/modeling_clap.py | 12 +- src/transformers/utils/dummy_pt_objects.py | 4 +- tests/models/clap/test_modeling_clap.py | 26 +-- tests/models/clap/test_processor_clap.py | 161 ++++++------------ 8 files changed, 89 insertions(+), 159 deletions(-) diff --git a/docs/source/en/model_doc/clap.mdx b/docs/source/en/model_doc/clap.mdx index 5e49090a0ebd..b9c19a0b6682 100644 --- a/docs/source/en/model_doc/clap.mdx +++ b/docs/source/en/model_doc/clap.mdx @@ -54,11 +54,6 @@ The original code can be found [here](). [[autodoc]] CLAPTokenizerFast -## CLAPImageProcessor - -[[autodoc]] CLAPImageProcessor - - preprocess - ## CLAPFeatureExtractor [[autodoc]] CLAPFeatureExtractor @@ -84,13 +79,13 @@ The original code can be found [here](). [[autodoc]] CLAPTextModelWithProjection - forward -## CLAPVisionModelWithProjection +## CLAPAudioModelWithProjection -[[autodoc]] CLAPVisionModelWithProjection +[[autodoc]] CLAPAudioModelWithProjection - forward -## CLAPVisionModel +## CLAPAudioModel -[[autodoc]] CLAPVisionModel +[[autodoc]] CLAPAudioModel - forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 9106816930c6..96cd52de4a51 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -208,6 +208,7 @@ "CLAPAudioConfig", "CLAPConfig", "CLAPProcessor", + "CLAPFeatureExtractor" "CLAPTextConfig", "CLAPTokenizer", ], @@ -809,7 +810,6 @@ _import_structure["models.blip"].extend(["BlipImageProcessor"]) _import_structure["models.bridgetower"].append("BridgeTowerImageProcessor") _import_structure["models.chinese_clip"].extend(["ChineseCLIPFeatureExtractor", "ChineseCLIPImageProcessor"]) - _import_structure["models.clap"].extend(["CLAPFeatureExtractor", "CLAPImageProcessor"]) _import_structure["models.clip"].extend(["CLIPFeatureExtractor", "CLIPImageProcessor"]) _import_structure["models.conditional_detr"].extend( ["ConditionalDetrFeatureExtractor", "ConditionalDetrImageProcessor"] @@ -1237,8 +1237,8 @@ "CLAPPreTrainedModel", "CLAPTextModel", "CLAPTextModelWithProjection", - "CLAPVisionModel", - "CLAPVisionModelWithProjection", + "CLAPAudioModel", + "CLAPAudioModelWithProjection", ] ) _import_structure["models.clip"].extend( @@ -3703,6 +3703,7 @@ CLAPAudioConfig, CLAPConfig, CLAPProcessor, + CLAPFeatureExtractor, CLAPTextConfig, CLAPTokenizer, ) @@ -4236,7 +4237,6 @@ from .models.blip import BlipImageProcessor from .models.bridgetower import BridgeTowerImageProcessor from .models.chinese_clip import ChineseCLIPFeatureExtractor, ChineseCLIPImageProcessor - from .models.clap import CLAPFeatureExtractor, CLAPImageProcessor from .models.clip import CLIPFeatureExtractor, CLIPImageProcessor from .models.conditional_detr import ConditionalDetrFeatureExtractor, ConditionalDetrImageProcessor from .models.convnext import ConvNextFeatureExtractor, ConvNextImageProcessor @@ -4598,8 +4598,8 @@ CLAPPreTrainedModel, CLAPTextModel, CLAPTextModelWithProjection, - CLAPVisionModel, - CLAPVisionModelWithProjection, + CLAPAudioModel, + CLAPAudioModelWithProjection, ) from .models.clip import ( CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/clap/__init__.py b/src/transformers/models/clap/__init__.py index e6dd2d384758..d139567c222d 100644 --- a/src/transformers/models/clap/__init__.py +++ b/src/transformers/models/clap/__init__.py @@ -53,7 +53,6 @@ pass else: _import_structure["feature_extraction_clap"] = 
["CLAPFeatureExtractor"] - _import_structure["image_processing_clap"] = ["CLAPImageProcessor"] try: if not is_torch_available(): @@ -67,8 +66,8 @@ "CLAPPreTrainedModel", "CLAPTextModel", "CLAPTextModelWithProjection", - "CLAPVisionModel", - "CLAPVisionModelWithProjection", + "CLAPAudioModel", + "CLAPAudioModelWithProjection", ] if TYPE_CHECKING: @@ -91,13 +90,12 @@ from .tokenization_clap_fast import CLAPTokenizerFast try: - if not is_vision_available(): + if not is_torchvision_available(): #TODO this depencie will be removed raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: pass else: from .feature_extraction_clap import CLAPFeatureExtractor - from .image_processing_clap import CLAPImageProcessor try: if not is_torch_available(): @@ -111,8 +109,8 @@ CLAPPreTrainedModel, CLAPTextModel, CLAPTextModelWithProjection, - CLAPVisionModel, - CLAPVisionModelWithProjection, + CLAPAudioModel, + CLAPAudioModelWithProjection, ) else: diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 58bf92aeb02b..578ec609704f 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -162,7 +162,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], class CLAPAudioConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`CLAPVisionModel`]. It is used to instantiate a + This is the configuration class to store the configuration of a [`CLAPAudioModel`]. It is used to instantiate a CLAP vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLAP [laion-ai/base](https://huggingface.co/laion-ai/base) architecture. @@ -200,13 +200,13 @@ class CLAPAudioConfig(PretrainedConfig): Example: ```python - >>> from transformers import CLAPAudioConfig, CLAPVisionModel + >>> from transformers import CLAPAudioConfig, CLAPAudioModel >>> # Initializing a CLAPAudioConfig with laion-ai/base style configuration >>> configuration = CLAPAudioConfig() - >>> # Initializing a CLAPVisionModel (with random weights) from the laion-ai/base style configuration - >>> model = CLAPVisionModel(configuration) + >>> # Initializing a CLAPAudioModel (with random weights) from the laion-ai/base style configuration + >>> model = CLAPAudioModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 2ee4d7cdcc6f..44a5f3d5ce9f 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -262,11 +262,11 @@ class CLAPOutput(ModelOutput): text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by applying the projection layer to the pooled output of [`CLAPTextModel`]. image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): - The image embeddings obtained by applying the projection layer to the pooled output of [`CLAPVisionModel`]. + The image embeddings obtained by applying the projection layer to the pooled output of [`CLAPAudioModel`]. text_model_output(`BaseModelOutputWithPooling`): The output of the [`CLAPTextModel`]. vision_model_output(`BaseModelOutputWithPooling`): - The output of the [`CLAPVisionModel`]. 
+ The output of the [`CLAPAudioModel`]. """ loss: Optional[torch.FloatTensor] = None @@ -819,11 +819,11 @@ def forward( Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ -CLAP_VISION_INPUTS_DOCSTRING = r""" +CLAP_AUDIO_INPUTS_DOCSTRING = r""" Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using - [`AutoImageProcessor`]. See [`CLAPImageProcessor.__call__`] for details. + [`AutoFeatureExtractor`]. See [`CLAPFeatureExtractor.__call__`] for details. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. @@ -858,7 +858,7 @@ def forward( [What are position IDs?](../glossary#position-ids) pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using - [`AutoImageProcessor`]. See [`CLAPImageProcessor.__call__`] for details. + [`AutoFeatureExtractor`]. See [`CLAPFeatureExtractor.__call__`] for details. return_loss (`bool`, *optional*): Whether or not to return the contrastive loss. output_attentions (`bool`, *optional*): diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index b9864bed753c..eba7f792eb65 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1465,14 +1465,14 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class CLAPVisionModel(metaclass=DummyObject): +class CLAPAudioModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class CLAPVisionModelWithProjection(metaclass=DummyObject): +class CLAPAudioModelWithProjection(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 53465ca46730..683b0f744426 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -53,8 +53,8 @@ CLAPModel, CLAPTextModel, CLAPTextModelWithProjection, - CLAPVisionModel, - CLAPVisionModelWithProjection, + CLAPAudioModel, + CLAPAudioModelWithProjection, ) from transformers.models.clap.modeling_clap import CLAP_PRETRAINED_MODEL_ARCHIVE_LIST @@ -73,7 +73,7 @@ ) -class CLAPVisionModelTester: +class CLAPAudioModelTester: def __init__( self, parent, @@ -134,7 +134,7 @@ def get_config(self): ) def create_and_check_model(self, config, pixel_values): - model = CLAPVisionModel(config=config) + model = CLAPAudioModel(config=config) model.to(torch_device) model.eval() with torch.no_grad(): @@ -147,7 +147,7 @@ def create_and_check_model(self, config, pixel_values): self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) def create_and_check_model_with_projection(self, config, pixel_values): - model = CLAPVisionModelWithProjection(config=config) + model = CLAPAudioModelWithProjection(config=config) model.to(torch_device) model.eval() with torch.no_grad(): @@ -167,20 +167,20 @@ def prepare_config_and_inputs_for_common(self): @require_torch -class CLAPVisionModelTest(ModelTesterMixin, unittest.TestCase): +class 
CLAPAudioModelTest(ModelTesterMixin, unittest.TestCase): """ Here we also overwrite some of the tests of test_modeling_common.py, as CLAP does not use input_ids, inputs_embeds, attention_mask and seq_length. """ - all_model_classes = (CLAPVisionModel, CLAPVisionModelWithProjection) if is_torch_available() else () + all_model_classes = (CLAPAudioModel, CLAPAudioModelWithProjection) if is_torch_available() else () fx_compatible = False test_pruning = False test_resize_embeddings = False test_head_masking = False def setUp(self): - self.model_tester = CLAPVisionModelTester(self) + self.model_tester = CLAPAudioModelTester(self) self.config_tester = ConfigTester(self, config_class=CLAPAudioConfig, has_text_modality=False, hidden_size=37) def test_config(self): @@ -225,24 +225,24 @@ def test_training(self): def test_training_gradient_checkpointing(self): pass - @unittest.skip(reason="CLAPVisionModel has no base class and is not available in MODEL_MAPPING") + @unittest.skip(reason="CLAPAudioModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_from_base(self): pass - @unittest.skip(reason="CLAPVisionModel has no base class and is not available in MODEL_MAPPING") + @unittest.skip(reason="CLAPAudioModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_to_base(self): pass @slow def test_model_from_pretrained(self): for model_name in CLAP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = CLAPVisionModel.from_pretrained(model_name) + model = CLAPAudioModel.from_pretrained(model_name) self.assertIsNotNone(model) @slow def test_model_with_projection_from_pretrained(self): for model_name in CLAP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = CLAPVisionModelWithProjection.from_pretrained(model_name) + model = CLAPAudioModelWithProjection.from_pretrained(model_name) self.assertIsNotNone(model) self.assertTrue(hasattr(model, "visual_projection")) @@ -410,7 +410,7 @@ def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=Tru self.parent = parent self.text_model_tester = CLAPTextModelTester(parent, **text_kwargs) - self.vision_model_tester = CLAPVisionModelTester(parent, **vision_kwargs) + self.vision_model_tester = CLAPAudioModelTester(parent, **vision_kwargs) self.is_training = is_training def prepare_config_and_inputs(self): diff --git a/tests/models/clap/test_processor_clap.py b/tests/models/clap/test_processor_clap.py index 2aa90ea58175..1e8fae168705 100644 --- a/tests/models/clap/test_processor_clap.py +++ b/tests/models/clap/test_processor_clap.py @@ -22,134 +22,89 @@ import pytest from transformers import CLAPTokenizer, CLAPTokenizerFast -from transformers.models.clap.tokenization_clap import VOCAB_FILES_NAMES -from transformers.testing_utils import require_vision -from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available +from transformers.utils import is_torchvision_available +from transformers.testing_utils import require_sentencepiece, require_torch, require_torchaudio +from .test_feature_extraction_clap import floats_list -if is_vision_available(): - from PIL import Image +if is_torchvision_available(): + from transformers import CLAPFeatureExtractor, CLAPProcessor - from transformers import CLAPImageProcessor, CLAPProcessor +TRANSCRIBE = 50358 +NOTIMESTAMPS = 50362 -@require_vision + + +@require_torchaudio +@require_sentencepiece class CLAPProcessorTest(unittest.TestCase): def setUp(self): + self.checkpoint = "laionai/clap-tiny-hsat" self.tmpdirname = tempfile.mkdtemp() - # fmt: off - vocab 
= ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l", "w", "r", "t", "low", "er", "lowest", "newer", "wider", "", "<|startoftext|>", "<|endoftext|>"] - # fmt: on - vocab_tokens = dict(zip(vocab, range(len(vocab)))) - merges = ["#version: 0.2", "l o", "lo w", "e r", ""] - self.special_tokens_map = {"unk_token": ""} - - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: - fp.write("\n".join(merges)) - - image_processor_map = { - "do_resize": True, - "size": 20, - "do_center_crop": True, - "crop_size": 18, - "do_normalize": True, - "image_mean": [0.48145466, 0.4578275, 0.40821073], - "image_std": [0.26862954, 0.26130258, 0.27577711], - } - self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME) - with open(self.image_processor_file, "w", encoding="utf-8") as fp: - json.dump(image_processor_map, fp) - def get_tokenizer(self, **kwargs): - return CLAPTokenizer.from_pretrained(self.tmpdirname, **kwargs) + return CLAPTokenizer.from_pretrained(self.checkpoint, **kwargs) - def get_rust_tokenizer(self, **kwargs): - return CLAPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) - - def get_image_processor(self, **kwargs): - return CLAPImageProcessor.from_pretrained(self.tmpdirname, **kwargs) + def get_feature_extractor(self, **kwargs): + return CLAPFeatureExtractor.from_pretrained(self.checkpoint, **kwargs) def tearDown(self): shutil.rmtree(self.tmpdirname) - def prepare_image_inputs(self): - """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, - or a list of PyTorch tensors if one specifies torchify=True. 
- """ - - image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] - - image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] - - return image_inputs - def test_save_load_pretrained_default(self): - tokenizer_slow = self.get_tokenizer() - tokenizer_fast = self.get_rust_tokenizer() - image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + feature_extractor = self.get_feature_extractor() - processor_slow = CLAPProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) - processor_slow.save_pretrained(self.tmpdirname) - processor_slow = CLAPProcessor.from_pretrained(self.tmpdirname, use_fast=False) + processor = CLAPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) - processor_fast = CLAPProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) - processor_fast.save_pretrained(self.tmpdirname) - processor_fast = CLAPProcessor.from_pretrained(self.tmpdirname) + processor.save_pretrained(self.tmpdirname) + processor = CLAPProcessor.from_pretrained(self.tmpdirname) - self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab()) - self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab()) - self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab()) - self.assertIsInstance(processor_slow.tokenizer, CLAPTokenizer) - self.assertIsInstance(processor_fast.tokenizer, CLAPTokenizerFast) + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) + self.assertIsInstance(processor.tokenizer, CLAPTokenizer) - self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertIsInstance(processor_slow.image_processor, CLAPImageProcessor) - self.assertIsInstance(processor_fast.image_processor, CLAPImageProcessor) + self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) + self.assertIsInstance(processor.feature_extractor, CLAPFeatureExtractor) def test_save_load_pretrained_additional_features(self): - processor = CLAPProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) + processor = CLAPProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()) processor.save_pretrained(self.tmpdirname) tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) + feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0) processor = CLAPProcessor.from_pretrained( self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 ) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, CLAPTokenizerFast) + self.assertIsInstance(processor.tokenizer, CLAPTokenizer) - self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, CLAPImageProcessor) + self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.feature_extractor, CLAPFeatureExtractor) - def test_image_processor(self): - image_processor = self.get_image_processor() + def test_feature_extractor(self): + 
feature_extractor = self.get_feature_extractor() tokenizer = self.get_tokenizer() - processor = CLAPProcessor(tokenizer=tokenizer, image_processor=image_processor) + processor = CLAPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) - image_input = self.prepare_image_inputs() + raw_speech = floats_list((3, 1000)) - input_image_proc = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") + input_feat_extract = feature_extractor(raw_speech, return_tensors="np") + input_processor = processor(raw_speech, return_tensors="np") - for key in input_image_proc.keys(): - self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2) + for key in input_feat_extract.keys(): + self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) def test_tokenizer(self): - image_processor = self.get_image_processor() + feature_extractor = self.get_feature_extractor() tokenizer = self.get_tokenizer() - processor = CLAPProcessor(tokenizer=tokenizer, image_processor=image_processor) + processor = CLAPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) - input_str = "lower newer" + input_str = "This is a test string" encoded_processor = processor(text=input_str) @@ -158,28 +113,11 @@ def test_tokenizer(self): for key in encoded_tok.keys(): self.assertListEqual(encoded_tok[key], encoded_processor[key]) - def test_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = CLAPProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask", "pixel_values"]) - - # test if it raises when no input is passed - with pytest.raises(ValueError): - processor() - def test_tokenizer_decode(self): - image_processor = self.get_image_processor() + feature_extractor = self.get_feature_extractor() tokenizer = self.get_tokenizer() - processor = CLAPProcessor(tokenizer=tokenizer, image_processor=image_processor) + processor = CLAPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] @@ -189,14 +127,13 @@ def test_tokenizer_decode(self): self.assertListEqual(decoded_tok, decoded_processor) def test_model_input_names(self): - image_processor = self.get_image_processor() + feature_extractor = self.get_feature_extractor() tokenizer = self.get_tokenizer() - processor = CLAPProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() + processor = CLAPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) - inputs = processor(text=input_str, images=image_input) - - self.assertListEqual(list(inputs.keys()), processor.model_input_names) + self.assertListEqual( + processor.model_input_names, + feature_extractor.model_input_names, + msg="`processor` and `feature_extractor` model input names do not match", + ) From b0f63c2168f92e044220789b45de6c1e85ea423a Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 6 Feb 2023 14:29:52 +0000 Subject: [PATCH 055/197] removeCLAP vision config --- src/transformers/models/clap/configuration_clap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/clap/configuration_clap.py 
b/src/transformers/models/clap/configuration_clap.py index 578ec609704f..860d10de26d8 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -334,7 +334,7 @@ class CLAPConfig(PretrainedConfig): >>> # We can also initialize a CLAPConfig from a CLAPTextConfig and a CLAPAudioConfig >>> from transformers import CLAPTextConfig, CLAPAudioConfig - >>> # Initializing a CLAPText and CLAPVision configuration + >>> # Initializing a CLAPText and CLAPAudioConfig configuration >>> config_text = CLAPTextConfig() >>> config_vision = CLAPAudioConfig() From 21b2b94f80d48a2952aa207d63575f3cef281b3d Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 6 Feb 2023 14:36:43 +0000 Subject: [PATCH 056/197] remove CLAP from image procssing auto and dummy vision objects --- .../models/auto/image_processing_auto.py | 1 - src/transformers/utils/dummy_vision_objects.py | 15 --------------- 2 files changed, 16 deletions(-) diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 4c1a2112508d..8ecdbdc87f6c 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -42,7 +42,6 @@ ("blip", "BlipImageProcessor"), ("bridgetower", "BridgeTowerImageProcessor"), ("chinese_clip", "ChineseCLIPImageProcessor"), - ("clap", "CLAPImageProcessor"), ("clip", "CLIPImageProcessor"), ("clipseg", "ViTImageProcessor"), ("conditional_detr", "ConditionalDetrImageProcessor"), diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 822463ae3d39..7478cb3db25b 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -65,21 +65,6 @@ class ChineseCLIPImageProcessor(metaclass=DummyObject): def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) - -class CLAPFeatureExtractor(metaclass=DummyObject): - _backends = ["vision"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["vision"]) - - -class CLAPImageProcessor(metaclass=DummyObject): - _backends = ["vision"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["vision"]) - - class CLIPFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] From 506772e6b5be0f72620d82d813e4871c83503df7 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 6 Feb 2023 14:42:26 +0000 Subject: [PATCH 057/197] update inits --- src/transformers/__init__.py | 4 -- src/transformers/models/clap/__init__.py | 47 +++--------------------- 2 files changed, 5 insertions(+), 46 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 96cd52de4a51..81cc07856fca 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -210,7 +210,6 @@ "CLAPProcessor", "CLAPFeatureExtractor" "CLAPTextConfig", - "CLAPTokenizer", ], "models.clip": [ "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -675,7 +674,6 @@ _import_structure["models.blenderbot_small"].append("BlenderbotSmallTokenizerFast") _import_structure["models.bloom"].append("BloomTokenizerFast") _import_structure["models.camembert"].append("CamembertTokenizerFast") - _import_structure["models.clap"].append("CLAPTokenizerFast") _import_structure["models.clip"].append("CLIPTokenizerFast") _import_structure["models.codegen"].append("CodeGenTokenizerFast") _import_structure["models.convbert"].append("ConvBertTokenizerFast") @@ -3705,7 +3703,6 @@ 
CLAPProcessor, CLAPFeatureExtractor, CLAPTextConfig, - CLAPTokenizer, ) from .models.clip import ( CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -4133,7 +4130,6 @@ from .models.blenderbot_small import BlenderbotSmallTokenizerFast from .models.bloom import BloomTokenizerFast from .models.camembert import CamembertTokenizerFast - from .models.clap import CLAPTokenizerFast from .models.clip import CLIPTokenizerFast from .models.codegen import CodeGenTokenizerFast from .models.convbert import ConvBertTokenizerFast diff --git a/src/transformers/models/clap/__init__.py b/src/transformers/models/clap/__init__.py index d139567c222d..d8b47df7e9b1 100644 --- a/src/transformers/models/clap/__init__.py +++ b/src/transformers/models/clap/__init__.py @@ -17,13 +17,7 @@ # limitations under the License. from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_tokenizers_available, - is_torch_available, - is_vision_available, -) +from ...utils import OptionalDependencyNotAvailable,_LazyModule,is_torch_available _import_structure = { @@ -34,26 +28,10 @@ "CLAPOnnxConfig", "CLAPTextConfig", ], - "processing_clap": ["CLAPProcessor"], - "tokenization_clap": ["CLAPTokenizer"], + "feature_extraction_clap": ["CLAPFeatureExtractor"], + "processing_clap": ["CLAPProcessor"] } -try: - if not is_tokenizers_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_clap_fast"] = ["CLAPTokenizerFast"] - -try: - if not is_vision_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["feature_extraction_clap"] = ["CLAPFeatureExtractor"] - try: if not is_torch_available(): raise OptionalDependencyNotAvailable() @@ -80,23 +58,8 @@ ) from .processing_clap import CLAPProcessor from .tokenization_clap import CLAPTokenizer - - try: - if not is_tokenizers_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_clap_fast import CLAPTokenizerFast - - try: - if not is_torchvision_available(): #TODO this depencie will be removed - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .feature_extraction_clap import CLAPFeatureExtractor - + from .feature_extraction_clap import CLAPFeatureExtractor + try: if not is_torch_available(): raise OptionalDependencyNotAvailable() From 6efadbf548bd2ebf6f73e61296f5c155dd3ec881 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 6 Feb 2023 15:20:29 +0000 Subject: [PATCH 058/197] style --- src/transformers/__init__.py | 10 ++++----- src/transformers/models/clap/__init__.py | 12 +++++----- .../utils/dummy_vision_objects.py | 1 + .../clap/test_feature_extraction_clap.py | 22 +++++++++---------- tests/models/clap/test_modeling_clap.py | 4 ++-- 5 files changed, 25 insertions(+), 24 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 81cc07856fca..7265822dec1e 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1231,12 +1231,12 @@ _import_structure["models.clap"].extend( [ "CLAP_PRETRAINED_MODEL_ARCHIVE_LIST", + "CLAPAudioModel", + "CLAPAudioModelWithProjection", "CLAPModel", "CLAPPreTrainedModel", "CLAPTextModel", "CLAPTextModelWithProjection", - "CLAPAudioModel", - "CLAPAudioModelWithProjection", ] ) _import_structure["models.clip"].extend( @@ -3700,8 +3700,8 @@ CLAP_PRETRAINED_CONFIG_ARCHIVE_MAP, 
CLAPAudioConfig, CLAPConfig, - CLAPProcessor, CLAPFeatureExtractor, + CLAPProcessor, CLAPTextConfig, ) from .models.clip import ( @@ -4590,12 +4590,12 @@ ) from .models.clap import ( CLAP_PRETRAINED_MODEL_ARCHIVE_LIST, + CLAPAudioModel, + CLAPAudioModelWithProjection, CLAPModel, CLAPPreTrainedModel, CLAPTextModel, CLAPTextModelWithProjection, - CLAPAudioModel, - CLAPAudioModelWithProjection, ) from .models.clip import ( CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/clap/__init__.py b/src/transformers/models/clap/__init__.py index d8b47df7e9b1..7d7d9f0d8bca 100644 --- a/src/transformers/models/clap/__init__.py +++ b/src/transformers/models/clap/__init__.py @@ -17,7 +17,7 @@ # limitations under the License. from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable,_LazyModule,is_torch_available +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available _import_structure = { @@ -29,7 +29,7 @@ "CLAPTextConfig", ], "feature_extraction_clap": ["CLAPFeatureExtractor"], - "processing_clap": ["CLAPProcessor"] + "processing_clap": ["CLAPProcessor"], } try: @@ -56,10 +56,10 @@ CLAPOnnxConfig, CLAPTextConfig, ) + from .feature_extraction_clap import CLAPFeatureExtractor from .processing_clap import CLAPProcessor from .tokenization_clap import CLAPTokenizer - from .feature_extraction_clap import CLAPFeatureExtractor - + try: if not is_torch_available(): raise OptionalDependencyNotAvailable() @@ -68,12 +68,12 @@ else: from .modeling_clap import ( CLAP_PRETRAINED_MODEL_ARCHIVE_LIST, + CLAPAudioModel, + CLAPAudioModelWithProjection, CLAPModel, CLAPPreTrainedModel, CLAPTextModel, CLAPTextModelWithProjection, - CLAPAudioModel, - CLAPAudioModelWithProjection, ) else: diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 7478cb3db25b..32ba0f8bd18b 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -65,6 +65,7 @@ class ChineseCLIPImageProcessor(metaclass=DummyObject): def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) + class CLIPFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/clap/test_feature_extraction_clap.py b/tests/models/clap/test_feature_extraction_clap.py index 9771f9845b14..dd41342c24e9 100644 --- a/tests/models/clap/test_feature_extraction_clap.py +++ b/tests/models/clap/test_feature_extraction_clap.py @@ -227,17 +227,17 @@ def integration_test_fusion(self): -50.7827 ], [ - -89.0141, -89.1411, -88.8096, -88.5480, -88.3481, -88.2038, - -88.1105, -88.0647, -88.0636, -88.1051, -88.1877, -88.1110, - -87.8613, -88.6679, -88.2685, -88.9684, -88.7977, -89.6264, - -89.9299, -90.3184, -91.1446, -91.9265, -92.7267, -93.6099, - -94.6395, -95.3243, -95.5923, -95.5773, -95.0889, -94.3354, - -93.5746, -92.9287, -92.4525, -91.9798, -91.8852, -91.7500, - -91.7259, -91.7561, -91.7959, -91.7070, -91.6914, -91.5019, - -91.0640, -90.0807, -88.7102, -87.0826, -85.5956, -84.4441, - -83.8461, -83.8605, -84.6702, -86.3900, -89.3073, -93.2926, - -96.3813, -97.3529, -100.0000, -99.6942, -92.2851, -87.9588, - -85.7214, -84.6807, -84.1940, -84.2021 + -89.0141, -89.1411, -88.8096, -88.5480, -88.3481, -88.2038, + -88.1105, -88.0647, -88.0636, -88.1051, -88.1877, -88.1110, + -87.8613, -88.6679, -88.2685, -88.9684, -88.7977, -89.6264, + -89.9299, -90.3184, -91.1446, -91.9265, -92.7267, -93.6099, + -94.6395, -95.3243, -95.5923, -95.5773, -95.0889, -94.3354, + 
-93.5746, -92.9287, -92.4525, -91.9798, -91.8852, -91.7500, + -91.7259, -91.7561, -91.7959, -91.7070, -91.6914, -91.5019, + -91.0640, -90.0807, -88.7102, -87.0826, -85.5956, -84.4441, + -83.8461, -83.8605, -84.6702, -86.3900, -89.3073, -93.2926, + -96.3813, -97.3529, -100.0000, -99.6942, -92.2851, -87.9588, + -85.7214, -84.6807, -84.1940, -84.2021 ], [ -51.6882, -50.6852, -50.8198, -51.7428, -53.0325, -54.1619, -56.4903, diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 683b0f744426..741a172cc1ac 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -50,11 +50,11 @@ from torch import nn from transformers import ( + CLAPAudioModel, + CLAPAudioModelWithProjection, CLAPModel, CLAPTextModel, CLAPTextModelWithProjection, - CLAPAudioModel, - CLAPAudioModelWithProjection, ) from transformers.models.clap.modeling_clap import CLAP_PRETRAINED_MODEL_ARCHIVE_LIST From 972f45e1e54a788d808d90152706af3071e4a685 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 6 Feb 2023 15:20:44 +0000 Subject: [PATCH 059/197] re order classes in modeling clap --- src/transformers/models/clap/modeling_clap.py | 1441 +++++++++-------- 1 file changed, 769 insertions(+), 672 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 44a5f3d5ce9f..888b6a573c89 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -190,40 +190,6 @@ class CLAPTextModelOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None -@dataclass -# Copied from transformers.models.swin.modeling_swin.SwinEncoderOutput -class SwinEncoderOutput(ModelOutput): - """ - Swin encoder's outputs, with potential hidden states and attentions. - - Args: - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of - shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of - shape `(batch_size, hidden_size, height, width)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to - include the spatial dimensions. 
- """ - - last_hidden_state: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - - @dataclass class CLAPAudioModelOutput(ModelOutput): """ @@ -247,7 +213,7 @@ class CLAPAudioModelOutput(ModelOutput): @dataclass -# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->CLAP +# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->CLAP vision->audio class CLAPOutput(ModelOutput): """ Args: @@ -265,7 +231,7 @@ class CLAPOutput(ModelOutput): The image embeddings obtained by applying the projection layer to the pooled output of [`CLAPAudioModel`]. text_model_output(`BaseModelOutputWithPooling`): The output of the [`CLAPTextModel`]. - vision_model_output(`BaseModelOutputWithPooling`): + audio_model_output(`BaseModelOutputWithPooling`): The output of the [`CLAPAudioModel`]. """ @@ -275,11 +241,11 @@ class CLAPOutput(ModelOutput): text_embeds: torch.FloatTensor = None image_embeds: torch.FloatTensor = None text_model_output: BaseModelOutputWithPooling = None - vision_model_output: BaseModelOutputWithPooling = None + audio_model_output: BaseModelOutputWithPooling = None def to_tuple(self) -> Tuple[Any]: return tuple( - self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + self[k] if k not in ["text_model_output", "audio_model_output"] else getattr(self, k).to_tuple() for k in self.keys() ) @@ -772,162 +738,507 @@ def forward( return layer_outputs -CLAP_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`CLAPConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -CLAP_TEXT_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. +# Copied from transformers.models.swin.modeling_swin.SwinStage with Swin->CLAPAudio +class CLAPAudioStage(nn.Module): + def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample): + super().__init__() + self.config = config + self.dim = dim + self.blocks = nn.ModuleList( + [ + CLAPAudioLayer( + config=config, + dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + shift_size=0 if (i % 2 == 0) else config.window_size // 2, + ) + for i in range(depth) + ] + ) - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. 
Mask values selected in `[0, 1]`: + # patch merging layer + if downsample is not None: + self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm) + else: + self.downsample = None - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. + self.pointing = False - [What are attention masks?](../glossary#attention-mask) - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.max_position_embeddings - 1]`. + def forward( + self, + hidden_states: torch.Tensor, + input_dimensions: Tuple[int, int], + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + always_partition: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + height, width = input_dimensions + for i, layer_module in enumerate(self.blocks): - [What are position IDs?](../glossary#position-ids) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" + layer_head_mask = head_mask[i] if head_mask is not None else None -CLAP_AUDIO_INPUTS_DOCSTRING = r""" - Args: - input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using - [`AutoFeatureExtractor`]. See [`CLAPFeatureExtractor.__call__`] for details. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" + layer_outputs = layer_module( + hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition + ) -CLAP_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. + hidden_states = layer_outputs[0] - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. + hidden_states_before_downsampling = hidden_states + if self.downsample is not None: + height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2 + output_dimensions = (height, width, height_downsampled, width_downsampled) + hidden_states = self.downsample(hidden_states_before_downsampling, input_dimensions) + else: + output_dimensions = (height, width, height, width) - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. 
Mask values selected in `[0, 1]`: + stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions) - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. + if output_attentions: + stage_outputs += layer_outputs[1:] + return stage_outputs - [What are attention masks?](../glossary#attention-mask) - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.max_position_embeddings - 1]`. - [What are position IDs?](../glossary#position-ids) - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using - [`AutoFeatureExtractor`]. See [`CLAPFeatureExtractor.__call__`] for details. - return_loss (`bool`, *optional*): - Whether or not to return the contrastive loss. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" +# Copied from transformers.models.swin.modeling_swin.SwinPatchMerging with Swin->CLAPAudio +class CLAPAudioPatchMerging(nn.Module): + """ + Patch Merging Layer. + Args: + input_resolution (`Tuple[int]`): + Resolution of input feature. + dim (`int`): + Number of input channels. + norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`): + Normalization layer class. 
+ """ -class CLAPFusionBlock(nn.Module): - def __init__(self, config: CLAPTextConfig): + def __init__(self, input_resolution: Tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None: super().__init__() - self.config = config - hidden_size = config.projection_dim - self.activation = ACT2FN[config.hidden_act] - self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) - self.linear = nn.Linear(hidden_size, hidden_size) + def maybe_pad(self, input_feature, height, width): + should_pad = (height % 2 == 1) or (width % 2 == 1) + if should_pad: + pad_values = (0, 0, 0, width % 2, 0, height % 2) + input_feature = nn.functional.pad(input_feature, pad_values) - def forward(self, hidden_states): - hidden_states = self.linear(hidden_states) - hidden_states = self.activation(hidden_states) - hidden_states = self.dropout(hidden_states) - return hidden_states + return input_feature + def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int]) -> torch.Tensor: + height, width = input_dimensions + # `dim` is height * width + batch_size, dim, num_channels = input_feature.shape -class CLAPProjectionLayer(nn.Module): - def __init__(self, config: CLAPTextConfig): - super().__init__() - self.config = config - hidden_size = config.projection_hidden_size - projection_dim = config.projection_dim + input_feature = input_feature.view(batch_size, height, width, num_channels) + # pad input to be disible by width and height, if needed + input_feature = self.maybe_pad(input_feature, height, width) + # [batch_size, height/2, width/2, num_channels] + input_feature_0 = input_feature[:, 0::2, 0::2, :] + # [batch_size, height/2, width/2, num_channels] + input_feature_1 = input_feature[:, 1::2, 0::2, :] + # [batch_size, height/2, width/2, num_channels] + input_feature_2 = input_feature[:, 0::2, 1::2, :] + # [batch_size, height/2, width/2, num_channels] + input_feature_3 = input_feature[:, 1::2, 1::2, :] + # batch_size height/2 width/2 4*num_channels + input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1) + input_feature = input_feature.view(batch_size, -1, 4 * num_channels) # batch_size height/2*width/2 4*C - self.linear1 = nn.Linear(hidden_size, projection_dim) - self.activation = ACT2FN[config.projection_hidden_act] - self.linear2 = nn.Linear(projection_dim, projection_dim) + input_feature = self.norm(input_feature) + input_feature = self.reduction(input_feature) - def forward(self, hidden_states): - hidden_states = self.linear1(hidden_states) - hidden_states = self.activation(hidden_states) - hidden_states = self.linear2(hidden_states) - return hidden_states + return input_feature -class CLAPFusionLayer(nn.Module): - def __init__(self, config: CLAPTextConfig): +class CLAPAudioEncoder(nn.Module): + def __init__(self, config): super().__init__() - self.config = config + self.num_layers = len(config.depths) - self.layers = nn.ModuleList([CLAPFusionBlock(config) for _ in range(config.fusion_num_hidden_layers)]) + self.config = config + self.patch_embed = CLAPAudioPatchEmbed(config) + self.enable_fusion = config.enable_fusion + grid_size = self.patch_embed.grid_size + self.patch_stride = self.patch_embed.patch_stride + self.spec_size = config.spec_size + self.freq_ratio = self.spec_size // config.mel_bins - def forward(self, hidden_states): - for layer in self.layers: - hidden_states = 
layer(hidden_states) - return hidden_states + self.num_features = int(config.hidden_size * 2 ** (self.num_layers - 1)) + self.freq_ratio = config.spec_size // config.mel_bins + dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))] -# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->CLAPText -class CLAPTextEmbeddings(nn.Module): - """ - Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. - """ + self.input_resolutions = [(grid_size[0] // (2**i), grid_size[1] // (2**i)) for i in range(self.num_layers)] - # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ - def __init__(self, config): + self.layers = nn.ModuleList( + [ + CLAPAudioStage( + config=config, + dim=int(config.embed_dim * 2**i_layer), + input_resolution=self.input_resolutions[i_layer], + depth=config.depths[i_layer], + num_heads=config.num_heads[i_layer], + drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], + downsample=CLAPAudioPatchMerging if (i_layer < self.num_layers - 1) else None, + ) + for i_layer in range(self.num_layers) + ] + ) + + self.gradient_checkpointing = False + + self.bn0 = nn.BatchNorm2d(config.mel_bins) + self.norm = nn.LayerNorm(self.num_features) + self.depths = config.depths + + self.avgpool = nn.AdaptiveAvgPool1d(1) + + SF = config.spec_size // (2 ** (len(config.depths) - 1)) // self.patch_embed.patch_stride[0] // self.freq_ratio + self.tscam_conv = nn.Conv2d( + in_channels=self.num_features, out_channels=config.num_classes, kernel_size=(SF, 3), padding=(0, 1) + ) + self.head = nn.Linear(config.num_classes, config.num_classes) + + def reshape_wav2img(self, hidden_states): + _, _, time_steps, freq_steps = hidden_states.shape + + target_T = int(self.spec_size * self.freq_ratio) + target_F = self.spec_size // self.freq_ratio + + if time_steps > target_T or freq_steps > target_F: + raise ValueError("the wav size should less than or equal to the swin input size") + + # to avoid bicubic zero error + if time_steps < target_T: + hidden_states = nn.functional.interpolate( + hidden_states, (target_T, hidden_states.shape[3]), mode="bicubic", align_corners=True + ) + if freq_steps < target_F: + hidden_states = nn.functional.interpolate( + hidden_states, (hidden_states.shape[2], target_F), mode="bicubic", align_corners=True + ) + + # hidden_states = hidden_states.contiguous().view(hidden_states.shape[0], hidden_states.shape[1], hidden_states.shape[-1] * self.freq_ratio, hidden_states.shape[2] // self.freq_ratio) + + hidden_states = hidden_states.permute(0, 1, 3, 2).contiguous() + hidden_states = hidden_states.reshape( + hidden_states.shape[0], + hidden_states.shape[1], + hidden_states.shape[2], + self.freq_ratio, + hidden_states.shape[3] // self.freq_ratio, + ) + + hidden_states = hidden_states.permute(0, 1, 3, 2, 4).contiguous() + hidden_states = hidden_states.reshape( + hidden_states.shape[0], + hidden_states.shape[1], + hidden_states.shape[2] * hidden_states.shape[3], + hidden_states.shape[4], + ) + + return hidden_states + + def forward( + self, + input_features, + head_mask: Optional[torch.FloatTensor] = None, + is_longer: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + output_hidden_states_before_downsampling: Optional[bool] = False, + always_partition: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple, CLAPAudioModelOutput]: + + input_features = 
input_features.transpose(1, 3) + hidden_states = self.bn0(input_features) + hidden_states = hidden_states.transpose(1, 3) + + is_longer_list_idx = None + if self.enable_fusion: + is_longer_list = is_longer.to(input_features.device) + is_longer_list_idx = torch.where(is_longer_list == 0)[0] + + hidden_states = self.reshape_wav2img(hidden_states) + + _, _, frames_num, _ = hidden_states.shape + + hidden_states = self.patch_embed(hidden_states, is_longer_list_idx) + + all_hidden_states = () if output_hidden_states else None + all_reshaped_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + input_dimensions = None + + if output_hidden_states: + batch_size, _, hidden_size = hidden_states.shape + # rearrange b (h w) c -> b c h w + reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size) + reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + + for i, layer_module in enumerate(self.layers): + layer_head_mask = head_mask[i] if head_mask is not None else None + + input_dimensions = self.input_resolutions[i] + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), hidden_states, input_dimensions, layer_head_mask + ) + else: + layer_outputs = layer_module( + hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition + ) + + hidden_states = layer_outputs[0] + + hidden_states_before_downsampling = layer_outputs[1] + output_dimensions = layer_outputs[2] + + input_dimensions = (output_dimensions[-2], output_dimensions[-1]) + + if output_hidden_states and output_hidden_states_before_downsampling: + batch_size, _, hidden_size = hidden_states_before_downsampling.shape + # rearrange b (h w) c -> b c h w + # here we use the original (not downsampled) height and width + reshaped_hidden_state = hidden_states_before_downsampling.view( + batch_size, *(output_dimensions[0], output_dimensions[1]), hidden_size + ) + reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states_before_downsampling,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + elif output_hidden_states and not output_hidden_states_before_downsampling: + batch_size, _, hidden_size = hidden_states.shape + # rearrange b (h w) c -> b c h w + reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size) + reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + + if output_attentions: + all_self_attentions += layer_outputs[3:] + + hidden_states = self.norm(hidden_states) + + batch_size, _, n_channels = hidden_states.shape + + freq_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[0] + temporal_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[1] + + hidden_states = ( + hidden_states.permute(0, 2, 1).contiguous().reshape(batch_size, n_channels, freq_shape, temporal_shape) + ) + + batch_size, n_channels, n_frequencies, n_temp = hidden_states.shape + # group 2D CNN + c_freq_bin = n_frequencies // self.freq_ratio + hidden_states = hidden_states.reshape(batch_size, n_channels, 
n_frequencies // c_freq_bin, c_freq_bin, n_temp) + hidden_states = ( + hidden_states.permute(0, 1, 3, 2, 4).contiguous().reshape(batch_size, n_channels, c_freq_bin, -1) + ) + # get latent_output + fine_grained_latent_output = torch.mean(hidden_states, dim=2) + fine_grained_latent_output = interpolate( + fine_grained_latent_output.permute(0, 2, 1).contiguous(), 8 * self.patch_stride[1] + ) + + latent_output = self.avgpool(torch.flatten(hidden_states, 2)) + latent_output = torch.flatten(latent_output, 1) + + # display the attention map, if needed + + hidden_states = self.tscam_conv(hidden_states) + hidden_states = torch.flatten(hidden_states, 2) # B, C, T + + framewise_output = interpolate( + torch.sigmoid(hidden_states).permute(0, 2, 1).contiguous(), 8 * self.patch_stride[1] + ) + + hidden_states = self.avgpool(hidden_states) + hidden_states = torch.flatten(hidden_states, 1) + + if not return_dict: + return (framewise_output, torch.sigmoid(hidden_states), fine_grained_latent_output, latent_output) + + return CLAPAudioModelOutput( + framewise_output=framewise_output, + clipwise_output=torch.sigmoid(hidden_states), + fine_grained_embedding=fine_grained_latent_output, + embedding=latent_output, + ) + + +CLAP_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`CLAPConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +CLAP_TEXT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + +CLAP_AUDIO_INPUTS_DOCSTRING = r""" + Args: + input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Input audio features. Padding will be ignored by default should you provide it. Audio features can be obtained using + [`AutoFeatureExtractor`]. See [`CLAPFeatureExtractor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +CLAP_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Input audio features. Padding will be ignored by default should you provide it. Audio features can be obtained using + [`AutoFeatureExtractor`]. See [`CLAPFeatureExtractor.__call__`] for details. + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +class CLAPFusionBlock(nn.Module): + def __init__(self, config: CLAPTextConfig): + super().__init__() + self.config = config + hidden_size = config.projection_dim + self.activation = ACT2FN[config.hidden_act] + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.linear = nn.Linear(hidden_size, hidden_size) + + def forward(self, hidden_states): + hidden_states = self.linear(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class CLAPProjectionLayer(nn.Module): + def __init__(self, config: CLAPTextConfig): + super().__init__() + self.config = config + hidden_size = config.projection_hidden_size + projection_dim = config.projection_dim + + self.linear1 = nn.Linear(hidden_size, projection_dim) + self.activation = ACT2FN[config.projection_hidden_act] + self.linear2 = nn.Linear(projection_dim, projection_dim) + + def forward(self, hidden_states): + hidden_states = self.linear1(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.linear2(hidden_states) + return hidden_states + + +class CLAPFusionLayer(nn.Module): + def __init__(self, config: CLAPTextConfig): + super().__init__() + self.config = config + + self.layers = nn.ModuleList([CLAPFusionBlock(config) for _ in range(config.fusion_num_hidden_layers)]) + + def forward(self, hidden_states): + for layer in self.layers: + hidden_states = layer(hidden_states) + return hidden_states + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->CLAPText +class CLAPTextEmbeddings(nn.Module): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. + """ + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ + def __init__(self, config): super().__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) @@ -1455,6 +1766,59 @@ def _init_weights(self, module): pass +class CLAPAudioModel(CLAPPreTrainedModel): + config_class = CLAPAudioConfig + main_input_name = "input_features" + + def __init__(self, config: CLAPAudioConfig): + super().__init__(config) + self.audio_model = CLAPAudioEncoder(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.audio_model.embeddings.patch_embedding + + @add_start_docstrings_to_model_forward(CLAP_AUDIO_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLAPAudioConfig) + def forward( + self, + input_features: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + Examples: + + ```python + >>> import requests + >>> from transformers import AutoProcessor, CLAPAudioModel + + >>> model = CLAPAudioModel.from_pretrained("laionai/clap-hsat-tiny") + >>> processor = AutoProcessor.from_pretrained("laionai/clap-hsat-tiny") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # 
pooled CLS states + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + return self.audio_model( + input_features=input_features, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + class CLAPTextModel(CLAPPreTrainedModel): """ @@ -1649,218 +2013,22 @@ def __init__(self, config: CLAPConfig): self.logit_scale_t = nn.Parameter(torch.ones([]) * np.log(config.logit_scale_init_value)) self.projection_dim = config.projection_dim - self.text_hidden_size = text_config.hidden_size - self.vision_hidden_size = audio_config.hidden_size - - self.text_model = CLAPTextModel(text_config) - self.text_transform = CLAPFusionLayer(text_config) - self.text_projection = CLAPProjectionLayer(text_config) - - self.audio_model = CLAPAudioEncoder(config=audio_config) - self.audio_transform = CLAPFusionLayer(audio_config) - self.audio_projection = CLAPProjectionLayer(audio_config) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward(CLAP_TEXT_INPUTS_DOCSTRING) - def get_text_features( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> torch.FloatTensor: - r""" - Returns: - text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by - applying the projection layer to the pooled output of [`CLAPTextModel`]. - - Examples: - - ```python - >>> from transformers import AutoTokenizer, CLAPModel - - >>> model = CLAPModel.from_pretrained("laion-ai/clap-htst-unfused-base") - >>> tokenizer = AutoTokenizer.from_pretrained("laion-ai/clap-htst-unfused-base") - - >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") - >>> text_features = model.get_text_features(**inputs) - ```""" - # Use CLAP model's config for some fields (if specified) instead of those of vision & text components. 
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - text_outputs = self.text_model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = text_outputs[1] - text_features = self.text_projection(pooled_output) - text_features = F.normalize(text_features, dim=-1) - - return text_features - - @add_start_docstrings_to_model_forward(CLAP_VISION_INPUTS_DOCSTRING) - def get_audio_features( - self, - input_features: Optional[torch.Tensor] = None, - is_longer: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> torch.FloatTensor: - r""" """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - audio_outputs = self.audio_model( - input_features=input_features, - is_longer=is_longer, - return_dict=return_dict, - ) - - pooled_output = audio_outputs[-1] if not return_dict else audio_outputs.embedding - - audio_features = self.audio_projection(pooled_output) - audio_features = F.normalize(audio_features, dim=-1) - - return audio_features - - @add_start_docstrings_to_model_forward(CLAP_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CLAPOutput, config_class=CLAPConfig) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - input_values: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - return_loss: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CLAPOutput]: - r""" - Returns: - - Examples: - - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, CLAPModel - - >>> model = CLAPModel.from_pretrained("laion-ai/clap-htst-unfused-base") - >>> processor = AutoProcessor.from_pretrained("laion-ai/clap-htst-unfused-base") - - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> inputs = processor( - ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True - ... ) - - >>> outputs = model(**inputs) - >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score - >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities - ```""" - # Use CLAP model's config for some fields (if specified) instead of those of vision & text components. 
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - vision_outputs = self.audio_model( - input_values=input_values, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - text_outputs = self.text_model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - image_embeds = vision_outputs[1] - image_embeds = self.visual_projection(image_embeds) - - text_embeds = text_outputs[1] - text_embeds = self.text_projection(text_embeds) - - # normalized features - image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) - text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) - - # cosine similarity as logits - logit_scale = self.logit_scale.exp() - logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale - logits_per_image = logits_per_text.t() - - loss = None - if return_loss: - loss = clap_loss(logits_per_text) - - if not return_dict: - output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) - return ((loss,) + output) if loss is not None else output - - return CLAPOutput( - loss=loss, - logits_per_image=logits_per_image, - logits_per_text=logits_per_text, - text_embeds=text_embeds, - image_embeds=image_embeds, - text_model_output=text_outputs, - vision_model_output=vision_outputs, - ) - - -@add_start_docstrings( - """ - CLAP Text Model with a projection layer on top (a linear layer on top of the pooled output). 
- """, - CLAP_START_DOCSTRING, -) -class CLAPTextModelWithProjection(CLAPPreTrainedModel): - config_class = CLAPTextConfig + self.text_hidden_size = text_config.hidden_size + self.audio_hidden_size = audio_config.hidden_size - def __init__(self, config: CLAPTextConfig): - super().__init__(config) - self.text_model = CLAPTextModel(config) - self.text_projection = CLAPProjectionLayer(config) - # Initialize weights and apply final processing - self.post_init() + self.text_model = CLAPTextModel(text_config) + self.text_transform = CLAPFusionLayer(text_config) + self.text_projection = CLAPProjectionLayer(text_config) - def get_input_embeddings(self) -> nn.Module: - return self.text_model.embeddings.token_embedding + self.audio_model = CLAPAudioModel(config=audio_config) + self.audio_transform = CLAPFusionLayer(audio_config) + self.audio_projection = CLAPProjectionLayer(audio_config) - def set_input_embeddings(self, value): - self.text_model.embeddings.token_embedding = value + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(CLAP_TEXT_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CLAPTextModelOutput, config_class=CLAPTextConfig) - def forward( + def get_text_features( self, input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, @@ -1868,23 +2036,28 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple, CLAPTextModelOutput]: + ) -> torch.FloatTensor: r""" Returns: + text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by + applying the projection layer to the pooled output of [`CLAPTextModel`]. Examples: ```python - >>> from transformers import AutoTokenizer, CLAPTextModelWithProjection + >>> from transformers import AutoTokenizer, CLAPModel - >>> model = CLAPTextModelWithProjection.from_pretrained("laion-ai/clap-htst-unfused-base") + >>> model = CLAPModel.from_pretrained("laion-ai/clap-htst-unfused-base") >>> tokenizer = AutoTokenizer.from_pretrained("laion-ai/clap-htst-unfused-base") - >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") - - >>> outputs = model(**inputs) - >>> text_embeds = outputs.text_embeds + >>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt") + >>> text_features = model.get_text_features(**inputs) ```""" + # Use CLAP model's config for some fields (if specified) instead of those of audio & text components. 
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict text_outputs = self.text_model( @@ -1897,361 +2070,285 @@ def forward( ) pooled_output = text_outputs[1] + text_features = self.text_projection(pooled_output) + text_features = F.normalize(text_features, dim=-1) - text_embeds = self.text_projection(pooled_output) - - if not return_dict: - outputs = (text_embeds, text_outputs[0]) + text_outputs[2:] - return tuple(output for output in outputs if output is not None) + return text_features - return CLAPTextModelOutput( - text_embeds=text_embeds, - last_hidden_state=text_outputs.last_hidden_state, - hidden_states=text_outputs.hidden_states, - attentions=text_outputs.attentions, + @add_start_docstrings_to_model_forward(CLAP_AUDIO_INPUTS_DOCSTRING) + def get_audio_features( + self, + input_features: Optional[torch.Tensor] = None, + is_longer: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + r""" """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict - -# Copied from transformers.models.swin.modeling_swin.SwinStage with Swin->CLAPAudio -class CLAPAudioStage(nn.Module): - def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample): - super().__init__() - self.config = config - self.dim = dim - self.blocks = nn.ModuleList( - [ - CLAPAudioLayer( - config=config, - dim=dim, - input_resolution=input_resolution, - num_heads=num_heads, - shift_size=0 if (i % 2 == 0) else config.window_size // 2, - ) - for i in range(depth) - ] + audio_outputs = self.audio_model( + input_features=input_features, + is_longer=is_longer, + return_dict=return_dict, ) - # patch merging layer - if downsample is not None: - self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm) - else: - self.downsample = None + pooled_output = audio_outputs[-1] if not return_dict else audio_outputs.embedding - self.pointing = False + audio_features = self.audio_projection(pooled_output) + audio_features = F.normalize(audio_features, dim=-1) + return audio_features + + @add_start_docstrings_to_model_forward(CLAP_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CLAPOutput, config_class=CLAPConfig) def forward( self, - hidden_states: torch.Tensor, - input_dimensions: Tuple[int, int], - head_mask: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = False, - always_partition: Optional[bool] = False, - ) -> Tuple[torch.Tensor]: - height, width = input_dimensions - for i, layer_module in enumerate(self.blocks): - - layer_head_mask = head_mask[i] if head_mask is not None else None - - layer_outputs = layer_module( - hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition - ) - - hidden_states = layer_outputs[0] - - hidden_states_before_downsampling = hidden_states - if self.downsample is not None: - height_downsampled, 
width_downsampled = (height + 1) // 2, (width + 1) // 2 - output_dimensions = (height, width, height_downsampled, width_downsampled) - hidden_states = self.downsample(hidden_states_before_downsampling, input_dimensions) - else: - output_dimensions = (height, width, height, width) - - stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions) - - if output_attentions: - stage_outputs += layer_outputs[1:] - return stage_outputs - - -# Copied from transformers.models.swin.modeling_swin.SwinPatchMerging with Swin->CLAPAudio -class CLAPAudioPatchMerging(nn.Module): - """ - Patch Merging Layer. - - Args: - input_resolution (`Tuple[int]`): - Resolution of input feature. - dim (`int`): - Number of input channels. - norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`): - Normalization layer class. - """ - - def __init__(self, input_resolution: Tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None: - super().__init__() - self.input_resolution = input_resolution - self.dim = dim - self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) - self.norm = norm_layer(4 * dim) - - def maybe_pad(self, input_feature, height, width): - should_pad = (height % 2 == 1) or (width % 2 == 1) - if should_pad: - pad_values = (0, 0, 0, width % 2, 0, height % 2) - input_feature = nn.functional.pad(input_feature, pad_values) - - return input_feature - - def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int]) -> torch.Tensor: - height, width = input_dimensions - # `dim` is height * width - batch_size, dim, num_channels = input_feature.shape - - input_feature = input_feature.view(batch_size, height, width, num_channels) - # pad input to be disible by width and height, if needed - input_feature = self.maybe_pad(input_feature, height, width) - # [batch_size, height/2, width/2, num_channels] - input_feature_0 = input_feature[:, 0::2, 0::2, :] - # [batch_size, height/2, width/2, num_channels] - input_feature_1 = input_feature[:, 1::2, 0::2, :] - # [batch_size, height/2, width/2, num_channels] - input_feature_2 = input_feature[:, 0::2, 1::2, :] - # [batch_size, height/2, width/2, num_channels] - input_feature_3 = input_feature[:, 1::2, 1::2, :] - # batch_size height/2 width/2 4*num_channels - input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1) - input_feature = input_feature.view(batch_size, -1, 4 * num_channels) # batch_size height/2*width/2 4*C - - input_feature = self.norm(input_feature) - input_feature = self.reduction(input_feature) + input_ids: Optional[torch.LongTensor] = None, + input_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CLAPOutput]: + r""" + Returns: - return input_feature + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, CLAPModel -class CLAPAudioEncoder(nn.Module): - def __init__(self, config): - super().__init__() - self.num_layers = len(config.depths) + >>> model = CLAPModel.from_pretrained("laion-ai/clap-htst-unfused-base") + >>> processor = AutoProcessor.from_pretrained("laion-ai/clap-htst-unfused-base") - self.config = config - self.patch_embed = CLAPAudioPatchEmbed(config) - self.enable_fusion = config.enable_fusion - grid_size = 
self.patch_embed.grid_size - self.patch_stride = self.patch_embed.patch_stride - self.spec_size = config.spec_size - self.freq_ratio = self.spec_size // config.mel_bins + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> # TODO audio here - self.num_features = int(config.hidden_size * 2 ** (self.num_layers - 1)) - self.freq_ratio = config.spec_size // config.mel_bins + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True + ... ) - dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))] + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + # Use CLAP model's config for some fields (if specified) instead of those of audio & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict - self.input_resolutions = [(grid_size[0] // (2**i), grid_size[1] // (2**i)) for i in range(self.num_layers)] + audio_outputs = self.audio_model( + input_values=input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) - self.layers = nn.ModuleList( - [ - CLAPAudioStage( - config=config, - dim=int(config.embed_dim * 2**i_layer), - input_resolution=self.input_resolutions[i_layer], - depth=config.depths[i_layer], - num_heads=config.num_heads[i_layer], - drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], - downsample=CLAPAudioPatchMerging if (i_layer < self.num_layers - 1) else None, - ) - for i_layer in range(self.num_layers) - ] + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, ) - self.gradient_checkpointing = False + audio_embeds = audio_outputs[1] + audio_embeds = self.audio_projection(audio_embeds) - self.bn0 = nn.BatchNorm2d(config.mel_bins) - self.norm = nn.LayerNorm(self.num_features) - self.depths = config.depths + text_embeds = text_outputs[1] + text_embeds = self.text_projection(text_embeds) - self.avgpool = nn.AdaptiveAvgPool1d(1) + # normalized features + audio_embeds = audio_embeds / audio_embeds.norm(p=2, dim=-1, keepdim=True) + text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) - SF = config.spec_size // (2 ** (len(config.depths) - 1)) // self.patch_embed.patch_stride[0] // self.freq_ratio - self.tscam_conv = nn.Conv2d( - in_channels=self.num_features, out_channels=config.num_classes, kernel_size=(SF, 3), padding=(0, 1) - ) - self.head = nn.Linear(config.num_classes, config.num_classes) + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_text = torch.matmul(text_embeds, audio_embeds.t()) * logit_scale + logits_per_image = logits_per_text.t() - def reshape_wav2img(self, hidden_states): - _, _, time_steps, freq_steps = hidden_states.shape + loss = None + if return_loss: + loss = clap_loss(logits_per_text) - target_T = int(self.spec_size * self.freq_ratio) - 
target_F = self.spec_size // self.freq_ratio + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, audio_embeds, text_outputs, audio_outputs) + return ((loss,) + output) if loss is not None else output - if time_steps > target_T or freq_steps > target_F: - raise ValueError("the wav size should less than or equal to the swin input size") + return CLAPOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + audio_embeds=audio_embeds, + text_model_output=text_outputs, + audio_model_output=audio_outputs, + ) - # to avoid bicubic zero error - if time_steps < target_T: - hidden_states = nn.functional.interpolate( - hidden_states, (target_T, hidden_states.shape[3]), mode="bicubic", align_corners=True - ) - if freq_steps < target_F: - hidden_states = nn.functional.interpolate( - hidden_states, (hidden_states.shape[2], target_F), mode="bicubic", align_corners=True - ) - # hidden_states = hidden_states.contiguous().view(hidden_states.shape[0], hidden_states.shape[1], hidden_states.shape[-1] * self.freq_ratio, hidden_states.shape[2] // self.freq_ratio) +@add_start_docstrings( + """ + CLAP Text Model with a projection layer on top (a linear layer on top of the pooled output). + """, + CLAP_START_DOCSTRING, +) +class CLAPTextModelWithProjection(CLAPPreTrainedModel): + config_class = CLAPTextConfig - hidden_states = hidden_states.permute(0, 1, 3, 2).contiguous() - hidden_states = hidden_states.reshape( - hidden_states.shape[0], - hidden_states.shape[1], - hidden_states.shape[2], - self.freq_ratio, - hidden_states.shape[3] // self.freq_ratio, - ) + def __init__(self, config: CLAPTextConfig): + super().__init__(config) + self.text_model = CLAPTextModel(config) + self.text_projection = CLAPProjectionLayer(config) + # Initialize weights and apply final processing + self.post_init() - hidden_states = hidden_states.permute(0, 1, 3, 2, 4).contiguous() - hidden_states = hidden_states.reshape( - hidden_states.shape[0], - hidden_states.shape[1], - hidden_states.shape[2] * hidden_states.shape[3], - hidden_states.shape[4], - ) + def get_input_embeddings(self) -> nn.Module: + return self.text_model.embeddings.token_embedding - return hidden_states + def set_input_embeddings(self, value): + self.text_model.embeddings.token_embedding = value + @add_start_docstrings_to_model_forward(CLAP_TEXT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CLAPTextModelOutput, config_class=CLAPTextConfig) def forward( self, - input_features, - head_mask: Optional[torch.FloatTensor] = None, - is_longer: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = False, - output_hidden_states: Optional[bool] = False, - output_hidden_states_before_downsampling: Optional[bool] = False, - always_partition: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple, SwinEncoderOutput]: - - input_features = input_features.transpose(1, 3) - hidden_states = self.bn0(input_features) - hidden_states = hidden_states.transpose(1, 3) - - is_longer_list_idx = None - if self.enable_fusion: - is_longer_list = is_longer.to(input_features.device) - is_longer_list_idx = torch.where(is_longer_list == 0)[0] - - hidden_states = self.reshape_wav2img(hidden_states) + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = 
None, + ) -> Union[Tuple, CLAPTextModelOutput]: + r""" + Returns: - _, _, frames_num, _ = hidden_states.shape + Examples: - hidden_states = self.patch_embed(hidden_states, is_longer_list_idx) + ```python + >>> from transformers import AutoTokenizer, CLAPTextModelWithProjection - all_hidden_states = () if output_hidden_states else None - all_reshaped_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None + >>> model = CLAPTextModelWithProjection.from_pretrained("laion-ai/clap-htst-unfused-base") + >>> tokenizer = AutoTokenizer.from_pretrained("laion-ai/clap-htst-unfused-base") - input_dimensions = None + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") - if output_hidden_states: - batch_size, _, hidden_size = hidden_states.shape - # rearrange b (h w) c -> b c h w - reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size) - reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2) - all_hidden_states += (hidden_states,) - all_reshaped_hidden_states += (reshaped_hidden_state,) + >>> outputs = model(**inputs) + >>> text_embeds = outputs.text_embeds + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict - for i, layer_module in enumerate(self.layers): - layer_head_mask = head_mask[i] if head_mask is not None else None + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) - input_dimensions = self.input_resolutions[i] + pooled_output = text_outputs[1] - if self.gradient_checkpointing and self.training: + text_embeds = self.text_projection(pooled_output) - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs, output_attentions) + if not return_dict: + outputs = (text_embeds, text_outputs[0]) + text_outputs[2:] + return tuple(output for output in outputs if output is not None) - return custom_forward + return CLAPTextModelOutput( + text_embeds=text_embeds, + last_hidden_state=text_outputs.last_hidden_state, + hidden_states=text_outputs.hidden_states, + attentions=text_outputs.attentions, + ) - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(layer_module), hidden_states, input_dimensions, layer_head_mask - ) - else: - layer_outputs = layer_module( - hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition - ) - hidden_states = layer_outputs[0] +@add_start_docstrings( + """ + CLAP Audio Model with a projection layer on top (a linear layer on top of the pooled output). 
+ """, + CLAP_START_DOCSTRING, +) +class CLAPAudioModelWithProjection(CLAPPreTrainedModel): + config_class = CLAPAudioConfig + main_input_name = "input_features" - hidden_states_before_downsampling = layer_outputs[1] - output_dimensions = layer_outputs[2] + def __init__(self, config: CLAPAudioConfig): + super().__init__(config) - input_dimensions = (output_dimensions[-2], output_dimensions[-1]) + self.audio_model = CLAPAudioModel(config) - if output_hidden_states and output_hidden_states_before_downsampling: - batch_size, _, hidden_size = hidden_states_before_downsampling.shape - # rearrange b (h w) c -> b c h w - # here we use the original (not downsampled) height and width - reshaped_hidden_state = hidden_states_before_downsampling.view( - batch_size, *(output_dimensions[0], output_dimensions[1]), hidden_size - ) - reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2) - all_hidden_states += (hidden_states_before_downsampling,) - all_reshaped_hidden_states += (reshaped_hidden_state,) - elif output_hidden_states and not output_hidden_states_before_downsampling: - batch_size, _, hidden_size = hidden_states.shape - # rearrange b (h w) c -> b c h w - reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size) - reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2) - all_hidden_states += (hidden_states,) - all_reshaped_hidden_states += (reshaped_hidden_state,) + self.audio_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False) - if output_attentions: - all_self_attentions += layer_outputs[3:] + # Initialize weights and apply final processing + self.post_init() - hidden_states = self.norm(hidden_states) + def get_input_embeddings(self) -> nn.Module: + return self.audio_model.embeddings.patch_embedding - batch_size, _, n_channels = hidden_states.shape + @add_start_docstrings_to_model_forward(CLAP_AUDIO_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CLAPAudioModelOutput, config_class=CLAPAudioConfig) + def forward( + self, + input_features: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CLAPAudioModelOutput]: + r""" + Returns: - freq_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[0] - temporal_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[1] + Examples: - hidden_states = ( - hidden_states.permute(0, 2, 1).contiguous().reshape(batch_size, n_channels, freq_shape, temporal_shape) - ) + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, CLAPAudioModelWithProjection - batch_size, n_channels, n_frequencies, n_temp = hidden_states.shape - # group 2D CNN - c_freq_bin = n_frequencies // self.freq_ratio - hidden_states = hidden_states.reshape(batch_size, n_channels, n_frequencies // c_freq_bin, c_freq_bin, n_temp) - hidden_states = ( - hidden_states.permute(0, 1, 3, 2, 4).contiguous().reshape(batch_size, n_channels, c_freq_bin, -1) - ) - # get latent_output - fine_grained_latent_output = torch.mean(hidden_states, dim=2) - fine_grained_latent_output = interpolate( - fine_grained_latent_output.permute(0, 2, 1).contiguous(), 8 * self.patch_stride[1] - ) + >>> model = CLAPAudioModelWithProjection.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") - latent_output = self.avgpool(torch.flatten(hidden_states, 2)) 
- latent_output = torch.flatten(latent_output, 1) + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) - # display the attention map, if needed + >>> inputs = processor(images=image, return_tensors="pt") - hidden_states = self.tscam_conv(hidden_states) - hidden_states = torch.flatten(hidden_states, 2) # B, C, T + >>> outputs = model(**inputs) + >>> audio_embeds = outputs.audio_embeds + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict - framewise_output = interpolate( - torch.sigmoid(hidden_states).permute(0, 2, 1).contiguous(), 8 * self.patch_stride[1] + audio_outputs = self.audio_model( + input_features=input_features, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, ) - hidden_states = self.avgpool(hidden_states) - hidden_states = torch.flatten(hidden_states, 1) + pooled_output = audio_outputs[1] # pooled_output + + audio_embeds = self.audio_projection(pooled_output) if not return_dict: - return (framewise_output, torch.sigmoid(hidden_states), fine_grained_latent_output, latent_output) + outputs = (audio_embeds, audio_outputs[0]) + audio_outputs[2:] + return tuple(output for output in outputs if output is not None) return CLAPAudioModelOutput( - framewise_output=framewise_output, - clipwise_output=torch.sigmoid(hidden_states), - fine_grained_embedding=fine_grained_latent_output, - embedding=latent_output, + audio_embeds=audio_embeds, + last_hidden_state=audio_outputs.last_hidden_state, + hidden_states=audio_outputs.hidden_states, + attentions=audio_outputs.attentions, ) From c7b6cb9fb568947031fbba6582254645f126331b Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 7 Feb 2023 07:42:54 +0000 Subject: [PATCH 060/197] Use roberta tokenizer as the other weights are not open sourced --- tests/models/clap/test_processor_clap.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/tests/models/clap/test_processor_clap.py b/tests/models/clap/test_processor_clap.py index 1e8fae168705..97904632f496 100644 --- a/tests/models/clap/test_processor_clap.py +++ b/tests/models/clap/test_processor_clap.py @@ -12,18 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import json -import os import shutil import tempfile import unittest -import numpy as np -import pytest - -from transformers import CLAPTokenizer, CLAPTokenizerFast +from transformers import RobertaTokenizer +from transformers.testing_utils import require_sentencepiece, require_torchaudio from transformers.utils import is_torchvision_available -from transformers.testing_utils import require_sentencepiece, require_torch, require_torchaudio + from .test_feature_extraction_clap import floats_list @@ -35,7 +31,6 @@ NOTIMESTAMPS = 50362 - @require_torchaudio @require_sentencepiece class CLAPProcessorTest(unittest.TestCase): @@ -44,7 +39,7 @@ def setUp(self): self.tmpdirname = tempfile.mkdtemp() def get_tokenizer(self, **kwargs): - return CLAPTokenizer.from_pretrained(self.checkpoint, **kwargs) + return RobertaTokenizer.from_pretrained(self.checkpoint, **kwargs) def get_feature_extractor(self, **kwargs): return CLAPFeatureExtractor.from_pretrained(self.checkpoint, **kwargs) @@ -62,7 +57,7 @@ def test_save_load_pretrained_default(self): processor = CLAPProcessor.from_pretrained(self.tmpdirname) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) - self.assertIsInstance(processor.tokenizer, CLAPTokenizer) + self.assertIsInstance(processor.tokenizer, RobertaTokenizer) self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) self.assertIsInstance(processor.feature_extractor, CLAPFeatureExtractor) @@ -79,7 +74,7 @@ def test_save_load_pretrained_additional_features(self): ) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, CLAPTokenizer) + self.assertIsInstance(processor.tokenizer, RobertaTokenizer) self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) self.assertIsInstance(processor.feature_extractor, CLAPFeatureExtractor) From 48b346ac3b91566caf39da48b6132a04ff14db91 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 7 Feb 2023 09:01:05 +0000 Subject: [PATCH 061/197] small cleanup --- .../models/clap/processing_clap.py | 2 +- .../models/clap/tokenization_clap.py | 523 ------------------ .../models/clap/tokenization_clap_fast.py | 173 ------ .../utils/dummy_tokenizers_objects.py | 7 - tests/models/clap/test_tokenization_clap.py | 186 ------- utils/check_repo.py | 2 + 6 files changed, 3 insertions(+), 890 deletions(-) delete mode 100644 src/transformers/models/clap/tokenization_clap.py delete mode 100644 src/transformers/models/clap/tokenization_clap_fast.py delete mode 100644 tests/models/clap/test_tokenization_clap.py diff --git a/src/transformers/models/clap/processing_clap.py b/src/transformers/models/clap/processing_clap.py index d811f10eb6f9..300875fa2714 100644 --- a/src/transformers/models/clap/processing_clap.py +++ b/src/transformers/models/clap/processing_clap.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -audio/Text processor class for CLAP +Audio/Text processor class for CLAP """ from ...processing_utils import ProcessorMixin diff --git a/src/transformers/models/clap/tokenization_clap.py b/src/transformers/models/clap/tokenization_clap.py deleted file mode 100644 index 623fdf60c713..000000000000 --- a/src/transformers/models/clap/tokenization_clap.py +++ /dev/null @@ -1,523 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The Open AI Team Authors and The HuggingFace Inc. team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tokenization classes for CLAP.""" - -import json -import os -import unicodedata -from functools import lru_cache -from typing import List, Optional, Tuple - -import regex as re - -from ...tokenization_utils import AddedToken, PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace -from ...utils import logging - - -logger = logging.get_logger(__name__) - -VOCAB_FILES_NAMES = { - "vocab_file": "vocab.json", - "merges_file": "merges.txt", -} - -PRETRAINED_VOCAB_FILES_MAP = { - "vocab_file": { - "laion-ai/base": "https://huggingface.co/laion-ai/base/resolve/main/vocab.json", - }, - "merges_file": { - "laion-ai/base": "https://huggingface.co/laion-ai/base/resolve/main/merges.txt", - }, -} - -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "laion-ai/base": 77, -} - - -PRETRAINED_INIT_CONFIGURATION = { - "laion-ai/base": {}, -} - - -@lru_cache() -def bytes_to_unicode(): - """ - Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control - characters the bpe code barfs on. - - The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab - if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for - decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup - tables between utf-8 bytes and unicode strings. - """ - bs = ( - list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) - ) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8 + n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - - -def get_pairs(word): - """ - Return set of symbol pairs in a word. - - Word is represented as tuple of symbols (symbols being variable-length strings). - """ - pairs = set() - prev_char = word[0] - for char in word[1:]: - pairs.add((prev_char, char)) - prev_char = char - return pairs - - -def whitespace_clean(text): - text = re.sub(r"\s+", " ", text) - text = text.strip() - return text - - -# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize -def whitespace_tokenize(text): - """Runs basic whitespace cleaning and splitting on a piece of text.""" - text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens - - -# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer -class BasicTokenizer(object): - """ - Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). - - Args: - do_lower_case (`bool`, *optional*, defaults to `True`): - Whether or not to lowercase the input when tokenizing. - never_split (`Iterable`, *optional*): - Collection of tokens which will never be split during tokenization. 
Only has an effect when - `do_basic_tokenize=True` - tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): - Whether or not to tokenize Chinese characters. - - This should likely be deactivated for Japanese (see this - [issue](https://github.com/huggingface/transformers/issues/328)). - strip_accents (`bool`, *optional*): - Whether or not to strip all accents. If this option is not specified, then it will be determined by the - value for `lowercase` (as in the original BERT). - """ - - def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): - if never_split is None: - never_split = [] - self.do_lower_case = do_lower_case - self.never_split = set(never_split) - self.tokenize_chinese_chars = tokenize_chinese_chars - self.strip_accents = strip_accents - - def tokenize(self, text, never_split=None): - """ - Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see - WordPieceTokenizer. - - Args: - never_split (`List[str]`, *optional*) - Kept for backward compatibility purposes. Now implemented directly at the base class level (see - [`PreTrainedTokenizer.tokenize`]) List of token not to split. - """ - # union() returns a new set by concatenating the two sets. - never_split = self.never_split.union(set(never_split)) if never_split else self.never_split - text = self._clean_text(text) - - # This was added on November 1st, 2018 for the multilingual and Chinese - # models. This is also applied to the English models now, but it doesn't - # matter since the English models were not trained on any Chinese data - # and generally don't have any Chinese data in them (there are Chinese - # characters in the vocabulary because Wikipedia does have some Chinese - # words in the English Wikipedia.). 
- if self.tokenize_chinese_chars: - text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) - split_tokens = [] - for token in orig_tokens: - if token not in never_split: - if self.do_lower_case: - token = token.lower() - if self.strip_accents is not False: - token = self._run_strip_accents(token) - elif self.strip_accents: - token = self._run_strip_accents(token) - split_tokens.extend(self._run_split_on_punc(token, never_split)) - - output_tokens = whitespace_tokenize(" ".join(split_tokens)) - return output_tokens - - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = unicodedata.normalize("NFD", text) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == "Mn": - continue - output.append(char) - return "".join(output) - - def _run_split_on_punc(self, text, never_split=None): - """Splits punctuation on a piece of text.""" - if never_split is not None and text in never_split: - return [text] - chars = list(text) - i = 0 - start_new_word = True - output = [] - while i < len(chars): - char = chars[i] - if _is_punctuation(char): - output.append([char]) - start_new_word = True - else: - if start_new_word: - output.append([]) - start_new_word = False - output[-1].append(char) - i += 1 - - return ["".join(x) for x in output] - - def _tokenize_chinese_chars(self, text): - """Adds whitespace around any CJK character.""" - output = [] - for char in text: - cp = ord(char) - if self._is_chinese_char(cp): - output.append(" ") - output.append(char) - output.append(" ") - else: - output.append(char) - return "".join(output) - - def _is_chinese_char(self, cp): - """Checks whether CP is the codepoint of a CJK character.""" - # This defines a "chinese character" as anything in the CJK Unicode block: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - # - # Note that the CJK Unicode block is NOT all Japanese and Korean characters, - # despite its name. The modern Korean Hangul alphabet is a different block, - # as is Japanese Hiragana and Katakana. Those alphabets are used to write - # space-separated words, so they are not treated specially and handled - # like the all of the other languages. - if ( - (cp >= 0x4E00 and cp <= 0x9FFF) - or (cp >= 0x3400 and cp <= 0x4DBF) # - or (cp >= 0x20000 and cp <= 0x2A6DF) # - or (cp >= 0x2A700 and cp <= 0x2B73F) # - or (cp >= 0x2B740 and cp <= 0x2B81F) # - or (cp >= 0x2B820 and cp <= 0x2CEAF) # - or (cp >= 0xF900 and cp <= 0xFAFF) - or (cp >= 0x2F800 and cp <= 0x2FA1F) # - ): # - return True - - return False - - def _clean_text(self, text): - """Performs invalid character removal and whitespace cleanup on text.""" - output = [] - for char in text: - cp = ord(char) - if cp == 0 or cp == 0xFFFD or _is_control(char): - continue - if _is_whitespace(char): - output.append(" ") - else: - output.append(char) - return "".join(output) - - -class CLAPTokenizer(PreTrainedTokenizer): - """ - Construct a CLAP tokenizer. Based on byte-level Byte-Pair-Encoding. - - This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to - this superclass for more information regarding those methods. - - Args: - vocab_file (`str`): - Path to the vocabulary file. - merges_file (`str`): - Path to the merges file. - errors (`str`, *optional*, defaults to `"replace"`): - Paradigm to follow when decoding bytes to UTF-8. See - [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. 
- unk_token (`str`, *optional*, defaults to `<|endoftext|>`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - bos_token (`str`, *optional*, defaults to `<|startoftext|>`): - The beginning of sequence token. - eos_token (`str`, *optional*, defaults to `<|endoftext|>`): - The end of sequence token. - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ["input_ids", "attention_mask"] - - def __init__( - self, - vocab_file, - merges_file, - errors="replace", - unk_token="<|endoftext|>", - bos_token="<|startoftext|>", - eos_token="<|endoftext|>", - pad_token="<|endoftext|>", # hack to enable padding - **kwargs - ): - bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token - eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token - unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token - - super().__init__( - errors=errors, - unk_token=unk_token, - bos_token=bos_token, - eos_token=eos_token, - pad_token=pad_token, - **kwargs, - ) - - try: - import ftfy - - self.fix_text = ftfy.fix_text - except ImportError: - logger.info("ftfy or spacy is not installed using custom BasicTokenizer instead of ftfy.") - self.nlp = BasicTokenizer(do_lower_case=True) - self.fix_text = None - - with open(vocab_file, encoding="utf-8") as vocab_handle: - self.encoder = json.load(vocab_handle) - self.decoder = {v: k for k, v in self.encoder.items()} - self.errors = errors # how to handle errors in decoding - self.byte_encoder = bytes_to_unicode() - self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} - with open(merges_file, encoding="utf-8") as merges_handle: - bpe_merges = merges_handle.read().strip().split("\n")[1 : 49152 - 256 - 2 + 1] - bpe_merges = [tuple(merge.split()) for merge in bpe_merges] - self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) - self.cache = {"<|startoftext|>": "<|startoftext|>", "<|endoftext|>": "<|endoftext|>"} - - self.pat = re.compile( - r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", - re.IGNORECASE, - ) - - @property - def vocab_size(self): - return len(self.encoder) - - def get_vocab(self): - return dict(self.encoder, **self.added_tokens_encoder) - - def build_inputs_with_special_tokens( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A CLAP sequence has the following format: - - - single sequence: `<|startoftext|> X <|endoftext|>` - - Pairs of sequences are not the expected use case, but they will be handled without a separator. - - Args: - token_ids_0 (`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. 
- """ - bos_token = [self.bos_token_id] - eos_token = [self.eos_token_id] - - if token_ids_1 is None: - return bos_token + token_ids_0 + eos_token - return bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token - - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer `prepare_for_model` method. - - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True - ) - - if token_ids_1 is None: - return [1] + ([0] * len(token_ids_0)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1] + [1] + ([0] * len(token_ids_1)) + [1] - - def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Create a mask from the two sequences passed. CLAP does not make use of token type ids, therefore a list of - zeros is returned. - - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of zeros. 
- """ - bos_token = [self.bos_token_id] - eos_token = [self.eos_token_id] - - if token_ids_1 is None: - return len(bos_token + token_ids_0 + eos_token) * [0] - return len(bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token) * [0] - - def bpe(self, token): - if token in self.cache: - return self.cache[token] - word = tuple(token[:-1]) + (token[-1] + "",) - pairs = get_pairs(word) - - if not pairs: - return token + "" - - while True: - bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) - if bigram not in self.bpe_ranks: - break - first, second = bigram - new_word = [] - i = 0 - while i < len(word): - try: - j = word.index(first, i) - except ValueError: - new_word.extend(word[i:]) - break - else: - new_word.extend(word[i:j]) - i = j - - if word[i] == first and i < len(word) - 1 and word[i + 1] == second: - new_word.append(first + second) - i += 2 - else: - new_word.append(word[i]) - i += 1 - new_word = tuple(new_word) - word = new_word - if len(word) == 1: - break - else: - pairs = get_pairs(word) - word = " ".join(word) - self.cache[token] = word - return word - - def _tokenize(self, text): - """Tokenize a string.""" - bpe_tokens = [] - if self.fix_text is None: - text = " ".join(self.nlp.tokenize(text)) - else: - text = whitespace_clean(self.fix_text(text)).lower() - - for token in re.findall(self.pat, text): - token = "".join( - self.byte_encoder[b] for b in token.encode("utf-8") - ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) - bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) - return bpe_tokens - - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" - return self.encoder.get(token, self.encoder.get(self.unk_token)) - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - return self.decoder.get(index) - - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (string) in a single string.""" - text = "".join(tokens) - byte_array = bytearray([self.byte_decoder[c] for c in text]) - text = byte_array.decode("utf-8", errors=self.errors).replace("", " ").strip() - return text - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: - if not os.path.isdir(save_directory): - logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) - return - vocab_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] - ) - merge_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] - ) - - with open(vocab_file, "w", encoding="utf-8") as f: - f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") - - index = 0 - with open(merge_file, "w", encoding="utf-8") as writer: - writer.write("#version: 0.2\n") - for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): - if index != token_index: - logger.warning( - "Saving vocabulary to {}: BPE merge indices are not consecutive." 
- " Please check that the tokenizer is not corrupted!".format(merge_file) - ) - index = token_index - writer.write(" ".join(bpe_tokens) + "\n") - index += 1 - - return vocab_file, merge_file diff --git a/src/transformers/models/clap/tokenization_clap_fast.py b/src/transformers/models/clap/tokenization_clap_fast.py deleted file mode 100644 index 8a50fe86ffde..000000000000 --- a/src/transformers/models/clap/tokenization_clap_fast.py +++ /dev/null @@ -1,173 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The Open AI Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tokenization classes for OpenAI GPT.""" - - -from typing import List, Optional, Tuple - -from tokenizers import pre_tokenizers - -from ...tokenization_utils_fast import PreTrainedTokenizerFast -from ...utils import logging -from .tokenization_clap import CLAPTokenizer - - -logger = logging.get_logger(__name__) - -VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} - -PRETRAINED_VOCAB_FILES_MAP = { - "vocab_file": { - "laion-ai/base": "https://huggingface.co/laion-ai/base/resolve/main/vocab.json", - }, - "merges_file": { - "laion-ai/base": "https://huggingface.co/laion-ai/base/resolve/main/merges.txt", - }, - "tokenizer_file": { - "laion-ai/base": "https://huggingface.co/laion-ai/base/resolve/main/tokenizer.json", - }, -} - -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "laion-ai/base": 77, -} - - -class CLAPTokenizerFast(PreTrainedTokenizerFast): - """ - Construct a "fast" CLAP tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level - Byte-Pair-Encoding. - - This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should - refer to this superclass for more information regarding those methods. - - Args: - vocab_file (`str`): - Path to the vocabulary file. - merges_file (`str`): - Path to the merges file. - unk_token (`str`, *optional*, defaults to `<|endoftext|>`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - bos_token (`str`, *optional*, defaults to `<|startoftext|>`): - The beginning of sequence token. - eos_token (`str`, *optional*, defaults to `<|endoftext|>`): - The end of sequence token. 
- """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ["input_ids", "attention_mask"] - slow_tokenizer_class = CLAPTokenizer - - def __init__( - self, - vocab_file=None, - merges_file=None, - tokenizer_file=None, - unk_token="<|endoftext|>", - bos_token="<|startoftext|>", - eos_token="<|endoftext|>", - pad_token="<|endoftext|>", # hack to enable padding - **kwargs - ): - super().__init__( - vocab_file, - merges_file, - tokenizer_file=tokenizer_file, - unk_token=unk_token, - bos_token=bos_token, - eos_token=eos_token, - pad_token=pad_token, - **kwargs, - ) - - if not isinstance(self.backend_tokenizer.pre_tokenizer, pre_tokenizers.Sequence): - raise ValueError( - "The `backend_tokenizer` provided does not match the expected format. The CLAP tokenizer has been" - " heavily modified from transformers version 4.17.0. You need to convert the tokenizer you are using" - " to be compatible with this version.The easiest way to do so is" - ' `CLAPTokenizerFast.from_pretrained("path_to_local_folder_or_hub_repo, from_slow=True)`. If you want' - " to use your existing tokenizer, you will have to revert to a version prior to 4.17.0 of" - " transformers." - ) - - self._wrap_decode_method_backend_tokenizer() - - # Very ugly hack to enable padding to have a correct decoding see https://github.com/huggingface/tokenizers/issues/872 - def _wrap_decode_method_backend_tokenizer(self): - orig_decode_method = self.backend_tokenizer.decode - - def new_decode_method(*args, **kwargs): - text = orig_decode_method(*args, **kwargs) - text = text.replace(self.backend_tokenizer.model.end_of_word_suffix, " ").strip() - return text - - self.backend_tokenizer.decode = new_decode_method - - def build_inputs_with_special_tokens( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A CLAP sequence has the following format: - - - single sequence: `<|startoftext|> X <|endoftext|>` - - Pairs of sequences are not the expected use case, but they will be handled without a separator. - - Args: - token_ids_0 (`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - bos_token = [self.bos_token_id] - eos_token = [self.eos_token_id] - - if token_ids_1 is None: - return bos_token + token_ids_0 + eos_token - return bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token - - def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Create a mask from the two sequences passed. CLAP does not make use of token type ids, therefore a list of - zeros is returned. - - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of zeros. 
- """ - bos_token = [self.bos_token_id] - eos_token = [self.eos_token_id] - - if token_ids_1 is None: - return len(bos_token + token_ids_0 + eos_token) * [0] - return len(bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token) * [0] - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: - files = self._tokenizer.model.save(save_directory, name=filename_prefix) - return tuple(files) diff --git a/src/transformers/utils/dummy_tokenizers_objects.py b/src/transformers/utils/dummy_tokenizers_objects.py index 218d6be718d3..8a24d9bea6b2 100644 --- a/src/transformers/utils/dummy_tokenizers_objects.py +++ b/src/transformers/utils/dummy_tokenizers_objects.py @@ -66,13 +66,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["tokenizers"]) -class CLAPTokenizerFast(metaclass=DummyObject): - _backends = ["tokenizers"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["tokenizers"]) - - class CLIPTokenizerFast(metaclass=DummyObject): _backends = ["tokenizers"] diff --git a/tests/models/clap/test_tokenization_clap.py b/tests/models/clap/test_tokenization_clap.py deleted file mode 100644 index 640b74b329dc..000000000000 --- a/tests/models/clap/test_tokenization_clap.py +++ /dev/null @@ -1,186 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
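The slow tokenizer deleted above builds sub-words with a greedy byte-pair-encoding loop (its `bpe` method): the adjacent pair with the lowest merge rank is fused repeatedly until no known pair is left. The toy function and merge table below only mirror that loop for illustration; they are not the removed implementation, and the end-of-word suffix handling is omitted.

def toy_bpe(token, ranks):
    word = list(token)
    while len(word) > 1:
        pairs = [(word[i], word[i + 1]) for i in range(len(word) - 1)]
        # pick the adjacent pair with the lowest merge rank; stop once no pair is a known merge
        best = min(pairs, key=lambda pair: ranks.get(pair, float("inf")))
        if best not in ranks:
            break
        first, second = best
        merged, i = [], 0
        while i < len(word):
            if i < len(word) - 1 and (word[i], word[i + 1]) == (first, second):
                merged.append(first + second)
                i += 2
            else:
                merged.append(word[i])
                i += 1
        word = merged
    return word

ranks = {("l", "o"): 0, ("lo", "w"): 1, ("e", "r"): 2}  # toy merge table, not the real vocabulary
print(toy_bpe("lower", ranks))  # ['low', 'er']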
- - -import json -import os -import unittest - -from transformers import CLAPTokenizer, CLAPTokenizerFast -from transformers.models.clap.tokenization_clap import VOCAB_FILES_NAMES -from transformers.testing_utils import require_ftfy, require_tokenizers - -from ...test_tokenization_common import TokenizerTesterMixin - - -@require_tokenizers -class CLAPTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - - tokenizer_class = CLAPTokenizer - rust_tokenizer_class = CLAPTokenizerFast - test_rust_tokenizer = True - from_pretrained_kwargs = {} - test_seq2seq = False - - def setUp(self): - super().setUp() - - # fmt: off - vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l", "w", "r", "t", "low", "er", "lowest", "newer", "wider", "", "<|startoftext|>", "<|endoftext|>"] - # fmt: on - vocab_tokens = dict(zip(vocab, range(len(vocab)))) - merges = ["#version: 0.2", "l o", "lo w", "e r"] - self.special_tokens_map = {"unk_token": ""} - - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: - fp.write("\n".join(merges)) - - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return CLAPTokenizer.from_pretrained(self.tmpdirname, **kwargs) - - def get_rust_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return CLAPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) - - def get_input_output_texts(self, tokenizer): - input_text = "lower newer" - output_text = "lower newer" - return input_text, output_text - - def test_full_tokenizer(self): - tokenizer = CLAPTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) - text = "lower newer" - bpe_tokens = ["lo", "w", "er", "n", "e", "w", "er"] - tokens = tokenizer.tokenize(text) - self.assertListEqual(tokens, bpe_tokens) - - input_tokens = tokens + [tokenizer.unk_token] - input_bpe_tokens = [10, 2, 16, 9, 3, 2, 16, 20] - self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) - - @require_ftfy - def test_check_encoding_slow_fast(self): - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_s = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat" - text_tokenized_s = tokenizer_s.tokenize(text) - text_tokenized_r = tokenizer_r.tokenize(text) - - self.assertListEqual(text_tokenized_s, text_tokenized_r) - - # Test that the tokenization is identical on an example containing a character (Latin Small Letter A - # with Tilde) encoded in 2 different ways - text = "xa\u0303y" + " " + "x\xe3y" - text_tokenized_s = tokenizer_s.tokenize(text) - text_tokenized_r = tokenizer_r.tokenize(text) - - self.assertListEqual(text_tokenized_s, text_tokenized_r) - - # Test that the tokenization is identical on unicode of space type - spaces_unicodes = [ - "\u0009", # (horizontal tab, '\t') - "\u000B", # (vertical tab) - "\u000C", # (form feed) - "\u0020", # (space, ' ') - "\u200E", # (left-to-right mark):w - "\u200F", # (right-to-left mark) - ] - for unicode_seq in spaces_unicodes: - text_tokenized_s = tokenizer_s.tokenize(unicode_seq) - text_tokenized_r = 
tokenizer_r.tokenize(unicode_seq) - - self.assertListEqual(text_tokenized_s, text_tokenized_r) - - # Test that the tokenization is identical on unicode of line break type - line_break_unicodes = [ - "\u000A", # (line feed, '\n') - "\r\n", # (carriage return and line feed, '\r\n') - "\u000D", # (carriage return, '\r') - "\r", # (carriage return, '\r') - "\u000D", # (carriage return, '\r') - "\u2028", # (line separator) - "\u2029", # (paragraph separator) - # "\u0085", # (next line) - ] - - # The tokenization is not identical for the character "\u0085" (next line). The slow version transforms - # it into the Horizontal Ellipsis character "…" ("\u2026") while the fast version transforms it into a - # space (and thus into an empty list). - - for unicode_seq in line_break_unicodes: - text_tokenized_s = tokenizer_s.tokenize(unicode_seq) - text_tokenized_r = tokenizer_r.tokenize(unicode_seq) - - self.assertListEqual(text_tokenized_s, text_tokenized_r) - - def test_offsets_mapping_with_different_add_prefix_space_argument(self): - # Test which aims to verify that the offsets are well adapted to the argument `add_prefix_space` - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - text_of_1_token = "hello" # `hello` is a token in the vocabulary of `pretrained_name` - text = f"{text_of_1_token} {text_of_1_token}" - - tokenizer_r = self.rust_tokenizer_class.from_pretrained( - pretrained_name, - use_fast=True, - ) - encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) - self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token))) - self.assertEqual( - encoding.offset_mapping[1], - (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)), - ) - - text = f" {text}" - - tokenizer_r = self.rust_tokenizer_class.from_pretrained( - pretrained_name, - use_fast=True, - ) - encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) - self.assertEqual(encoding.offset_mapping[0], (1, 1 + len(text_of_1_token))) - self.assertEqual( - encoding.offset_mapping[1], - (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)), - ) - - def test_log_warning(self): - # Test related to the breaking change introduced in transformers v4.17.0 - # We need to check that an error in raised when the user try to load a previous version of the tokenizer. - with self.assertRaises(ValueError) as context: - self.rust_tokenizer_class.from_pretrained("robot-test/old-clap-tokenizer") - - self.assertTrue( - context.exception.args[0].startswith( - "The `backend_tokenizer` provided does not match the expected format." 
- ) - ) - - @require_ftfy - def test_tokenization_python_rust_equals(self): - super().test_tokenization_python_rust_equals() - - # overwrite common test - def test_added_tokens_do_lower_case(self): - # CLAP always lower cases letters - pass diff --git a/utils/check_repo.py b/utils/check_repo.py index 9887d7a64306..f47ebe402c89 100755 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -173,6 +173,8 @@ # models to ignore for model xxx mapping "CLAPTextModel", "CLAPTextModelWithProjection", + "CLAPAudioModel", + "CLAPAudioModelWithProjection", "GitVisionModel", "GraphormerModel", "GraphormerForGraphClassification", From d299422e822c8d8af9abb9dd5b2440e41a03b8aa Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 7 Feb 2023 09:04:14 +0000 Subject: [PATCH 062/197] remove tokenization CLAP --- src/transformers/models/auto/tokenization_auto.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 208c26eca02b..cc91c11617f0 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -90,13 +90,6 @@ ), ("canine", ("CanineTokenizer", None)), ("chinese_clip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), - ( - "clap", - ( - " ", - "CLAPTokenizerFast" if is_tokenizers_available() else None, - ), - ), ( "clip", ( From f627ab99865a141944b824c772d47f1c2a7eeabb Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 7 Feb 2023 10:13:40 +0000 Subject: [PATCH 063/197] processor tokenizr is roberta --- src/transformers/models/clap/processing_clap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/clap/processing_clap.py b/src/transformers/models/clap/processing_clap.py index 300875fa2714..1a38026896da 100644 --- a/src/transformers/models/clap/processing_clap.py +++ b/src/transformers/models/clap/processing_clap.py @@ -34,7 +34,7 @@ class CLAPProcessor(ProcessorMixin): The tokenizer is a required input. """ feature_extractor_class = "CLAPFeatureExtractor" - tokenizer_class = ("PreTrainedTokenizer", "PreTrainedTokenizerFast") + tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast") def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) From fdcfed1616d3cf3acc9690cf9f01a1161fddf80d Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 7 Feb 2023 10:14:02 +0000 Subject: [PATCH 064/197] update feature extraction doc --- .../models/clap/feature_extraction_clap.py | 76 +++++++++++-------- 1 file changed, 45 insertions(+), 31 deletions(-) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 9d9e1798ae73..e00d9f2192d9 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -41,18 +41,35 @@ class CLAPFeatureExtractor(SequenceFeatureExtractor): Args: feature_size (`int`, defaults to 80): - The feature dimension of the extracted features. + The feature dimension of the extracted MEL spectrograms. This corresponds to the number of frequency bins (intervals) that are computer, for each fourrier step. sampling_rate (`int`, defaults to 16000): - The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). + The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). 
This only serves to warn users if the audio fed to the feature extractor does not have the same sampling rate. hop_length (`int`, defaults to 160): - Length of the overlaping windows for the STFT used to obtain the Mel Frequency coefficients. - chunk_length (`int`, defaults to 30): - The maximum number of chuncks of `sampling_rate` samples used to trim and pad longer or shorter audio - sequences. + Length of the overlaping windows for the STFT used to obtain the Mel Spectrogram. The audio will be split in smaller `frames` with a step of `hop_length` between each frame. + chunk_length_s (`int`, defaults to 10): + The maximum input lenght of the model in seconds. This is used to pad the audio. n_fft (`int`, defaults to 400): - Size of the Fourier transform. + Size of the Fourier transform. TODO will properly explain this padding_value (`float`, *optional*, defaults to 0.0): Padding value used to pad the audio. Should correspond to silences. + return_attention_mask (`bool`, *optional*, False): + Whether or not the model should return the attention masks coresponding to the input. + frequency_min (`float`, *optional*, 0): + The lowest frequency of interest. The STFT TODO (not sure) will not be computed for values below this. + frequency_max (`float`, *optional*, 14_000): + The highest frequency of interest. The STFT TODO (not sure) will not be computed for values above this. + top_db (`float`, *optional*): + The highest decibel value used to convert the mel spectrogram to the log scale. For more details see the `SequenceFeatureExtractor._power_to_db` function + truncation (`str`, *optional*, `"fusions"`): + Truncation pattern for long audio inputs. Two patterns are available: + - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and a downsampled version of the entire mel spectrogram. These 4 spectrogram will have a dimension of `n_fft, feature_size`. TODO check this + If `config.fusion` is set to True, shorter audios also need to to return 4 mels, which will just be a copy of the original mel obtained from the padded audio. + - `rand_trunc` will select a random crop of the mel spectrogram. + padding (`str`, *optional*, `"repeatpad"`): + Padding pattern for shorter audio inputs. 
Three patterns were originaly implemented: + - `repeatpad`: + - `repeat`: + - `pad`: """ model_input_names = ["input_features", "is_longer"] @@ -62,15 +79,13 @@ def __init__( feature_size=80, sampling_rate=48_000, hop_length=480, - chunk_length=30, + chunk_length_s=10, n_fft=400, padding_value=0.0, return_attention_mask=False, # pad inputs to max length with silence token (zero) and no attention mask - norm=None, - f_min: float = 0, - f_max: float = 14000, + frequency_min: float = 0, + frequency_max: float = 14_000, top_db: int = None, - max_length: int = 480_000, truncation: str = "fusion", padding: str = "repeatpad", **kwargs @@ -82,21 +97,22 @@ def __init__( return_attention_mask=return_attention_mask, **kwargs, ) - self.max_length = max_length + self.top_db = top_db + self.truncation = truncation + self.padding = padding self.n_fft = n_fft self.hop_length = hop_length - self.chunk_length = chunk_length - self.n_samples = chunk_length * sampling_rate - self.nb_max_frames = self.n_samples // hop_length + self.chunk_length = chunk_length_s + self.nb_max_samples = chunk_length_s * sampling_rate + self.nb_max_frames = self.nb_max_samples // hop_length self.sampling_rate = sampling_rate - self.f_min = f_min # should be in super and would initialized them - self.f_max = f_max # should be in super and would initialized them - self.norm = norm # should be in super and would initialized them + self.frequency_min = frequency_min + self.frequency_max = frequency_max self.mel_filters = self.get_mel_filter_banks( n_freqs=int(1 + n_fft // 2), n_mels=feature_size, - f_min=f_min, - f_max=f_max, + frequency_min=frequency_min, + frequency_max=frequency_max, sample_rate=sampling_rate, norm=None, mel_scale="htk", @@ -104,20 +120,19 @@ def __init__( self.mel_filters_slaney = self.get_mel_filter_banks( n_freqs=int(1 + n_fft // 2), n_mels=feature_size, - f_min=f_min, - f_max=f_max, + frequency_min=frequency_min, + frequency_max=frequency_max, sample_rate=sampling_rate, norm="slaney", mel_scale="slaney", ) - self.top_db = top_db def to_dict(self) -> Dict[str, Any]: """ Serializes this instance to a Python dictionary. Returns: - `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, excpet for the mel filter banks, which do not need to be saved or printed as they are too long. """ output = copy.deepcopy(self.__dict__) output["feature_extractor_type"] = self.__class__.__name__ @@ -129,8 +144,8 @@ def to_dict(self) -> Dict[str, Any]: def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[np.array] = None) -> np.ndarray: """ - Compute the log-Mel spectrogram of the provided audio, gives similar results whisper's original torch - implementation with 1e-5 tolerance. + Compute the log-Mel spectrogram of the provided audio using the `hanning` window. Two different banks of filters were used: + - self. 
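+              mel_filters, created with `mel_scale="htk"` and no normalization, used when `truncation` is
+              `"fusion"`;
+            - self.mel_filters_slaney, created with `norm="slaney"` and `mel_scale="slaney"`, used for the other
+              truncation patterns.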
""" window = np.hanning(self.n_fft + 1)[:-1] @@ -194,7 +209,7 @@ def _random_mel_fusion(self, mel, total_frames, chunk_frames): mel_fusion = np.stack([mel_chunk_front, mel_chunk_middle, mel_chunk_back, mel_shrink], axis=0) return mel_fusion - def _get_audio_features(self, waveform: np.array, max_length, truncation, padding, pad_to_multiple_of) -> np.array: + def _get_audio_features(self, waveform: np.array, max_length, truncation, padding) -> np.array: """ Possible cases : - wave > max_length @@ -336,10 +351,9 @@ def __call__( padded_inputs = [ self._get_audio_features( waveform, - max_length if max_length else self.max_length, + max_length if max_length else self.nb_max_samples, truncation, - padding, - pad_to_multiple_of, + padding ) for waveform in raw_speech ] From 3dc78e8f324be1bf310cd8491c17b269b2daeb97 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 7 Feb 2023 10:14:34 +0000 Subject: [PATCH 065/197] remove vclap from model zero shot --- src/transformers/models/auto/modeling_auto.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 406ebba425b6..1c875c82aa1f 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -902,7 +902,6 @@ ("altclip", "AltCLIPModel"), ("blip", "BlipModel"), ("chinese_clip", "ChineseCLIPModel"), - ("clap", "CLAPModel"), ("clip", "CLIPModel"), ("clipseg", "CLIPSegModel"), ] From 0fef96fcd28ba90d686b0d42eca3610865b8b0a9 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 7 Feb 2023 10:14:58 +0000 Subject: [PATCH 066/197] update f_min and f_max to frequency_xx --- .../feature_extraction_sequence_utils.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/transformers/feature_extraction_sequence_utils.py b/src/transformers/feature_extraction_sequence_utils.py index 89d19dd88c54..41069e536d96 100644 --- a/src/transformers/feature_extraction_sequence_utils.py +++ b/src/transformers/feature_extraction_sequence_utils.py @@ -389,14 +389,14 @@ def hz_to_mel(freq: float, mel_scale: str = "htk") -> float: return 2595.0 * math.log10(1.0 + (freq / 700.0)) # Fill in the linear part - f_min = 0.0 + frequency_min = 0.0 f_sp = 200.0 / 3 - mels = (freq - f_min) / f_sp + mels = (freq - frequency_min) / f_sp # Fill in the log-scale part min_log_hz = 1000.0 - min_log_mel = (min_log_hz - f_min) / f_sp + min_log_mel = (min_log_hz - frequency_min) / f_sp logstep = math.log(6.4) / 27.0 if freq >= min_log_hz: @@ -423,13 +423,13 @@ def mel_to_hz(mels: np.array, mel_scale: str = "htk") -> np.array: return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) # Fill in the linear scale - f_min = 0.0 + frequency_min = 0.0 f_sp = 200.0 / 3 - freqs = f_min + f_sp * mels + freqs = frequency_min + f_sp * mels # And now the nonlinear scale min_log_hz = 1000.0 - min_log_mel = (min_log_hz - f_min) / f_sp + min_log_mel = (min_log_hz - frequency_min) / f_sp logstep = math.log(6.4) / 27.0 log_t = mels >= min_log_mel @@ -466,8 +466,8 @@ def create_triangular_filterbank( def get_mel_filter_banks( self, n_freqs: int, - f_min: float, - f_max: float, + frequency_min: float, + frequency_max: float, n_mels: int, sample_rate: int, norm: Optional[str] = None, @@ -498,9 +498,9 @@ def get_mel_filter_banks( Args: n_freqs (int): Number of frequencies to highlight/apply - f_min (float): + frequency_min (float): Minimum frequency (Hz) - f_max (float): + frequency_max (float): Maximum frequency (Hz) n_mels (int): Number of 
mel filterbanks @@ -526,8 +526,8 @@ def get_mel_filter_banks( all_freqs = np.linspace(0, sample_rate // 2, n_freqs) # calculate mel freq bins - m_min = self.hz_to_mel(f_min, mel_scale=mel_scale) - m_max = self.hz_to_mel(f_max, mel_scale=mel_scale) + m_min = self.hz_to_mel(frequency_min, mel_scale=mel_scale) + m_max = self.hz_to_mel(frequency_max, mel_scale=mel_scale) m_pts = np.linspace(m_min, m_max, n_mels + 2) f_pts = self.mel_to_hz(m_pts, mel_scale=mel_scale) From 0515a187642339e1f3bdd2dbe2f301e971f82367 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 7 Feb 2023 10:20:20 +0000 Subject: [PATCH 067/197] some changes - fix modeling keys - add `is_longer` in the forward pass - make fixup --- src/transformers/__init__.py | 2 +- src/transformers/models/clap/configuration_clap.py | 8 +++----- .../clap/convert_clap_original_pytorch_to_hf.py | 2 +- .../models/clap/feature_extraction_clap.py | 4 ++-- src/transformers/models/clap/modeling_clap.py | 11 +++++------ tests/models/clap/test_feature_extraction_clap.py | 2 -- tests/models/clap/test_modeling_clap.py | 6 ++---- 7 files changed, 14 insertions(+), 21 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 7265822dec1e..4c9e96acbb6b 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -207,8 +207,8 @@ "CLAP_PRETRAINED_CONFIG_ARCHIVE_MAP", "CLAPAudioConfig", "CLAPConfig", + "CLAPFeatureExtractor", "CLAPProcessor", - "CLAPFeatureExtractor" "CLAPTextConfig", ], "models.clip": [ diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 860d10de26d8..c6828d8d17e9 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -120,7 +120,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @@ -144,7 +144,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the text config dict if we are loading from CLAPConfig @@ -244,7 +243,7 @@ def __init__( aff_block_r=4, enable_patch_fusion=False, layer_norm_eps=1e-5, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.window_size = window_size @@ -279,7 +278,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from CLAPConfig @@ -352,7 +350,7 @@ def __init__( fusion_num_hidden_layers=2, projection_dim=512, projection_hidden_act="relu", - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py b/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py index 09af4ebc0c7a..b1be783c9814 100644 --- a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py +++ b/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py @@ -16,8 +16,8 @@ import argparse import torch - from clap import load + from transformers import CLAPConfig, CLAPModel diff --git a/src/transformers/models/clap/feature_extraction_clap.py 
b/src/transformers/models/clap/feature_extraction_clap.py index 9d9e1798ae73..1a991c28cef9 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -73,7 +73,7 @@ def __init__( max_length: int = 480_000, truncation: str = "fusion", padding: str = "repeatpad", - **kwargs + **kwargs, ): super().__init__( feature_size=feature_size, @@ -260,7 +260,7 @@ def __call__( return_attention_mask: Optional[bool] = None, max_length: Optional[int] = None, sampling_rate: Optional[int] = None, - **kwargs + **kwargs, ) -> BatchFeature: """ Main method to featurize and prepare for the model one or several sequence(s). diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 888b6a573c89..c0230554d9bb 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -775,7 +775,6 @@ def forward( ) -> Tuple[torch.Tensor]: height, width = input_dimensions for i, layer_module in enumerate(self.blocks): - layer_head_mask = head_mask[i] if head_mask is not None else None layer_outputs = layer_module( @@ -954,7 +953,6 @@ def forward( always_partition: Optional[bool] = False, return_dict: Optional[bool] = True, ) -> Union[Tuple, CLAPAudioModelOutput]: - input_features = input_features.transpose(1, 3) hidden_states = self.bn0(input_features) hidden_states = hidden_states.transpose(1, 3) @@ -1671,7 +1669,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." @@ -1772,18 +1769,19 @@ class CLAPAudioModel(CLAPPreTrainedModel): def __init__(self, config: CLAPAudioConfig): super().__init__(config) - self.audio_model = CLAPAudioEncoder(config) + self.audio_encoder = CLAPAudioEncoder(config) # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self) -> nn.Module: - return self.audio_model.embeddings.patch_embedding + return self.audio_encoder.embeddings.patch_embedding @add_start_docstrings_to_model_forward(CLAP_AUDIO_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLAPAudioConfig) def forward( self, input_features: Optional[torch.FloatTensor] = None, + is_longer: Optional[torch.BoolTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -1811,8 +1809,9 @@ def forward( ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict - return self.audio_model( + return self.audio_encoder( input_features=input_features, + is_longer=is_longer, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, diff --git a/tests/models/clap/test_feature_extraction_clap.py b/tests/models/clap/test_feature_extraction_clap.py index dd41342c24e9..08a9b9578155 100644 --- a/tests/models/clap/test_feature_extraction_clap.py +++ b/tests/models/clap/test_feature_extraction_clap.py @@ -116,7 +116,6 @@ def _flatten(list_of_lists): @require_torchaudio # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest with Whisper->CLAP class CLAPFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = CLAPFeatureExtractor if 
is_speech_available() else None def setUp(self): @@ -260,7 +259,6 @@ def integration_test_fusion(self): for padding, EXPECTED_VALUES, idx_in_mel in zip( ["repeat", "repeatpad", None], EXPECTED_INPUT_FEATURES, MEL_BIN ): - input_features = feaure_extractor(input_speech, return_tensors="pt", padding=padding).input_features self.assertTrue(torch.allclose(input_features[0, idx_in_mel], EXPECTED_VALUES, atol=1e-4)) diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 741a172cc1ac..13592d11ca78 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -21,8 +21,8 @@ import unittest import numpy as np - import requests + import transformers from transformers import CLAPAudioConfig, CLAPConfig, CLAPTextConfig from transformers.testing_utils import ( @@ -67,6 +67,7 @@ if is_flax_available(): import jax.numpy as jnp + from transformers.modeling_flax_pytorch_utils import ( convert_pytorch_state_dict_to_flax, load_flax_weights_in_pytorch_model, @@ -347,7 +348,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class CLAPTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (CLAPTextModel, CLAPTextModelWithProjection) if is_torch_available() else () fx_compatible = False test_pruning = False @@ -402,7 +402,6 @@ def test_model_with_projection_from_pretrained(self): class CLAPModelTester: def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: text_kwargs = {} if vision_kwargs is None: @@ -579,7 +578,6 @@ def test_equivalence_pt_to_flax(self): for model_class in self.all_model_classes: with self.subTest(model_class.__name__): - # load PyTorch class pt_model = model_class(config).eval() # Flax models don't use the `use_cache` option and cache is not returned as a default. From 0dcff26e0ca607d06e98bf39f8f3359a9b8b122b Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 7 Feb 2023 10:26:15 +0000 Subject: [PATCH 068/197] make fixup --- docs/source/en/index.mdx | 2 +- src/transformers/models/clap/__init__.py | 1 + src/transformers/models/clap/modeling_clap.py | 2 +- src/transformers/utils/dummy_pt_objects.py | 12 ++++++------ 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index cabadca4a1cb..6f6a46a911ce 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -259,7 +259,7 @@ Flax), PyTorch, and/or TensorFlow. 
| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | | Chinese-CLIP | ❌ | ❌ | ✅ | ❌ | ❌ | -| clap | ✅ | ✅ | ✅ | ❌ | ❌ | +| clap | ❌ | ❌ | ✅ | ❌ | ❌ | | CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | | CLIPSeg | ❌ | ❌ | ✅ | ❌ | ❌ | | CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ | diff --git a/src/transformers/models/clap/__init__.py b/src/transformers/models/clap/__init__.py index 7d7d9f0d8bca..c60aed8857db 100644 --- a/src/transformers/models/clap/__init__.py +++ b/src/transformers/models/clap/__init__.py @@ -30,6 +30,7 @@ ], "feature_extraction_clap": ["CLAPFeatureExtractor"], "processing_clap": ["CLAPProcessor"], + "tokenization_clap": ["CLAPTokenizer"], } try: diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index c0230554d9bb..20b47b7c7b4b 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -213,7 +213,7 @@ class CLAPAudioModelOutput(ModelOutput): @dataclass -# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->CLAP vision->audio +# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->CLAP,vision->audio,Vision->Audio class CLAPOutput(ModelOutput): """ Args: diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index eba7f792eb65..45f5de470093 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1437,42 +1437,42 @@ def __init__(self, *args, **kwargs): CLAP_PRETRAINED_MODEL_ARCHIVE_LIST = None -class CLAPModel(metaclass=DummyObject): +class CLAPAudioModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class CLAPPreTrainedModel(metaclass=DummyObject): +class CLAPAudioModelWithProjection(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class CLAPTextModel(metaclass=DummyObject): +class CLAPModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class CLAPTextModelWithProjection(metaclass=DummyObject): +class CLAPPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class CLAPAudioModel(metaclass=DummyObject): +class CLAPTextModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class CLAPAudioModelWithProjection(metaclass=DummyObject): +class CLAPTextModelWithProjection(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): From fe43210ac0d0ba29b84d7837d34a6ee7c855e1c6 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 7 Feb 2023 13:32:54 +0000 Subject: [PATCH 069/197] consistent behavior ebtween rand_crop and fusion --- .../models/clap/feature_extraction_clap.py | 52 +++++++------------ 1 file changed, 20 insertions(+), 32 deletions(-) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index f0bcfe6d950f..7c3a017cec56 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -209,7 +209,7 @@ def _random_mel_fusion(self, mel, total_frames, chunk_frames): mel_fusion = np.stack([mel_chunk_front, mel_chunk_middle, mel_chunk_back, mel_shrink], axis=0) return mel_fusion - def _get_audio_features(self, waveform: np.array, 
max_length, truncation, padding) -> np.array: + def _get_input_mel(self, waveform: np.array, max_length, truncation, padding) -> np.array: """ Possible cases : - wave > max_length @@ -229,7 +229,7 @@ def _get_audio_features(self, waveform: np.array, max_length, truncation, paddin overflow = len(waveform) - max_length idx = np.random.randint(0, overflow + 1) waveform = waveform[idx : idx + max_length] - input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney) + input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney)[None,:] elif truncation == "fusion": mel = self._np_extract_fbank_features(waveform, self.mel_filters) chunk_frames = max_length // self.hop_length + 1 # the +1 related to how the spectrogram is computed @@ -261,7 +261,7 @@ def _get_audio_features(self, waveform: np.array, max_length, truncation, paddin input_mel = self._np_extract_fbank_features(waveform, self.mel_filters) input_mel = np.stack([input_mel, input_mel, input_mel, input_mel], axis=0) else: - input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney) + input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney)[None, :] return input_mel, longer @@ -270,11 +270,9 @@ def __call__( raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]], truncation: str = "fusion", padding: Optional[str] = "repeatpad", - pad_to_multiple_of: Optional[int] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - return_attention_mask: Optional[bool] = None, max_length: Optional[int] = None, sampling_rate: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, **kwargs, ) -> BatchFeature: """ @@ -284,26 +282,16 @@ def __call__( raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`): The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float values, a list of numpy arrays or a list of list of float values. - truncation (`bool`, *optional*, default to `True`): - Activates truncation to cut input sequences longer than *max_length* to *max_length*. - pad_to_multiple_of (`int`, *optional*, defaults to None): - If set will pad the sequence to a multiple of the provided value. - - This is especially useful to enable the use of np.array Cores on NVIDIA hardware with compute - capability `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of - 128. - return_attention_mask (`bool`, *optional*): - Whether to return the attention mask. If left to the default, will return the attention mask according - to the specific feature_extractor's default. - - [What are attention masks?](../glossary#attention-mask) - - - - For CLAP models, `attention_mask` should always be passed for batched inference, to avoid subtle bugs. - - - + truncation (`str`, *optional*): + Truncation pattern for long audio inputs. Two patterns are available: + - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and a downsampled version of the entire mel spectrogram. These 4 spectrogram will have a dimension of `n_fft, feature_size`. TODO check this + If `config.fusion` is set to True, shorter audios also need to to return 4 mels, which will just be a copy of the original mel obtained from the padded audio. + - `rand_trunc` will select a random crop of the mel spectrogram. + padding (`str`, *optional*): + Padding pattern for shorter audio inputs. 
Three patterns were originaly implemented: + - `repeatpad`: the audio is repeated, and then padded to fit the `max_length`. + - `repeat`: the audio is repeated and then cut to fit the `max_length` + - `pad`: the audio is padded. return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors instead of list of python integers. Acceptable values are: @@ -314,10 +302,10 @@ def __call__( The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition pipeline. - padding_value (`float`, defaults to 0.0): - The value that is used to fill the padding values / vectors. """ - + truncation = truncation if truncation is not None else self.truncation + padding = padding if padding else self.padding + if sampling_rate is not None: if sampling_rate != self.sampling_rate: raise ValueError( @@ -347,9 +335,9 @@ def __call__( if not is_batched: raw_speech = [np.asarray(raw_speech)] - # convert into correct format for padding + # convert to mel spectrogram, truncate and pad if needed. padded_inputs = [ - self._get_audio_features( + self._get_input_mel( waveform, max_length if max_length else self.nb_max_samples, truncation, @@ -370,7 +358,7 @@ def __call__( is_longer[rand_idx] = True if isinstance(input_mel[0], List): - input_mel = [np.asarray(mel, dtype=np.float64) for feature in input_mel] + input_mel = [np.asarray(feature, dtype=np.float64) for feature in input_mel] input_features = {"input_features": input_mel, "is_longer": is_longer} input_features = BatchFeature(input_features) From e44c75e134dda6c1e9ff823cea9204b4a89fe836 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 7 Feb 2023 15:24:37 +0000 Subject: [PATCH 070/197] add numpy resize and bilinear and documentation --- .../models/clap/feature_extraction_clap.py | 102 ++++++++++++++++-- 1 file changed, 94 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 7c3a017cec56..9e51cce06588 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -28,6 +28,99 @@ logger = logging.get_logger(__name__) +import math + +def bilinear_interpolation(image, y, x): + """ + A bilinear interpolation of the estimated values of the `image` at non integer indexes `y` and `x`. 
+ + Original Image at Original Image at + x_floor, y_floor x_floor, y_floor + +---+ +---+ + | +-|-------------------------------|-+ | + +---+ +---+ + | | + | Pixel at (x,y) where | + | x and y non integers | + | +---+ | + | | | | + | +---+ | + +---+ +---+ + | +-|-------------------------------|-+ | + +---+ +---+ + + Original Image at Original Image at + x_floor, y_floor x_floor, y_floor + + The estimated value of the pixel is computed using the following equation : + + $$ + \text{Image}_{x,y} = \frac{1}{(x_1 - x_2)(y_2-y_1)} + \begin{bmatrix} x_2 - x & x - x_1\end{bmatrix} + \begin{bmatrix} + \text{Image}_{x_1,y_1} & \text{Image}_{x_2,y_1}\\ + \text{Image}_{x_1,y_2} & \text{Image}_{x_2,y_2}\\ + \end{bmatrix} + \begin{bmatrix} y_2 - y \\ y-y_2\end{bmatrix} + $$ + + """ + height = image.shape[0] + width = image.shape[1] + + x1 = max(min(math.floor(x), width - 1), 0) + y1 = max(min(math.floor(y), height - 1), 0) + x2 = max(min(math.ceil(x), width - 1), 0) + y2 = max(min(math.ceil(y), height - 1), 0) + + a = image[y1, x1] + b = image[y2, x1] + c = image[y1, x2] + d = image[y2, x2] + + dx = x - x1 + dy = y - y1 + + new_pixel = a * (1 - dx) * (1 - dy) + new_pixel += b * dy * (1 - dx) + new_pixel += c * dx * (1 - dy) + new_pixel += d * dx * dy + return new_pixel + +def resize(image, new_height, new_width): + """ + Taken from `[here](https://stackoverflow.com/questions/70024313/resize-using-bilinear-interpolation-in-python)` with the + torchvision.transforms.Resize(size=[chunk_frames, self.feature_size]) + + + """ + new_image = np.zeros((new_height, new_width), image.dtype) # new_image = [[0 for _ in range(new_width)] for _ in range(new_height)] + + orig_height = image.shape[0] + orig_width = image.shape[1] + + # Compute center column and center row + x_orig_center = (orig_width-1) / 2 + y_orig_center = (orig_height-1) / 2 + + # Compute center of resized image + x_scaled_center = (new_width-1) / 2 + y_scaled_center = (new_height-1) / 2 + + # Compute the scale in both axes + scale_x = orig_width / new_width; + scale_y = orig_height / new_height; + + for y in range(new_height): + for x in range(new_width): + # compute the coordinates of the `new pixel` at `(x, y)` in the original image. + x_ = (x - x_scaled_center) * scale_x + x_orig_center + y_ = (y - y_scaled_center) * scale_y + y_orig_center + + # compute the coordinates of the 4 neighboring points and then compute the bilinear estimate. 
+ new_image[y, x] = bilinear_interpolation(image, y_, x_) + + return new_image class CLAPFeatureExtractor(SequenceFeatureExtractor): r""" @@ -198,14 +291,7 @@ def _random_mel_fusion(self, mel, total_frames, chunk_frames): mel_chunk_middle = mel[idx_middle : idx_middle + chunk_frames, :] mel_chunk_back = mel[idx_back : idx_back + chunk_frames, :] - # shrink the mel TODO add this as a numpy function, also no hard codes `64` - mel_shrink = np.resize(mel, [chunk_frames, self.feature_size]) # current flags are probalby wrong - import torch - - mel_shrink = torchvision.transforms.Resize(size=[chunk_frames, self.feature_size])(torch.tensor(mel[None]))[0] - # logging.info(f"mel_shrink.shape: {mel_shrink.shape}") - - # stack + mel_shrink = resize(mel, chunk_frames, self.feature_size) # current flags are probalby wrong mel_fusion = np.stack([mel_chunk_front, mel_chunk_middle, mel_chunk_back, mel_shrink], axis=0) return mel_fusion From 43df206aa2f404155c03667903ae5482d5569e97 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 7 Feb 2023 15:35:11 +0000 Subject: [PATCH 071/197] move resizing to image utils --- src/transformers/image_utils.py | 94 +++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 9fd563fe388a..34470c0368b9 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -17,6 +17,7 @@ from typing import TYPE_CHECKING, Dict, Iterable, List, Tuple, Union import numpy as np +import math import requests from packaging import version @@ -625,3 +626,96 @@ def rotate(self, image, angle, resample=None, expand=0, center=None, translate=N return image.rotate( angle, resample=resample, expand=expand, center=center, translate=translate, fillcolor=fillcolor ) + +def bilinear_interpolation(image, y, x): + """ + A bilinear interpolation of the estimated values of the `image` at non integer indexes `y` and `x`. 
+ + Original Image at Original Image at + x_1, y_1 x_1, y_2 + +---+ +---+ + | +-|-------------------------------|-+ | + +---+ +---+ + | | + | Pixel at (x,y) where | + | x and y non integers | + | +---+ | + | | | | + | +---+ | + +---+ +---+ + | +-|-------------------------------|-+ | + +---+ +---+ + + Original Image at Original Image at + x_1, y_2 x_2, y_2 + + The estimated value of the pixel is computed using the following equation : + + $$ + \text{Image}_{x,y} = \frac{1}{(x_1 - x_2)(y_2-y_1)} + \begin{bmatrix} x_2 - x & x - x_1\end{bmatrix} + \begin{bmatrix} + \text{Image}_{x_1,y_1} & \text{Image}_{x_2,y_1}\\ + \text{Image}_{x_1,y_2} & \text{Image}_{x_2,y_2}\\ + \end{bmatrix} + \begin{bmatrix} y_2 - y \\ y-y_2\end{bmatrix} + $$ + + For more details about bilinear interplation, see [on the wikipedia page](https://en.wikipedia.org/wiki/Bilinear_interpolation) + """ + height = image.shape[0] + width = image.shape[1] + + x1 = max(min(math.floor(x), width - 1), 0) + y1 = max(min(math.floor(y), height - 1), 0) + x2 = max(min(math.ceil(x), width - 1), 0) + y2 = max(min(math.ceil(y), height - 1), 0) + + a = image[y1, x1] + b = image[y2, x1] + c = image[y1, x2] + d = image[y2, x2] + + dx = x - x1 + dy = y - y1 + + new_pixel = a * (1 - dx) * (1 - dy) + new_pixel += b * dy * (1 - dx) + new_pixel += c * dx * (1 - dy) + new_pixel += d * dx * dy + return new_pixel + +def resize(image, new_height, new_width): + """ + Taken from `[here](https://stackoverflow.com/questions/70024313/resize-using-bilinear-interpolation-in-python)` with the + torchvision.transforms.Resize(size=[chunk_frames, self.feature_size]) + This function is not optimal in terms of performances, but has the same results as the `torchvision.transforms.resize` function + when called with the default `bilinear` interpolation. + """ + new_image = np.zeros((new_height, new_width), image.dtype) # new_image = [[0 for _ in range(new_width)] for _ in range(new_height)] + + orig_height = image.shape[0] + orig_width = image.shape[1] + + # Compute center column and center row + x_orig_center = (orig_width-1) / 2 + y_orig_center = (orig_height-1) / 2 + + # Compute center of resized image + x_scaled_center = (new_width-1) / 2 + y_scaled_center = (new_height-1) / 2 + + # Compute the scale in both axes + scale_x = orig_width / new_width; + scale_y = orig_height / new_height; + + for y in range(new_height): + for x in range(new_width): + # compute the coordinates of the `new pixel` at `(x, y)` in the original image. + x_ = (x - x_scaled_center) * scale_x + x_orig_center + y_ = (y - y_scaled_center) * scale_y + y_orig_center + + # compute the coordinates of the 4 neighboring points and then compute the bilinear estimate. 
+ new_image[y, x] = bilinear_interpolation(image, y_, x_) + + return new_image \ No newline at end of file From 5379926a26c33b4c2da7001d0124efdf0a01e365 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 7 Feb 2023 15:35:24 +0000 Subject: [PATCH 072/197] clean feature extraction --- .../models/clap/feature_extraction_clap.py | 93 ------------------- 1 file changed, 93 deletions(-) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 9e51cce06588..126ca84de4f2 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -28,99 +28,6 @@ logger = logging.get_logger(__name__) -import math - -def bilinear_interpolation(image, y, x): - """ - A bilinear interpolation of the estimated values of the `image` at non integer indexes `y` and `x`. - - Original Image at Original Image at - x_floor, y_floor x_floor, y_floor - +---+ +---+ - | +-|-------------------------------|-+ | - +---+ +---+ - | | - | Pixel at (x,y) where | - | x and y non integers | - | +---+ | - | | | | - | +---+ | - +---+ +---+ - | +-|-------------------------------|-+ | - +---+ +---+ - - Original Image at Original Image at - x_floor, y_floor x_floor, y_floor - - The estimated value of the pixel is computed using the following equation : - - $$ - \text{Image}_{x,y} = \frac{1}{(x_1 - x_2)(y_2-y_1)} - \begin{bmatrix} x_2 - x & x - x_1\end{bmatrix} - \begin{bmatrix} - \text{Image}_{x_1,y_1} & \text{Image}_{x_2,y_1}\\ - \text{Image}_{x_1,y_2} & \text{Image}_{x_2,y_2}\\ - \end{bmatrix} - \begin{bmatrix} y_2 - y \\ y-y_2\end{bmatrix} - $$ - - """ - height = image.shape[0] - width = image.shape[1] - - x1 = max(min(math.floor(x), width - 1), 0) - y1 = max(min(math.floor(y), height - 1), 0) - x2 = max(min(math.ceil(x), width - 1), 0) - y2 = max(min(math.ceil(y), height - 1), 0) - - a = image[y1, x1] - b = image[y2, x1] - c = image[y1, x2] - d = image[y2, x2] - - dx = x - x1 - dy = y - y1 - - new_pixel = a * (1 - dx) * (1 - dy) - new_pixel += b * dy * (1 - dx) - new_pixel += c * dx * (1 - dy) - new_pixel += d * dx * dy - return new_pixel - -def resize(image, new_height, new_width): - """ - Taken from `[here](https://stackoverflow.com/questions/70024313/resize-using-bilinear-interpolation-in-python)` with the - torchvision.transforms.Resize(size=[chunk_frames, self.feature_size]) - - - """ - new_image = np.zeros((new_height, new_width), image.dtype) # new_image = [[0 for _ in range(new_width)] for _ in range(new_height)] - - orig_height = image.shape[0] - orig_width = image.shape[1] - - # Compute center column and center row - x_orig_center = (orig_width-1) / 2 - y_orig_center = (orig_height-1) / 2 - - # Compute center of resized image - x_scaled_center = (new_width-1) / 2 - y_scaled_center = (new_height-1) / 2 - - # Compute the scale in both axes - scale_x = orig_width / new_width; - scale_y = orig_height / new_height; - - for y in range(new_height): - for x in range(new_width): - # compute the coordinates of the `new pixel` at `(x, y)` in the original image. - x_ = (x - x_scaled_center) * scale_x + x_orig_center - y_ = (y - y_scaled_center) * scale_y + y_orig_center - - # compute the coordinates of the 4 neighboring points and then compute the bilinear estimate. 
- new_image[y, x] = bilinear_interpolation(image, y_, x_) - - return new_image class CLAPFeatureExtractor(SequenceFeatureExtractor): r""" From 107c5448d7022d45aa7c0787ecdfeb62550eb92e Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 7 Feb 2023 16:22:26 +0000 Subject: [PATCH 073/197] import resize from correct file --- src/transformers/models/clap/feature_extraction_clap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 126ca84de4f2..dcd8c10c21d8 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -24,7 +24,7 @@ from ...feature_extraction_sequence_utils import SequenceFeatureExtractor from ...feature_extraction_utils import BatchFeature from ...utils import TensorType, logging - +from ...image_utils import resize logger = logging.get_logger(__name__) From 7eb278dd0793d232f94ef51770f7935a40f5f5c9 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 7 Feb 2023 16:22:44 +0000 Subject: [PATCH 074/197] resize in image transforms --- src/transformers/image_transforms.py | 95 +++++++++++++++++++++++++++ src/transformers/image_utils.py | 96 +--------------------------- 2 files changed, 96 insertions(+), 95 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index d09f29b79044..2f1a203fb726 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -17,6 +17,7 @@ from typing import Iterable, List, Optional, Tuple, Union import numpy as np +import math from transformers.image_utils import ( ChannelDimension, @@ -707,3 +708,97 @@ def convert_to_rgb(image: ImageInput) -> ImageInput: image = image.convert("RGB") return image + + +def bilinear_interpolation(image, y, x): + """ + A bilinear interpolation of the estimated values of the `image` at non integer indexes `y` and `x`. 
+ + Original Image at Original Image at + x_1, y_1 x_1, y_2 + +---+ +---+ + | +-|-------------------------------|-+ | + +---+ +---+ + | | + | Pixel at (x,y) where | + | x and y non integers | + | +---+ | + | | | | + | +---+ | + +---+ +---+ + | +-|-------------------------------|-+ | + +---+ +---+ + + Original Image at Original Image at + x_1, y_2 x_2, y_2 + + The estimated value of the pixel is computed using the following equation : + + $$ + \text{Image}_{x,y} = \frac{1}{(x_1 - x_2)(y_2-y_1)} + \begin{bmatrix} x_2 - x & x - x_1\end{bmatrix} + \begin{bmatrix} + \text{Image}_{x_1,y_1} & \text{Image}_{x_2,y_1}\\ + \text{Image}_{x_1,y_2} & \text{Image}_{x_2,y_2}\\ + \end{bmatrix} + \begin{bmatrix} y_2 - y \\ y-y_2\end{bmatrix} + $$ + + For more details about bilinear interplation, see [on the wikipedia page](https://en.wikipedia.org/wiki/Bilinear_interpolation) + """ + height = image.shape[0] + width = image.shape[1] + + x1 = max(min(math.floor(x), width - 1), 0) + y1 = max(min(math.floor(y), height - 1), 0) + x2 = max(min(math.ceil(x), width - 1), 0) + y2 = max(min(math.ceil(y), height - 1), 0) + + a = image[y1, x1] + b = image[y2, x1] + c = image[y1, x2] + d = image[y2, x2] + + dx = x - x1 + dy = y - y1 + + new_pixel = a * (1 - dx) * (1 - dy) + new_pixel += b * dy * (1 - dx) + new_pixel += c * dx * (1 - dy) + new_pixel += d * dx * dy + return new_pixel + +def resize(image, new_height, new_width): + """ + Taken from `[here](https://stackoverflow.com/questions/70024313/resize-using-bilinear-interpolation-in-python)` with the + torchvision.transforms.Resize(size=[chunk_frames, self.feature_size]) + This function is not optimal in terms of performances, but has the same results as the `torchvision.transforms.resize` function + when called with the default `bilinear` interpolation. + """ + new_image = np.zeros((new_height, new_width), image.dtype) # new_image = [[0 for _ in range(new_width)] for _ in range(new_height)] + + orig_height = image.shape[0] + orig_width = image.shape[1] + + # Compute center column and center row + x_orig_center = (orig_width-1) / 2 + y_orig_center = (orig_height-1) / 2 + + # Compute center of resized image + x_scaled_center = (new_width-1) / 2 + y_scaled_center = (new_height-1) / 2 + + # Compute the scale in both axes + scale_x = orig_width / new_width; + scale_y = orig_height / new_height; + + for y in range(new_height): + for x in range(new_width): + # compute the coordinates of the `new pixel` at `(x, y)` in the original image. + x_ = (x - x_scaled_center) * scale_x + x_orig_center + y_ = (y - y_scaled_center) * scale_y + y_orig_center + + # compute the coordinates of the 4 neighboring points and then compute the bilinear estimate. + new_image[y, x] = bilinear_interpolation(image, y_, x_) + + return new_image \ No newline at end of file diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 34470c0368b9..21afe109a3c2 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -17,7 +17,6 @@ from typing import TYPE_CHECKING, Dict, Iterable, List, Tuple, Union import numpy as np -import math import requests from packaging import version @@ -625,97 +624,4 @@ def rotate(self, image, angle, resample=None, expand=0, center=None, translate=N return image.rotate( angle, resample=resample, expand=expand, center=center, translate=translate, fillcolor=fillcolor - ) - -def bilinear_interpolation(image, y, x): - """ - A bilinear interpolation of the estimated values of the `image` at non integer indexes `y` and `x`. 
- - Original Image at Original Image at - x_1, y_1 x_1, y_2 - +---+ +---+ - | +-|-------------------------------|-+ | - +---+ +---+ - | | - | Pixel at (x,y) where | - | x and y non integers | - | +---+ | - | | | | - | +---+ | - +---+ +---+ - | +-|-------------------------------|-+ | - +---+ +---+ - - Original Image at Original Image at - x_1, y_2 x_2, y_2 - - The estimated value of the pixel is computed using the following equation : - - $$ - \text{Image}_{x,y} = \frac{1}{(x_1 - x_2)(y_2-y_1)} - \begin{bmatrix} x_2 - x & x - x_1\end{bmatrix} - \begin{bmatrix} - \text{Image}_{x_1,y_1} & \text{Image}_{x_2,y_1}\\ - \text{Image}_{x_1,y_2} & \text{Image}_{x_2,y_2}\\ - \end{bmatrix} - \begin{bmatrix} y_2 - y \\ y-y_2\end{bmatrix} - $$ - - For more details about bilinear interplation, see [on the wikipedia page](https://en.wikipedia.org/wiki/Bilinear_interpolation) - """ - height = image.shape[0] - width = image.shape[1] - - x1 = max(min(math.floor(x), width - 1), 0) - y1 = max(min(math.floor(y), height - 1), 0) - x2 = max(min(math.ceil(x), width - 1), 0) - y2 = max(min(math.ceil(y), height - 1), 0) - - a = image[y1, x1] - b = image[y2, x1] - c = image[y1, x2] - d = image[y2, x2] - - dx = x - x1 - dy = y - y1 - - new_pixel = a * (1 - dx) * (1 - dy) - new_pixel += b * dy * (1 - dx) - new_pixel += c * dx * (1 - dy) - new_pixel += d * dx * dy - return new_pixel - -def resize(image, new_height, new_width): - """ - Taken from `[here](https://stackoverflow.com/questions/70024313/resize-using-bilinear-interpolation-in-python)` with the - torchvision.transforms.Resize(size=[chunk_frames, self.feature_size]) - This function is not optimal in terms of performances, but has the same results as the `torchvision.transforms.resize` function - when called with the default `bilinear` interpolation. - """ - new_image = np.zeros((new_height, new_width), image.dtype) # new_image = [[0 for _ in range(new_width)] for _ in range(new_height)] - - orig_height = image.shape[0] - orig_width = image.shape[1] - - # Compute center column and center row - x_orig_center = (orig_width-1) / 2 - y_orig_center = (orig_height-1) / 2 - - # Compute center of resized image - x_scaled_center = (new_width-1) / 2 - y_scaled_center = (new_height-1) / 2 - - # Compute the scale in both axes - scale_x = orig_width / new_width; - scale_y = orig_height / new_height; - - for y in range(new_height): - for x in range(new_width): - # compute the coordinates of the `new pixel` at `(x, y)` in the original image. - x_ = (x - x_scaled_center) * scale_x + x_orig_center - y_ = (y - y_scaled_center) * scale_y + y_orig_center - - # compute the coordinates of the 4 neighboring points and then compute the bilinear estimate. 
- new_image[y, x] = bilinear_interpolation(image, y_, x_) - - return new_image \ No newline at end of file + ) \ No newline at end of file From a4c1940e73f8fcfb101fbf6987619ec90a1d5de5 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 7 Feb 2023 16:29:28 +0000 Subject: [PATCH 075/197] update --- src/transformers/image_transforms.py | 2 +- src/transformers/models/clap/feature_extraction_clap.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 2f1a203fb726..c2361b18d115 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -768,7 +768,7 @@ def bilinear_interpolation(image, y, x): new_pixel += d * dx * dy return new_pixel -def resize(image, new_height, new_width): +def np_bilinear_resize(image, new_height, new_width): """ Taken from `[here](https://stackoverflow.com/questions/70024313/resize-using-bilinear-interpolation-in-python)` with the torchvision.transforms.Resize(size=[chunk_frames, self.feature_size]) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index dcd8c10c21d8..80223adf105a 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -19,12 +19,11 @@ from typing import Any, Dict, List, Optional, Union import numpy as np -import torchvision from ...feature_extraction_sequence_utils import SequenceFeatureExtractor from ...feature_extraction_utils import BatchFeature from ...utils import TensorType, logging -from ...image_utils import resize +from ...image_transforms import np_bilinear_resize logger = logging.get_logger(__name__) @@ -198,7 +197,7 @@ def _random_mel_fusion(self, mel, total_frames, chunk_frames): mel_chunk_middle = mel[idx_middle : idx_middle + chunk_frames, :] mel_chunk_back = mel[idx_back : idx_back + chunk_frames, :] - mel_shrink = resize(mel, chunk_frames, self.feature_size) # current flags are probalby wrong + mel_shrink = np_bilinear_resize(mel, chunk_frames, self.feature_size) # current flags are probalby wrong mel_fusion = np.stack([mel_chunk_front, mel_chunk_middle, mel_chunk_back, mel_shrink], axis=0) return mel_fusion From 9594cc5ab45f40dfde638009d32de0b25998f44b Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 7 Feb 2023 16:50:24 +0000 Subject: [PATCH 076/197] style --- docs/source/en/model_doc/clap.mdx | 26 ++++++++++--------- src/transformers/image_transforms.py | 21 ++++++++------- src/transformers/image_utils.py | 2 +- .../models/clap/feature_extraction_clap.py | 19 +++++--------- 4 files changed, 34 insertions(+), 34 deletions(-) diff --git a/docs/source/en/model_doc/clap.mdx b/docs/source/en/model_doc/clap.mdx index b9c19a0b6682..c5918ca576da 100644 --- a/docs/source/en/model_doc/clap.mdx +++ b/docs/source/en/model_doc/clap.mdx @@ -14,25 +14,27 @@ specific language governing permissions and limitations under the License. ## Overview -The clap model was proposed in []() by . - +The clap model was proposed in [Large Scale Constrastive Laungaue-Audio pretraining with +feature fusion and keyword-to-caption augmentation](https://arxiv.org/pdf/2211.06687.pdf) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. + +The CLAP model uses a SWINTransformer on the input fused mel spectrogram as the audio encoder, and a ROBerta model for the text emcoder. 
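A rough usage sketch for the audio side added in this patch series (default constructor arguments are assumed here, and the exact keyword arguments may still change):

```python
import numpy as np
from transformers import CLAPFeatureExtractor

# Stand-in audio clip: 10 seconds of a 440 Hz tone; a real example would load a waveform instead.
waveform = np.sin(2 * np.pi * 440 * np.linspace(0, 10, 10 * 48_000)).astype(np.float32)

# `truncation="fusion"` stacks three random mel crops plus a shrunk copy of the full spectrogram,
# `padding="repeatpad"` repeats and then pads clips shorter than the maximum length.
feature_extractor = CLAPFeatureExtractor()
inputs = feature_extractor(waveform, truncation="fusion", padding="repeatpad", return_tensors="np")

print(inputs["input_features"].shape)  # (batch, 4, frames, mel bins) when fusion is used
print(inputs["is_longer"])             # whether each clip exceeded the maximum length
```

The resulting `input_features` are what `CLAPModel` consumes together with the tokenized text, returning audio-text similarity logits (`logits_per_text` in the current tests).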
The abstract from the paper is the following: -** +*Contrastive learning has shown remarkable success in the field of multimodal representation learning. In this paper, we propose a pipeline of contrastive language-audio pretraining to develop an audio representation by combining audio data with natural language descriptions. To accomplish this target, we first release LAION-Audio-630K, a large collection of 633,526 audio-text pairs from different data sources. Second, we construct a contrastive language-audio pretraining model by considering different audio encoders and text encoders. We incorporate the feature fusion mechanism and keyword-to-caption augmentation into the model design to further enable the model to process audio inputs of variable lengths and enhance the performance. Third, we perform comprehensive experiments to evaluate our model across three tasks: text-to-audio retrieval, zero-shot audio classification, and supervised audio classification. The results demonstrate that our model achieves superior performance in text-to-audio retrieval task. In audio classification tasks, the model achieves state-of-the-art performance in the zeroshot setting and is able to obtain performance comparable to models' results in the non-zero-shot setting. LAION-Audio-6* Tips: - +- TODOS -This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). -The original code can be found [here](). +This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) and [Arthur Zucker](https://huggingface.co/ArtZucker) . +The original code can be found [here](https://github.com/LAION-AI/CLAP). ## CLAPConfig [[autodoc]] CLAPConfig - - from_text_vision_configs + - from_text_audio_configs ## CLAPTextConfig @@ -79,13 +81,13 @@ The original code can be found [here](). [[autodoc]] CLAPTextModelWithProjection - forward -## CLAPAudioModelWithProjection +## CLAPAudioModel -[[autodoc]] CLAPAudioModelWithProjection +[[autodoc]] CLAPAudioModel - forward +## CLAPAudioModelWithProjection -## CLAPAudioModel - -[[autodoc]] CLAPAudioModel +[[autodoc]] CLAPAudioModelWithProjection - forward + diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index c2361b18d115..6dc4e8a77b8b 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -768,29 +768,32 @@ def bilinear_interpolation(image, y, x): new_pixel += d * dx * dy return new_pixel + def np_bilinear_resize(image, new_height, new_width): """ - Taken from `[here](https://stackoverflow.com/questions/70024313/resize-using-bilinear-interpolation-in-python)` with the + Taken from `[here](https://stackoverflow.com/questions/70024313/resize-using-bilinear-interpolation-in-python)` with the torchvision.transforms.Resize(size=[chunk_frames, self.feature_size]) This function is not optimal in terms of performances, but has the same results as the `torchvision.transforms.resize` function when called with the default `bilinear` interpolation. 
""" - new_image = np.zeros((new_height, new_width), image.dtype) # new_image = [[0 for _ in range(new_width)] for _ in range(new_height)] + new_image = np.zeros( + (new_height, new_width), image.dtype + ) # new_image = [[0 for _ in range(new_width)] for _ in range(new_height)] orig_height = image.shape[0] orig_width = image.shape[1] # Compute center column and center row - x_orig_center = (orig_width-1) / 2 - y_orig_center = (orig_height-1) / 2 + x_orig_center = (orig_width - 1) / 2 + y_orig_center = (orig_height - 1) / 2 # Compute center of resized image - x_scaled_center = (new_width-1) / 2 - y_scaled_center = (new_height-1) / 2 + x_scaled_center = (new_width - 1) / 2 + y_scaled_center = (new_height - 1) / 2 # Compute the scale in both axes - scale_x = orig_width / new_width; - scale_y = orig_height / new_height; + scale_x = orig_width / new_width + scale_y = orig_height / new_height for y in range(new_height): for x in range(new_width): @@ -801,4 +804,4 @@ def np_bilinear_resize(image, new_height, new_width): # compute the coordinates of the 4 neighboring points and then compute the bilinear estimate. new_image[y, x] = bilinear_interpolation(image, y_, x_) - return new_image \ No newline at end of file + return new_image diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 21afe109a3c2..9fd563fe388a 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -624,4 +624,4 @@ def rotate(self, image, angle, resample=None, expand=0, center=None, translate=N return image.rotate( angle, resample=resample, expand=expand, center=center, translate=translate, fillcolor=fillcolor - ) \ No newline at end of file + ) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 80223adf105a..77af3822b5a2 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -62,10 +62,10 @@ class CLAPFeatureExtractor(SequenceFeatureExtractor): truncation (`str`, *optional*, `"fusions"`): Truncation pattern for long audio inputs. Two patterns are available: - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and a downsampled version of the entire mel spectrogram. These 4 spectrogram will have a dimension of `n_fft, feature_size`. TODO check this - If `config.fusion` is set to True, shorter audios also need to to return 4 mels, which will just be a copy of the original mel obtained from the padded audio. + If `config.fusion` is set to True, shorter audios also need to to return 4 mels, which will just be a copy of the original mel obtained from the padded audio. - `rand_trunc` will select a random crop of the mel spectrogram. padding (`str`, *optional*, `"repeatpad"`): - Padding pattern for shorter audio inputs. Three patterns were originaly implemented: + Padding pattern for shorter audio inputs. 
Three patterns were originaly implemented: - `repeatpad`: - `repeat`: - `pad`: @@ -221,7 +221,7 @@ def _get_input_mel(self, waveform: np.array, max_length, truncation, padding) -> overflow = len(waveform) - max_length idx = np.random.randint(0, overflow + 1) waveform = waveform[idx : idx + max_length] - input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney)[None,:] + input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney)[None, :] elif truncation == "fusion": mel = self._np_extract_fbank_features(waveform, self.mel_filters) chunk_frames = max_length // self.hop_length + 1 # the +1 related to how the spectrogram is computed @@ -277,10 +277,10 @@ def __call__( truncation (`str`, *optional*): Truncation pattern for long audio inputs. Two patterns are available: - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and a downsampled version of the entire mel spectrogram. These 4 spectrogram will have a dimension of `n_fft, feature_size`. TODO check this - If `config.fusion` is set to True, shorter audios also need to to return 4 mels, which will just be a copy of the original mel obtained from the padded audio. + If `config.fusion` is set to True, shorter audios also need to to return 4 mels, which will just be a copy of the original mel obtained from the padded audio. - `rand_trunc` will select a random crop of the mel spectrogram. padding (`str`, *optional*): - Padding pattern for shorter audio inputs. Three patterns were originaly implemented: + Padding pattern for shorter audio inputs. Three patterns were originaly implemented: - `repeatpad`: the audio is repeated, and then padded to fit the `max_length`. - `repeat`: the audio is repeated and then cut to fit the `max_length` - `pad`: the audio is padded. @@ -297,7 +297,7 @@ def __call__( """ truncation = truncation if truncation is not None else self.truncation padding = padding if padding else self.padding - + if sampling_rate is not None: if sampling_rate != self.sampling_rate: raise ValueError( @@ -329,12 +329,7 @@ def __call__( # convert to mel spectrogram, truncate and pad if needed. padded_inputs = [ - self._get_input_mel( - waveform, - max_length if max_length else self.nb_max_samples, - truncation, - padding - ) + self._get_input_mel(waveform, max_length if max_length else self.nb_max_samples, truncation, padding) for waveform in raw_speech ] From 51d2b6455fece584d45fd6adbc0bf2d723d14adc Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 7 Feb 2023 17:03:15 +0000 Subject: [PATCH 077/197] style --- src/transformers/image_transforms.py | 52 +++++++------------ .../models/clap/feature_extraction_clap.py | 32 ++++++++---- 2 files changed, 41 insertions(+), 43 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 6dc4e8a77b8b..50dcc4459970 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -13,11 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import math import warnings from typing import Iterable, List, Optional, Tuple, Union import numpy as np -import math from transformers.image_utils import ( ChannelDimension, @@ -712,39 +712,25 @@ def convert_to_rgb(image: ImageInput) -> ImageInput: def bilinear_interpolation(image, y, x): """ - A bilinear interpolation of the estimated values of the `image` at non integer indexes `y` and `x`. 
+ A bilinear interpolation of the estimated values of the `image` at non integer indexes `y` and `x`. - Original Image at Original Image at - x_1, y_1 x_1, y_2 - +---+ +---+ - | +-|-------------------------------|-+ | - +---+ +---+ - | | - | Pixel at (x,y) where | - | x and y non integers | - | +---+ | - | | | | - | +---+ | - +---+ +---+ - | +-|-------------------------------|-+ | - +---+ +---+ + Original Image at Original Image at + x_1, y_1 x_1, y_2 + +---+ +---+ | +-|-------------------------------|-+ | +---+ +---+ + | | | Pixel at (x,y) where | | x and y non integers | | +---+ | | | | | | +---+ | + +---+ +---+ | +-|-------------------------------|-+ | +---+ +---+ - Original Image at Original Image at - x_1, y_2 x_2, y_2 + Original Image at Original Image at + x_1, y_2 x_2, y_2 - The estimated value of the pixel is computed using the following equation : + The estimated value of the pixel is computed using the following equation : - $$ - \text{Image}_{x,y} = \frac{1}{(x_1 - x_2)(y_2-y_1)} - \begin{bmatrix} x_2 - x & x - x_1\end{bmatrix} - \begin{bmatrix} - \text{Image}_{x_1,y_1} & \text{Image}_{x_2,y_1}\\ - \text{Image}_{x_1,y_2} & \text{Image}_{x_2,y_2}\\ - \end{bmatrix} - \begin{bmatrix} y_2 - y \\ y-y_2\end{bmatrix} - $$ + $$ \text{Image}_{x,y} = \frac{1}{(x_1 - x_2)(y_2-y_1)} \begin{bmatrix} x_2 - x & x - x_1\end{bmatrix} + \begin{bmatrix} \text{Image}_{x_1,y_1} & \text{Image}_{x_2,y_1}\\ \text{Image}_{x_1,y_2} & \text{Image}_{x_2,y_2}\\ + \end{bmatrix} \begin{bmatrix} y_2 - y \\ y-y_2\end{bmatrix} $$ - For more details about bilinear interplation, see [on the wikipedia page](https://en.wikipedia.org/wiki/Bilinear_interpolation) + For more details about bilinear interplation, see [on the wikipedia + page](https://en.wikipedia.org/wiki/Bilinear_interpolation) """ height = image.shape[0] width = image.shape[1] @@ -771,10 +757,10 @@ def bilinear_interpolation(image, y, x): def np_bilinear_resize(image, new_height, new_width): """ - Taken from `[here](https://stackoverflow.com/questions/70024313/resize-using-bilinear-interpolation-in-python)` with the - torchvision.transforms.Resize(size=[chunk_frames, self.feature_size]) - This function is not optimal in terms of performances, but has the same results as the `torchvision.transforms.resize` function - when called with the default `bilinear` interpolation. + Taken from `[here](https://stackoverflow.com/questions/70024313/resize-using-bilinear-interpolation-in-python)` + with the torchvision.transforms.Resize(size=[chunk_frames, self.feature_size]) This function is not optimal in + terms of performances, but has the same results as the `torchvision.transforms.resize` function when called with + the default `bilinear` interpolation. 
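One way to spot-check the parity claim above is to compare against `torch.nn.functional.interpolate`, used here as a stand-in for the torchvision transform (a sketch assuming `antialias=False`; the achievable tolerance depends on the torch version):

```python
import numpy as np
import torch
import torch.nn.functional as F

from transformers.image_transforms import np_bilinear_resize

mel = np.random.rand(1001, 64).astype(np.float32)

ours = np_bilinear_resize(mel, 224, 64)
reference = F.interpolate(
    torch.from_numpy(mel)[None, None],  # add batch and channel dimensions
    size=(224, 64),
    mode="bilinear",
    align_corners=False,
    antialias=False,
)[0, 0].numpy()

print(np.abs(ours - reference).max())  # expected to be at float32 precision level
```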
""" new_image = np.zeros( (new_height, new_width), image.dtype diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 77af3822b5a2..52450a783b6c 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -22,8 +22,9 @@ from ...feature_extraction_sequence_utils import SequenceFeatureExtractor from ...feature_extraction_utils import BatchFeature -from ...utils import TensorType, logging from ...image_transforms import np_bilinear_resize +from ...utils import TensorType, logging + logger = logging.get_logger(__name__) @@ -40,11 +41,14 @@ class CLAPFeatureExtractor(SequenceFeatureExtractor): Args: feature_size (`int`, defaults to 80): - The feature dimension of the extracted MEL spectrograms. This corresponds to the number of frequency bins (intervals) that are computer, for each fourrier step. + The feature dimension of the extracted MEL spectrograms. This corresponds to the number of frequency bins + (intervals) that are computer, for each fourrier step. sampling_rate (`int`, defaults to 16000): - The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). This only serves to warn users if the audio fed to the feature extractor does not have the same sampling rate. + The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). This only serves + to warn users if the audio fed to the feature extractor does not have the same sampling rate. hop_length (`int`, defaults to 160): - Length of the overlaping windows for the STFT used to obtain the Mel Spectrogram. The audio will be split in smaller `frames` with a step of `hop_length` between each frame. + Length of the overlaping windows for the STFT used to obtain the Mel Spectrogram. The audio will be split + in smaller `frames` with a step of `hop_length` between each frame. chunk_length_s (`int`, defaults to 10): The maximum input lenght of the model in seconds. This is used to pad the audio. n_fft (`int`, defaults to 400): @@ -58,11 +62,15 @@ class CLAPFeatureExtractor(SequenceFeatureExtractor): frequency_max (`float`, *optional*, 14_000): The highest frequency of interest. The STFT TODO (not sure) will not be computed for values above this. top_db (`float`, *optional*): - The highest decibel value used to convert the mel spectrogram to the log scale. For more details see the `SequenceFeatureExtractor._power_to_db` function + The highest decibel value used to convert the mel spectrogram to the log scale. For more details see the + `SequenceFeatureExtractor._power_to_db` function truncation (`str`, *optional*, `"fusions"`): Truncation pattern for long audio inputs. Two patterns are available: - - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and a downsampled version of the entire mel spectrogram. These 4 spectrogram will have a dimension of `n_fft, feature_size`. TODO check this - If `config.fusion` is set to True, shorter audios also need to to return 4 mels, which will just be a copy of the original mel obtained from the padded audio. + - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and a + downsampled version of the entire mel spectrogram. These 4 spectrogram will have a dimension of + `n_fft, feature_size`. 
TODO check this + If `config.fusion` is set to True, shorter audios also need to to return 4 mels, which will just be a copy + of the original mel obtained from the padded audio. - `rand_trunc` will select a random crop of the mel spectrogram. padding (`str`, *optional*, `"repeatpad"`): Padding pattern for shorter audio inputs. Three patterns were originaly implemented: @@ -131,7 +139,8 @@ def to_dict(self) -> Dict[str, Any]: Serializes this instance to a Python dictionary. Returns: - `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, excpet for the mel filter banks, which do not need to be saved or printed as they are too long. + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, excpet for the + mel filter banks, which do not need to be saved or printed as they are too long. """ output = copy.deepcopy(self.__dict__) output["feature_extractor_type"] = self.__class__.__name__ @@ -276,8 +285,11 @@ def __call__( values, a list of numpy arrays or a list of list of float values. truncation (`str`, *optional*): Truncation pattern for long audio inputs. Two patterns are available: - - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and a downsampled version of the entire mel spectrogram. These 4 spectrogram will have a dimension of `n_fft, feature_size`. TODO check this - If `config.fusion` is set to True, shorter audios also need to to return 4 mels, which will just be a copy of the original mel obtained from the padded audio. + - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and + a downsampled version of the entire mel spectrogram. These 4 spectrogram will have a dimension of + `n_fft, feature_size`. TODO check this + If `config.fusion` is set to True, shorter audios also need to to return 4 mels, which will just be a + copy of the original mel obtained from the padded audio. - `rand_trunc` will select a random crop of the mel spectrogram. padding (`str`, *optional*): Padding pattern for shorter audio inputs. Three patterns were originaly implemented: From ce79e64b57e9db63f81ea3579fefd5ff95cd5bfe Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 7 Feb 2023 19:36:17 +0000 Subject: [PATCH 078/197] nit --- src/transformers/models/clap/feature_extraction_clap.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 52450a783b6c..63312ae6da93 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -156,11 +156,9 @@ def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[n - self. 
""" window = np.hanning(self.n_fft + 1)[:-1] - frames = self._fram_wave(waveform) stft = self._stft(frames, window=window) - # if the imaginary parts are taken : (real, imag) = stftl; real ** 2 + imag ** 2 magnitudes = np.abs(stft) ** 2 mel_spec = np.matmul(mel_filters.T, magnitudes) log_mel_spec = self._power_to_db(mel_spec) @@ -198,9 +196,9 @@ def _random_mel_fusion(self, mel, total_frames, chunk_frames): # if the audio is too short, we just use the first chunk ranges[2] = [0] # randomly choose index for each part - idx_front = np.random.choice(ranges[0]) - idx_middle = np.random.choice(ranges[1]) - idx_back = np.random.choice(ranges[2]) + idx_front = np.random.choice(ranges[0]) #172 + idx_middle = np.random.choice(ranges[1]) #508 + idx_back = np.random.choice(ranges[2]) #1039 # select mel mel_chunk_front = mel[idx_front : idx_front + chunk_frames, :] mel_chunk_middle = mel[idx_middle : idx_middle + chunk_frames, :] From 1bfd7ddfd13ac301597a85aeb54d95860354170e Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 7 Feb 2023 19:57:29 +0000 Subject: [PATCH 079/197] remove unused arguments form the feature extractor --- src/transformers/models/clap/feature_extraction_clap.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 63312ae6da93..e12eb50760d9 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -109,9 +109,8 @@ def __init__( self.padding = padding self.n_fft = n_fft self.hop_length = hop_length - self.chunk_length = chunk_length_s + self.chunk_length_s = chunk_length_s self.nb_max_samples = chunk_length_s * sampling_rate - self.nb_max_frames = self.nb_max_samples // hop_length self.sampling_rate = sampling_rate self.frequency_min = frequency_min self.frequency_max = frequency_max From 6794f1633c199bb769517cbc7ab0a88a7fa37310 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 7 Feb 2023 19:57:54 +0000 Subject: [PATCH 080/197] style --- src/transformers/models/clap/feature_extraction_clap.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index e12eb50760d9..4b3114cd20ea 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -195,9 +195,9 @@ def _random_mel_fusion(self, mel, total_frames, chunk_frames): # if the audio is too short, we just use the first chunk ranges[2] = [0] # randomly choose index for each part - idx_front = np.random.choice(ranges[0]) #172 - idx_middle = np.random.choice(ranges[1]) #508 - idx_back = np.random.choice(ranges[2]) #1039 + idx_front = np.random.choice(ranges[0]) # 172 + idx_middle = np.random.choice(ranges[1]) # 508 + idx_back = np.random.choice(ranges[2]) # 1039 # select mel mel_chunk_front = mel[idx_front : idx_front + chunk_frames, :] mel_chunk_middle = mel[idx_middle : idx_middle + chunk_frames, :] From 1b7b39d8d7445603a023a9418b9e6ad2082b6702 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 8 Feb 2023 09:28:01 +0000 Subject: [PATCH 081/197] few fixes + make fixup --- .../models/clap/configuration_clap.py | 11 +- src/transformers/models/clap/modeling_clap.py | 89 +++++++++++-- tests/models/clap/test_modeling_clap.py | 118 ++++++++++-------- 3 files changed, 153 insertions(+), 65 deletions(-) diff --git 
a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index c6828d8d17e9..506656ab0431 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -112,6 +112,7 @@ def __init__( max_position_embeddings=514, type_vocab_size=1, initializer_range=0.02, + initializer_factor=1.0, layer_norm_eps=1e-12, projection_hidden_size=768, pad_token_id=1, @@ -136,6 +137,7 @@ def __init__( self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range + self.initializer_factor = initializer_factor self.layer_norm_eps = layer_norm_eps self.position_embedding_type = position_embedding_type self.use_cache = use_cache @@ -223,7 +225,6 @@ def __init__( patch_stride=(4, 4), num_classes=527, hidden_size=96, - embed_dim=96, projection_hidden_size=768, depths=[2, 2, 6, 2], num_heads=[4, 8, 16, 32], @@ -242,7 +243,9 @@ def __init__( mlp_ratio=4.0, aff_block_r=4, enable_patch_fusion=False, + projection_hidden_act="relu", layer_norm_eps=1e-5, + initializer_factor=1.0, **kwargs, ): super().__init__(**kwargs) @@ -253,7 +256,6 @@ def __init__( self.patch_stride = patch_stride self.num_classes = num_classes self.hidden_size = hidden_size - self.embed_dim = embed_dim self.depths = depths self.num_heads = num_heads self.window_size = window_size @@ -275,6 +277,8 @@ def __init__( self.aff_block_r = aff_block_r self.enable_patch_fusion = enable_patch_fusion self.layer_norm_eps = layer_norm_eps + self.initializer_factor = initializer_factor + self.projection_hidden_act = projection_hidden_act @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": @@ -350,6 +354,7 @@ def __init__( fusion_num_hidden_layers=2, projection_dim=512, projection_hidden_act="relu", + initializer_factor=1.0, **kwargs, ): super().__init__(**kwargs) @@ -387,7 +392,7 @@ def __init__( self.hidden_size = self.text_config.hidden_size self.logit_scale_init_value = logit_scale_init_value - self.initializer_factor = 1.0 + self.initializer_factor = initializer_factor @classmethod def from_text_audio_configs(cls, text_config: CLAPTextConfig, audio_config: CLAPAudioConfig, **kwargs): diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 20b47b7c7b4b..ebe750b96a05 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -204,12 +204,19 @@ class CLAPAudioModelOutput(ModelOutput): Sequence of hidden-states at the output of the last layer of the model. embedding (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. 
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + """ framewise_output: torch.FloatTensor = None clipwise_output: torch.FloatTensor = None fine_grained_embedding: torch.FloatTensor = None embedding: torch.FloatTensor = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + projection_output: Optional[torch.FloatTensor] = None @dataclass @@ -877,7 +884,7 @@ def __init__(self, config): [ CLAPAudioStage( config=config, - dim=int(config.embed_dim * 2**i_layer), + dim=int(config.hidden_size * 2**i_layer), input_resolution=self.input_resolutions[i_layer], depth=config.depths[i_layer], num_heads=config.num_heads[i_layer], @@ -953,6 +960,8 @@ def forward( always_partition: Optional[bool] = False, return_dict: Optional[bool] = True, ) -> Union[Tuple, CLAPAudioModelOutput]: + # print(input_features.shape, self.enable_fusion) + input_features = input_features.transpose(1, 3) hidden_states = self.bn0(input_features) hidden_states = hidden_states.transpose(1, 3) @@ -972,7 +981,7 @@ def forward( all_reshaped_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None - input_dimensions = None + input_dimensions = self.input_resolutions[0] if output_hidden_states: batch_size, _, hidden_size = hidden_states.shape @@ -1078,6 +1087,8 @@ def custom_forward(*inputs): clipwise_output=torch.sigmoid(hidden_states), fine_grained_embedding=fine_grained_latent_output, embedding=latent_output, + attentions=all_self_attentions, + hidden_states=all_reshaped_hidden_states, ) @@ -1199,7 +1210,7 @@ def forward(self, hidden_states): class CLAPProjectionLayer(nn.Module): - def __init__(self, config: CLAPTextConfig): + def __init__(self, config: CLAPAudioConfig): super().__init__() self.config = config hidden_size = config.projection_hidden_size @@ -1748,19 +1759,63 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return pooled_output +# class CLAPPreTrainedModel(PreTrainedModel): +# """ +# An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained # +models. #""" + +# config_class = CLAPConfig +# base_model_prefix = "clap" +# supports_gradient_checkpointing = True +# _keys_to_ignore_on_load_missing = [r"position_ids", r"logit_scale_a", r"logit_scale_t"] + +# def _init_weights(self, module): +# pass + + class CLAPPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. 
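The `_init_weights` below scales a base standard deviation of `hidden_size**-0.5` by `(2 * num_hidden_layers)**-0.5` and by the configurable `initializer_factor`. A standalone illustration of that arithmetic, with arbitrary numbers:

```python
from torch import nn

def attention_init_std(hidden_size: int, num_hidden_layers: int, factor: float = 1.0) -> float:
    # Same expression as `in_proj_std` in `_init_weights`.
    return (hidden_size ** -0.5) * ((2 * num_hidden_layers) ** -0.5) * factor

std = attention_init_std(hidden_size=32, num_hidden_layers=4)
print(f"{std:.4f}")  # 0.0625, i.e. 1 / sqrt(32 * 8)

layer = nn.Linear(32, 32)
nn.init.normal_(layer.weight, mean=0.0, std=std)
print(float(layer.weight.std()))  # empirically close to 0.0625
```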
""" - config_class = CLAPConfig + config_class = CLAPTextConfig base_model_prefix = "clap" supports_gradient_checkpointing = True _keys_to_ignore_on_load_missing = [r"position_ids", r"logit_scale_a", r"logit_scale_t"] def _init_weights(self, module): - pass + """Initialize the weights""" + factor = self.config.initializer_factor + if isinstance(module, CLAPTextEmbeddings): + module.word_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02) + module.position_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02) + module.token_type_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02) + elif isinstance(module, CLAPTextSelfAttention): + factor = self.config.initializer_factor + in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor + nn.init.normal_(module.query.weight, std=in_proj_std) + nn.init.normal_(module.key.weight, std=in_proj_std) + nn.init.normal_(module.value.weight, std=in_proj_std) + elif isinstance(module, (CLAPTextSelfOutput, CLAPTextOutput, CLAPTextIntermediate, CLAPTextPooler)): + factor = self.config.initializer_factor + in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor + nn.init.normal_(module.dense.weight, std=in_proj_std) + elif isinstance(module, CLAPProjectionLayer): + factor = self.config.initializer_factor + in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor + nn.init.normal_(module.linear1.weight, std=in_proj_std) + nn.init.normal_(module.linear2.weight, std=in_proj_std) + + if isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, CLAPTextEncoder): + module.gradient_checkpointing = value class CLAPAudioModel(CLAPPreTrainedModel): @@ -1808,6 +1863,10 @@ def forward( >>> pooled_output = outputs.pooler_output # pooled CLS states ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) return self.audio_encoder( input_features=input_features, @@ -1834,6 +1893,7 @@ class CLAPTextModel(CLAPPreTrainedModel): """ + config_class = CLAPTextConfig _keys_to_ignore_on_load_missing = [r"position_ids"] # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->CLAPText @@ -2216,10 +2276,10 @@ def __init__(self, config: CLAPTextConfig): self.post_init() def get_input_embeddings(self) -> nn.Module: - return self.text_model.embeddings.token_embedding + return self.text_model.embeddings.word_embeddings def set_input_embeddings(self, value): - self.text_model.embeddings.token_embedding = value + self.text_model.embeddings.word_embeddings = value @add_start_docstrings_to_model_forward(CLAP_TEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=CLAPTextModelOutput, config_class=CLAPTextConfig) @@ -2290,8 +2350,7 @@ def __init__(self, config: CLAPAudioConfig): self.audio_model = CLAPAudioModel(config) - self.audio_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False) - + self.audio_projection = CLAPProjectionLayer(config) # Initialize weights and apply final processing self.post_init() 
@@ -2329,6 +2388,10 @@ def forward( >>> audio_embeds = outputs.audio_embeds ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) audio_outputs = self.audio_model( input_features=input_features, @@ -2337,7 +2400,7 @@ def forward( return_dict=return_dict, ) - pooled_output = audio_outputs[1] # pooled_output + pooled_output = audio_outputs[-1] if not return_dict else audio_outputs.embedding audio_embeds = self.audio_projection(pooled_output) @@ -2346,8 +2409,10 @@ def forward( return tuple(output for output in outputs if output is not None) return CLAPAudioModelOutput( - audio_embeds=audio_embeds, - last_hidden_state=audio_outputs.last_hidden_state, + projection_output=audio_embeds, + clipwise_output=audio_outputs.clipwise_output, + framewise_output=audio_outputs.framewise_output, + embedding=audio_outputs.embedding, hidden_states=audio_outputs.hidden_states, attentions=audio_outputs.attentions, ) diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 13592d11ca78..f1d85d634b35 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -79,14 +79,22 @@ def __init__( self, parent, batch_size=12, - image_size=30, + image_size=60, + mel_bins=16, + window_size=4, + spec_size=64, patch_size=2, + patch_stride=2, + seq_length=16, + freq_ratio=2, num_channels=3, is_training=True, hidden_size=32, + patch_embeds_hidden_size=32, + projection_hidden_size=256, projection_dim=32, - num_hidden_layers=5, - num_attention_heads=4, + num_hidden_layers=4, + num_heads=[2, 2, 2, 2], intermediate_size=37, dropout=0.1, attention_dropout=0.1, @@ -96,74 +104,80 @@ def __init__( self.parent = parent self.batch_size = batch_size self.image_size = image_size + self.mel_bins = mel_bins + self.window_size = window_size self.patch_size = patch_size self.num_channels = num_channels self.is_training = is_training self.hidden_size = hidden_size self.projection_dim = projection_dim self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads + self.num_heads = num_heads + self.num_attention_heads = num_heads[0] + self.projection_hidden_size = projection_hidden_size + self.seq_length = seq_length + self.spec_size = spec_size + self.freq_ratio = freq_ratio + self.patch_stride = patch_stride + self.patch_embeds_hidden_size = patch_embeds_hidden_size self.intermediate_size = intermediate_size self.dropout = dropout self.attention_dropout = attention_dropout self.initializer_range = initializer_range self.scope = scope - # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + input_features = floats_tensor([self.batch_size, 1, self.hidden_size, self.mel_bins]) config = self.get_config() - return config, pixel_values + return config, input_features def get_config(self): return CLAPAudioConfig( image_size=self.image_size, patch_size=self.patch_size, + mel_bins=self.mel_bins, + window_size=self.window_size, num_channels=self.num_channels, hidden_size=self.hidden_size, + 
patch_stride=self.patch_stride, projection_dim=self.projection_dim, num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, + num_heads=self.num_heads, intermediate_size=self.intermediate_size, dropout=self.dropout, attention_dropout=self.attention_dropout, initializer_range=self.initializer_range, + spec_size=self.spec_size, + freq_ratio=self.freq_ratio, + patch_embeds_hidden_size=self.patch_embeds_hidden_size, + projection_hidden_size=self.projection_hidden_size, ) - def create_and_check_model(self, config, pixel_values): + def create_and_check_model(self, config, input_features): model = CLAPAudioModel(config=config) model.to(torch_device) model.eval() with torch.no_grad(): - result = model(pixel_values) + result = model(input_features) # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) - image_size = (self.image_size, self.image_size) - patch_size = (self.patch_size, self.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + embedding_shape = self.hidden_size * self.window_size * self.freq_ratio + self.parent.assertEqual( + result.fine_grained_embedding.shape, (self.batch_size, embedding_shape, embedding_shape) + ) - def create_and_check_model_with_projection(self, config, pixel_values): + def create_and_check_model_with_projection(self, config, input_features): model = CLAPAudioModelWithProjection(config=config) model.to(torch_device) model.eval() with torch.no_grad(): - result = model(pixel_values) - # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) - image_size = (self.image_size, self.image_size) - patch_size = (self.patch_size, self.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) - self.parent.assertEqual(result.image_embeds.shape, (self.batch_size, self.projection_dim)) + result = model(input_features) + self.parent.assertEqual(result.projection_output.shape, (self.batch_size, self.hidden_size)) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} + config, input_features = config_and_inputs + inputs_dict = {"input_features": input_features} return config, inputs_dict @@ -209,7 +223,7 @@ def test_forward_signature(self): # signature.parameters is an OrderedDict => so arg_names order is deterministic arg_names = [*signature.parameters.keys()] - expected_arg_names = ["pixel_values"] + expected_arg_names = ["input_features"] self.assertListEqual(arg_names[:1], expected_arg_names) def test_model(self): @@ -268,6 +282,7 @@ def __init__( max_position_embeddings=512, initializer_range=0.02, scope=None, + projection_hidden_act="relu", ): self.parent = parent self.batch_size = batch_size @@ -286,6 +301,7 @@ def __init__( self.max_position_embeddings = max_position_embeddings self.initializer_range = initializer_range self.scope = scope + self.projection_hidden_act = projection_hidden_act def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) @@ -309,6 +325,7 @@ def get_config(self): return CLAPTextConfig( 
vocab_size=self.vocab_size, hidden_size=self.hidden_size, + projection_hidden_size=self.hidden_size, projection_dim=self.projection_dim, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, @@ -317,6 +334,7 @@ def get_config(self): attention_dropout=self.attention_dropout, max_position_embeddings=self.max_position_embeddings, initializer_range=self.initializer_range, + projection_hidden_act=self.projection_hidden_act, ) def create_and_check_model(self, config, input_ids, input_mask): @@ -401,48 +419,48 @@ def test_model_with_projection_from_pretrained(self): class CLAPModelTester: - def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): + def __init__(self, parent, text_kwargs=None, audio_kwargs=None, is_training=True): if text_kwargs is None: text_kwargs = {} - if vision_kwargs is None: - vision_kwargs = {} + if audio_kwargs is None: + audio_kwargs = {} self.parent = parent self.text_model_tester = CLAPTextModelTester(parent, **text_kwargs) - self.vision_model_tester = CLAPAudioModelTester(parent, **vision_kwargs) + self.audio_model_tester = CLAPAudioModelTester(parent, **audio_kwargs) self.is_training = is_training def prepare_config_and_inputs(self): text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + audio_config, input_features = self.audio_model_tester.prepare_config_and_inputs() config = self.get_config() - return config, input_ids, attention_mask, pixel_values + return config, input_ids, attention_mask, input_features def get_config(self): - return CLAPConfig.from_text_vision_configs( - self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 + return CLAPConfig.from_text_audio_configs( + self.text_model_tester.get_config(), self.audio_model_tester.get_config(), projection_dim=64 ) - def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): + def create_and_check_model(self, config, input_ids, attention_mask, input_features): model = CLAPModel(config).to(torch_device).eval() with torch.no_grad(): - result = model(input_ids, pixel_values, attention_mask) + result = model(input_ids, input_features, attention_mask) self.parent.assertEqual( - result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) + result.logits_per_image.shape, (self.audio_model_tester.batch_size, self.text_model_tester.batch_size) ) self.parent.assertEqual( - result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) + result.logits_per_text.shape, (self.text_model_tester.batch_size, self.audio_model_tester.batch_size) ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, attention_mask, pixel_values = config_and_inputs + config, input_ids, attention_mask, input_features = config_and_inputs inputs_dict = { "input_ids": input_ids, "attention_mask": attention_mask, - "pixel_values": pixel_values, + "input_features": input_features, "return_loss": True, } return config, inputs_dict @@ -518,8 +536,8 @@ def _create_and_check_torchscript(self, config, inputs_dict): try: input_ids = inputs_dict["input_ids"] - pixel_values = inputs_dict["pixel_values"] # CLAP needs pixel_values - traced_model = torch.jit.trace(model, (input_ids, pixel_values)) + input_features = inputs_dict["input_features"] # CLAP needs 
input_features + traced_model = torch.jit.trace(model, (input_ids, input_features)) except RuntimeError: self.fail("Couldn't trace module.") @@ -555,14 +573,14 @@ def _create_and_check_torchscript(self, config, inputs_dict): self.assertTrue(models_equal) - def test_load_vision_text_config(self): + def test_load_audio_text_config(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() # Save CLAPConfig and check if we can load CLAPAudioConfig from it with tempfile.TemporaryDirectory() as tmp_dir_name: config.save_pretrained(tmp_dir_name) - vision_config = CLAPAudioConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) + audio_config = CLAPAudioConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.audio_config.to_dict(), audio_config.to_dict()) # Save CLAPConfig and check if we can load CLAPTextConfig from it with tempfile.TemporaryDirectory() as tmp_dir_name: @@ -723,11 +741,11 @@ def test_inference(self): # verify the logits self.assertEqual( outputs.logits_per_image.shape, - torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])), + torch.Size((inputs.input_features.shape[0], inputs.input_ids.shape[0])), ) self.assertEqual( outputs.logits_per_text.shape, - torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])), + torch.Size((inputs.input_ids.shape[0], inputs.input_features.shape[0])), ) expected_logits = torch.tensor([[24.5701, 19.3049]], device=torch_device) From 552aee2cde5c135ad8d588f8b847409f0699a1c0 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 8 Feb 2023 09:29:03 +0000 Subject: [PATCH 082/197] oops --- src/transformers/models/clap/modeling_clap.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index ebe750b96a05..cdd722ae9893 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1759,20 +1759,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return pooled_output -# class CLAPPreTrainedModel(PreTrainedModel): -# """ -# An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained # -models. 
#""" - -# config_class = CLAPConfig -# base_model_prefix = "clap" -# supports_gradient_checkpointing = True -# _keys_to_ignore_on_load_missing = [r"position_ids", r"logit_scale_a", r"logit_scale_t"] - -# def _init_weights(self, module): -# pass - - class CLAPPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained From 05674c77b2e20c58043b09b7e0669ef45abf11c0 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 8 Feb 2023 10:08:15 +0000 Subject: [PATCH 083/197] fix more tests --- .../models/clap/configuration_clap.py | 6 +- src/transformers/models/clap/modeling_clap.py | 116 ++++++++++++------ tests/models/clap/test_modeling_clap.py | 8 +- 3 files changed, 84 insertions(+), 46 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 506656ab0431..79330dfee9f3 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -227,7 +227,7 @@ def __init__( hidden_size=96, projection_hidden_size=768, depths=[2, 2, 6, 2], - num_heads=[4, 8, 16, 32], + num_attention_heads=[4, 8, 16, 32], enable_fusion=False, hidden_dropout_prob=0.1, fusion_type=None, @@ -257,7 +257,8 @@ def __init__( self.num_classes = num_classes self.hidden_size = hidden_size self.depths = depths - self.num_heads = num_heads + self.num_hidden_layers = len(depths) + self.num_attention_heads = num_attention_heads self.window_size = window_size self.enable_fusion = enable_fusion self.fusion_type = fusion_type @@ -393,6 +394,7 @@ def __init__( self.logit_scale_init_value = logit_scale_init_value self.initializer_factor = initializer_factor + self.num_hidden_layers = self.text_config.num_hidden_layers + len(self.audio_config.depths) @classmethod def from_text_audio_configs(cls, text_config: CLAPTextConfig, audio_config: CLAPAudioConfig, **kwargs): diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index cdd722ae9893..022d7529d59c 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -119,9 +119,9 @@ def window_reverse(windows, window_size, height, width): window_size: (`int`) Window size height: (`int`) - Height of the resized image + Height of the resized audio width: (`int`) - Width of the resized image + Width of the resized audio """ batch_size = int(windows.shape[0] / (height * width / window_size / window_size)) @@ -156,8 +156,8 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: # Copied from transformers.models.clip.modeling_clip.clip_loss with clip->clap def clap_loss(similarity: torch.Tensor) -> torch.Tensor: caption_loss = contrastive_loss(similarity) - image_loss = contrastive_loss(similarity.t()) - return (caption_loss + image_loss) / 2.0 + audio_loss = contrastive_loss(similarity.t()) + return (caption_loss + audio_loss) / 2.0 @dataclass @@ -220,22 +220,22 @@ class CLAPAudioModelOutput(ModelOutput): @dataclass -# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->CLAP,vision->audio,Vision->Audio +# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->CLAP,vision->audio,Vision->Audio,audio->audio class CLAPOutput(ModelOutput): """ Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): - Contrastive loss for image-text similarity. 
- logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): - The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + Contrastive loss for audio-text similarity. + logits_per_audio:(`torch.FloatTensor` of shape `(audio_batch_size, text_batch_size)`): + The scaled dot product scores between `audio_embeds` and `text_embeds`. This represents the audio-text similarity scores. - logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): - The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, audio_batch_size)`): + The scaled dot product scores between `text_embeds` and `audio_embeds`. This represents the text-audio similarity scores. text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by applying the projection layer to the pooled output of [`CLAPTextModel`]. - image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): - The image embeddings obtained by applying the projection layer to the pooled output of [`CLAPAudioModel`]. + audio_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): + The audio embeddings obtained by applying the projection layer to the pooled output of [`CLAPAudioModel`]. text_model_output(`BaseModelOutputWithPooling`): The output of the [`CLAPTextModel`]. audio_model_output(`BaseModelOutputWithPooling`): @@ -243,10 +243,10 @@ class CLAPOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - logits_per_image: torch.FloatTensor = None + logits_per_audio: torch.FloatTensor = None logits_per_text: torch.FloatTensor = None text_embeds: torch.FloatTensor = None - image_embeds: torch.FloatTensor = None + audio_embeds: torch.FloatTensor = None text_model_output: BaseModelOutputWithPooling = None audio_model_output: BaseModelOutputWithPooling = None @@ -379,7 +379,7 @@ def forward(self, hidden_states, is_longer_idx=None): if height != self.img_size[0] or width != self.img_size[1]: raise ValueError( - f"Input image size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + f"Input audio size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." ) global_hidden_states = self.proj(global_hidden_states) @@ -428,7 +428,7 @@ def forward(self, hidden_states, is_longer_idx=None): _, _, height, width = hidden_states.shape if height != self.img_size[0] or width != self.img_size[1]: raise ValueError( - f"Input image size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + f"Input audio size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
) hidden_states = self.proj(hidden_states) @@ -887,7 +887,7 @@ def __init__(self, config): dim=int(config.hidden_size * 2**i_layer), input_resolution=self.input_resolutions[i_layer], depth=config.depths[i_layer], - num_heads=config.num_heads[i_layer], + num_heads=config.num_attention_heads[i_layer], drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], downsample=CLAPAudioPatchMerging if (i_layer < self.num_layers - 1) else None, ) @@ -1767,12 +1767,13 @@ class CLAPPreTrainedModel(PreTrainedModel): config_class = CLAPTextConfig base_model_prefix = "clap" - supports_gradient_checkpointing = True + supports_gradient_checkpointing = False _keys_to_ignore_on_load_missing = [r"position_ids", r"logit_scale_a", r"logit_scale_t"] def _init_weights(self, module): """Initialize the weights""" factor = self.config.initializer_factor + if isinstance(module, CLAPTextEmbeddings): module.word_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02) module.position_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02) @@ -1783,7 +1784,18 @@ def _init_weights(self, module): nn.init.normal_(module.query.weight, std=in_proj_std) nn.init.normal_(module.key.weight, std=in_proj_std) nn.init.normal_(module.value.weight, std=in_proj_std) - elif isinstance(module, (CLAPTextSelfOutput, CLAPTextOutput, CLAPTextIntermediate, CLAPTextPooler)): + elif isinstance( + module, + ( + CLAPTextSelfOutput, + CLAPTextOutput, + CLAPTextIntermediate, + CLAPTextPooler, + CLAPAudioSelfOutput, + CLAPAudioIntermediate, + CLAPAudioOutput, + ), + ): factor = self.config.initializer_factor in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor nn.init.normal_(module.dense.weight, std=in_proj_std) @@ -1792,12 +1804,36 @@ def _init_weights(self, module): in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor nn.init.normal_(module.linear1.weight, std=in_proj_std) nn.init.normal_(module.linear2.weight, std=in_proj_std) + elif isinstance(module, CLAPAudioPatchEmbed): + factor = self.config.initializer_factor + in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor + nn.init.normal_(module.proj.weight, std=in_proj_std) + elif isinstance(module, CLAPAudioSelfAttention): + factor = self.config.initializer_factor + in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor + nn.init.normal_(module.query.weight, std=in_proj_std) + nn.init.normal_(module.key.weight, std=in_proj_std) + nn.init.normal_(module.value.weight, std=in_proj_std) + elif isinstance(module, CLAPAudioPatchMerging): + factor = self.config.initializer_factor + in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor + nn.init.normal_(module.reduction.weight, std=in_proj_std) + elif isinstance(module, CLAPAudioEncoder): + factor = self.config.initializer_factor + in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor + nn.init.normal_(module.head.weight, std=in_proj_std) if isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) if isinstance(module, nn.Linear) and module.bias is not None: module.bias.data.zero_() + if isinstance(module, nn.Conv2d): + factor = self.config.initializer_factor + in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor + nn.init.normal_(module.weight, 
std=in_proj_std) + if module.bias is not None: + module.bias.data.zero_() def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, CLAPTextEncoder): @@ -1815,7 +1851,7 @@ def __init__(self, config: CLAPAudioConfig): self.post_init() def get_input_embeddings(self) -> nn.Module: - return self.audio_encoder.embeddings.patch_embedding + return self.audio_encoder.patch_embed.proj @add_start_docstrings_to_model_forward(CLAP_AUDIO_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLAPAudioConfig) @@ -1839,10 +1875,10 @@ def forward( >>> model = CLAPAudioModel.from_pretrained("laionai/clap-hsat-tiny") >>> processor = AutoProcessor.from_pretrained("laionai/clap-hsat-tiny") - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) + >>> url = "http://audios.cocodataset.org/val2017/000000039769.jpg" + >>> audio = Image.open(requests.get(url, stream=True).raw) - >>> inputs = processor(images=image, return_tensors="pt") + >>> inputs = processor(audios=audio, return_tensors="pt") >>> outputs = model(**inputs) >>> last_hidden_state = outputs.last_hidden_state @@ -2155,7 +2191,7 @@ def get_audio_features( def forward( self, input_ids: Optional[torch.LongTensor] = None, - input_values: Optional[torch.FloatTensor] = None, + input_features: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, return_loss: Optional[bool] = None, @@ -2176,16 +2212,16 @@ def forward( >>> model = CLAPModel.from_pretrained("laion-ai/clap-htst-unfused-base") >>> processor = AutoProcessor.from_pretrained("laion-ai/clap-htst-unfused-base") - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> url = "http://audios.cocodataset.org/val2017/000000039769.jpg" >>> # TODO audio here >>> inputs = processor( - ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True + ... text=["a photo of a cat", "a photo of a dog"], audios=audio, return_tensors="pt", padding=True ... ) >>> outputs = model(**inputs) - >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score - >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + >>> logits_per_audio = outputs.logits_per_audio # this is the audio-text similarity score + >>> probs = logits_per_audio.softmax(dim=1) # we can take the softmax to get the label probabilities ```""" # Use CLAP model's config for some fields (if specified) instead of those of audio & text components. 
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -2195,8 +2231,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict audio_outputs = self.audio_model( - input_values=input_values, - attention_mask=attention_mask, + input_features=input_features, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, @@ -2211,7 +2246,7 @@ def forward( return_dict=return_dict, ) - audio_embeds = audio_outputs[1] + audio_embeds = audio_outputs[-1] if not return_dict else audio_outputs.embedding audio_embeds = self.audio_projection(audio_embeds) text_embeds = text_outputs[1] @@ -2222,21 +2257,22 @@ def forward( text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) # cosine similarity as logits - logit_scale = self.logit_scale.exp() - logits_per_text = torch.matmul(text_embeds, audio_embeds.t()) * logit_scale - logits_per_image = logits_per_text.t() + logit_scale_text = self.logit_scale_t.exp() + logit_scale_audio = self.logit_scale_a.exp() + logits_per_text = torch.matmul(text_embeds, audio_embeds.t()) * logit_scale_text + logits_per_audio = torch.matmul(audio_embeds, text_embeds.t()) * logit_scale_audio loss = None if return_loss: loss = clap_loss(logits_per_text) if not return_dict: - output = (logits_per_image, logits_per_text, text_embeds, audio_embeds, text_outputs, audio_outputs) + output = (logits_per_audio, logits_per_text, text_embeds, audio_embeds, text_outputs, audio_outputs) return ((loss,) + output) if loss is not None else output return CLAPOutput( loss=loss, - logits_per_image=logits_per_image, + logits_per_audio=logits_per_audio, logits_per_text=logits_per_text, text_embeds=text_embeds, audio_embeds=audio_embeds, @@ -2341,7 +2377,7 @@ def __init__(self, config: CLAPAudioConfig): self.post_init() def get_input_embeddings(self) -> nn.Module: - return self.audio_model.embeddings.patch_embedding + return self.audio_model.audio_encoder.patch_embed.proj @add_start_docstrings_to_model_forward(CLAP_AUDIO_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=CLAPAudioModelOutput, config_class=CLAPAudioConfig) @@ -2365,10 +2401,10 @@ def forward( >>> model = CLAPAudioModelWithProjection.from_pretrained("openai/clip-vit-base-patch32") >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) + >>> url = "http://audios.cocodataset.org/val2017/000000039769.jpg" + >>> audio = Image.open(requests.get(url, stream=True).raw) - >>> inputs = processor(images=image, return_tensors="pt") + >>> inputs = processor(audios=audio, return_tensors="pt") >>> outputs = model(**inputs) >>> audio_embeds = outputs.audio_embeds diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index f1d85d634b35..591ba8d2ca6e 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -143,7 +143,7 @@ def get_config(self): patch_stride=self.patch_stride, projection_dim=self.projection_dim, num_hidden_layers=self.num_hidden_layers, - num_heads=self.num_heads, + num_attention_heads=self.num_heads, intermediate_size=self.intermediate_size, dropout=self.dropout, attention_dropout=self.attention_dropout, @@ -448,7 +448,7 @@ def create_and_check_model(self, config, input_ids, attention_mask, input_featur with torch.no_grad(): result = 
model(input_ids, input_features, attention_mask) self.parent.assertEqual( - result.logits_per_image.shape, (self.audio_model_tester.batch_size, self.text_model_tester.batch_size) + result.logits_per_audio.shape, (self.audio_model_tester.batch_size, self.text_model_tester.batch_size) ) self.parent.assertEqual( result.logits_per_text.shape, (self.text_model_tester.batch_size, self.audio_model_tester.batch_size) @@ -740,7 +740,7 @@ def test_inference(self): # verify the logits self.assertEqual( - outputs.logits_per_image.shape, + outputs.logits_per_audio.shape, torch.Size((inputs.input_features.shape[0], inputs.input_ids.shape[0])), ) self.assertEqual( @@ -750,4 +750,4 @@ def test_inference(self): expected_logits = torch.tensor([[24.5701, 19.3049]], device=torch_device) - self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)) + self.assertTrue(torch.allclose(outputs.logits_per_audio, expected_logits, atol=1e-3)) From 74d3c4fef9ef916e0e3a8d61ceefc4052dec7505 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Wed, 8 Feb 2023 13:54:42 +0000 Subject: [PATCH 084/197] add zero shot audio classification pipeline --- docs/source/en/main_classes/pipelines.mdx | 5 + src/transformers/__init__.py | 2 + .../feature_extraction_sequence_utils.py | 66 ++--- src/transformers/pipelines/__init__.py | 15 +- .../zero_shot_audio_classification.py | 139 +++++++++++ ...ipelines_zero_shot_audio_classification.py | 229 ++++++++++++++++++ 6 files changed, 425 insertions(+), 31 deletions(-) create mode 100644 src/transformers/pipelines/zero_shot_audio_classification.py create mode 100644 tests/pipelines/test_pipelines_zero_shot_audio_classification.py diff --git a/docs/source/en/main_classes/pipelines.mdx b/docs/source/en/main_classes/pipelines.mdx index e5ee3902028e..96bae3530e2c 100644 --- a/docs/source/en/main_classes/pipelines.mdx +++ b/docs/source/en/main_classes/pipelines.mdx @@ -314,6 +314,11 @@ Pipelines available for audio tasks include the following. - __call__ - all +### ZeroShotAudioClassificationPipeline + +[[autodoc]] ZeroShotAudioClassificationPipeline + - __call__ + - all ## Computer vision Pipelines available for computer vision tasks include the following. 
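Editor's note: the documentation entry above registers the zero-shot audio classification pipeline that the rest of this commit implements. For orientation, a hedged usage sketch follows; the checkpoint identifier is the placeholder used in the pipeline's default mapping within this patch and may not correspond to a published model, and the waveform and candidate labels are purely illustrative.

```python
# Usage sketch for the new zero-shot audio classification pipeline.
# The model name below is the placeholder from this patch, not a verified
# Hub checkpoint; the waveform is random noise standing in for real audio.
import numpy as np
from transformers import pipeline

classifier = pipeline(
    task="zero-shot-audio-classification",
    model="laion-ai/clap-hsat-tiny",
)

# Ten seconds of mono audio at the feature extractor's sampling rate
# (48 kHz is assumed here for illustration).
waveform = np.random.randn(48_000 * 10).astype(np.float32)

predictions = classifier(
    waveform,
    candidate_labels=["dog barking", "rain falling", "a person speaking"],
    hypothesis_template="This is a recording of {}.",
)
# `predictions` is a list of {"score": float, "label": str} dicts, sorted by score.
print(predictions)
```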
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 4c9e96acbb6b..7142c884c5c9 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -544,6 +544,7 @@ "ZeroShotClassificationPipeline", "ZeroShotImageClassificationPipeline", "ZeroShotObjectDetectionPipeline", + "ZeroShotAudioClassificationPipeline", "pipeline", ], "processing_utils": ["ProcessorMixin"], @@ -4005,6 +4006,7 @@ TranslationPipeline, VideoClassificationPipeline, VisualQuestionAnsweringPipeline, + ZeroShotAudioClassificationPipeline, ZeroShotClassificationPipeline, ZeroShotImageClassificationPipeline, ZeroShotObjectDetectionPipeline, diff --git a/src/transformers/feature_extraction_sequence_utils.py b/src/transformers/feature_extraction_sequence_utils.py index 41069e536d96..8a23cf9727a5 100644 --- a/src/transformers/feature_extraction_sequence_utils.py +++ b/src/transformers/feature_extraction_sequence_utils.py @@ -309,12 +309,14 @@ def _truncate( processed_features: Dictionary of input values (`np.ndarray[float]`) / input vectors (`List[np.ndarray[float]]`) or batch of inputs values (`List[np.ndarray[int]]`) / input vectors (`List[np.ndarray[int]]`) - max_length: maximum length of the returned list and optionally padding length (see below) - pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + max_length: + maximum length of the returned list and optionally padding length (see below) + pad_to_multiple_of (optional) : + Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. - truncation: - (optional) Activates truncation to cut input sequences longer than `max_length` to `max_length`. + truncation (optional): + Activates truncation to cut input sequences longer than `max_length` to `max_length`. """ if not truncation: return processed_features @@ -409,11 +411,14 @@ def mel_to_hz(mels: np.array, mel_scale: str = "htk") -> np.array: """Convert mel bin numbers to frequencies. Args: - mels (np.array): Mel frequencies - mel_scale (str, optional): Scale to use: `htk` or `slaney`. (Default: `htk`) + mels (np.array): + Mel frequencies + mel_scale (str, *optional*, `"htk"`): + Scale to use: `htk` or `slaney`. Returns: - freqs (np.array): Mels converted in Hz + freqs (np.array): + Mels converted in Hz """ if mel_scale not in ["slaney", "htk"]: @@ -445,13 +450,16 @@ def create_triangular_filterbank( """Create a triangular filter bank. Args: - all_freqs (np.array): STFT freq points of size (`n_freqs`). - f_pts (np.array): Filter mid points of size (`n_filter`). + all_freqs (`np.array`): + STFT freq points of size (`n_freqs`). + f_pts (`np.array`): + Filter mid points of size (`n_filter`). Returns: - fb (np.array): The filter bank of size (`n_freqs`, `n_filter`). + fb (np.array): + The filter bank of size (`n_freqs`, `n_filter`). """ - # Adopted from Librosa + # Adapted from Librosa # calculate the difference between each filter mid point and each stft freq point in hertz f_diff = f_pts[1:] - f_pts[:-1] # (n_filter + 1) slopes = np.expand_dims(f_pts, 0) - np.expand_dims(all_freqs, 1) # (n_freqs, n_filter + 2) @@ -474,17 +482,16 @@ def get_mel_filter_banks( mel_scale: str = "htk", ) -> np.array: """ - Create a frequency bin conversion matrix used to obtain the Mel Frequency Cepstral Coefficient. 
This is called + Create a frequency bin conversion matrix used to obtain the Mel Spectrogram. This is called a *mel filter bank*, and various implementation exist, which differ in the number of filters, the shape of the filters, the way the filters are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these features is to approximate the non-linear human perception of the variation in pitch - with respect to the frequency. This code is heavily inspired from the *torchaudio* implementation, refer to XXX + with respect to the frequency. This code is heavily inspired from the *torchaudio* implementation, see [here](https://pytorch.org/audio/stable/transforms.html) for more details. Note: - We will try to specify which variation correspond to which MFCCs from the litterature. The main features - are: + Different banks of MEL filters were introduced in the litterature. The following variation are supported: - MFCC FB-20: introduced in 1980 by Davis and Mermelstein [4]; Davis and Mermelstein assume sampling frequency of 10 kHz; speech bandwidth [0, 4600] Hz - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) described in Young, 1995 [5]; Young uses a @@ -496,20 +503,19 @@ def get_mel_filter_banks( Args: - n_freqs (int): - Number of frequencies to highlight/apply - frequency_min (float): - Minimum frequency (Hz) - frequency_max (float): - Maximum frequency (Hz) - n_mels (int): - Number of mel filterbanks - sample_rate (int): + n_freqs (`int`): + Number of frequencies to highlight/apply. + frequency_min (`float`): + Minimum frequency of interest(Hz). + frequency_max (`float`): + Maximum frequency of interest(Hz). + n_mels (`int`): + Number of mel filterbanks. + sample_rate (`int`): Sample rate of the audio waveform - norm (str or None, optional): + norm (`str`, *optional*): If "slaney", divide the triangular mel weights by the width of the mel band (area normalization). - (Default: `None`) - mel_scale (str, optional): + mel_scale (`str`, *optional*, `"htk"`): Scale to use: `htk` or `slaney`. (Default: `htk`) Returns: @@ -552,7 +558,7 @@ def get_mel_filter_banks( def _stft(self, frames, window): """ Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same - results as `torch.stft`. + results as `torch.stft`. #TODO @Arthur batching this could alloz more usage, good first issue. Args: frames (`np.array` of dimension `(num_frames, self.n_fft)`): @@ -587,7 +593,7 @@ def _stft(self, frames, window): def _power_to_db(self, mel_spectrogram, a_min=1e-10, ref=1.0): """ - Convert a mel spectrogram from power to db, this function is the numpy implementation of librosa.power_to_lb. + Convert a mel spectrogram from power to db scale, this function is the numpy implementation of librosa.power_to_lb. """ log_spec = 10 * np.log10(np.clip(mel_spectrogram, a_min=a_min, a_max=None)) log_spec -= 10.0 * np.log10(np.maximum(a_min, ref)) @@ -605,7 +611,7 @@ def _fram_wave(self, waveform: np.array, center: bool = True): The window length (self.window_length) defines how much of the signal is contained in each frame, while the hop length defines the step between the beginning of each new frame. - **This method does not support batching yet as we are mainly focus on inference. If you want this to be added + #TODO @Arthur **This method does not support batching yet as we are mainly focus on inference. 
If you want this to be added feel free to open an issue and ping @arthurzucker on Github** Args: diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index 434009d7f293..1a6002f27bd4 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -86,7 +86,7 @@ from .zero_shot_classification import ZeroShotClassificationArgumentHandler, ZeroShotClassificationPipeline from .zero_shot_image_classification import ZeroShotImageClassificationPipeline from .zero_shot_object_detection import ZeroShotObjectDetectionPipeline - +from .zero_shot_audio_classification import ZeroShotAudioClassificationPipeline if is_tf_available(): import tensorflow as tf @@ -304,6 +304,18 @@ }, "type": "multimodal", }, + "zero-shot-audio-classification": { + "impl": ZeroShotAudioClassificationPipeline, + "tf": (TFAutoModel,) if is_tf_available() else (), + "pt": (AutoModel,) if is_torch_available() else (), + "default": { + "model": { + "pt": ("laion-ai/clap-hsat-tiny", "f4881ba"), + "tf": ("laion-ai/clip-hsat-tiny", "f4881ba"), + } + }, + "type": "multimodal", + }, "conversational": { "impl": ConversationalPipeline, "tf": (TFAutoModelForSeq2SeqLM, TFAutoModelForCausalLM) if is_tf_available() else (), @@ -537,6 +549,7 @@ def pipeline( - `"translation_xx_to_yy"`: will return a [`TranslationPipeline`]. - `"video-classification"`: will return a [`VideoClassificationPipeline`]. - `"visual-question-answering"`: will return a [`VisualQuestionAnsweringPipeline`]. + - `"zero-shot-audio-classification"`: will return a [`ZeroShotAudioClassificationPipeline`]. - `"zero-shot-classification"`: will return a [`ZeroShotClassificationPipeline`]. - `"zero-shot-image-classification"`: will return a [`ZeroShotImageClassificationPipeline`]. - `"zero-shot-object-detection"`: will return a [`ZeroShotObjectDetectionPipeline`]. diff --git a/src/transformers/pipelines/zero_shot_audio_classification.py b/src/transformers/pipelines/zero_shot_audio_classification.py new file mode 100644 index 000000000000..6b616ee46716 --- /dev/null +++ b/src/transformers/pipelines/zero_shot_audio_classification.py @@ -0,0 +1,139 @@ +from typing import List, Union + +from ..utils import ( + add_end_docstrings, + is_torch_available, + logging, + requires_backends, +) +from .base import PIPELINE_INIT_ARGS, ChunkPipeline + + +if is_torch_available(): + import torch + + +logger = logging.get_logger(__name__) + + +@add_end_docstrings(PIPELINE_INIT_ARGS) +class ZeroShotAudioClassificationPipeline(ChunkPipeline): + """ + Zero shot audio classification pipeline using `CLAPModel`. This pipeline predicts the class of an audio when you + provide an audio and a set of `candidate_labels`. + + Example: + + ```python + >>> from transformers import pipeline + + >>> classifier = pipeline(model="openai/clap-vit-large-patch14") + >>> classifier( + ... "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", + ... candidate_labels=["animals", "humans", "landscape"], + ... ) + [{'score': 0.965, 'label': 'animals'}, {'score': 0.03, 'label': 'humans'}, {'score': 0.005, 'label': 'landscape'}] + + >>> classifier( + ... "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", + ... candidate_labels=["black and white", "photorealist", "painting"], + ... 
) + [{'score': 0.996, 'label': 'black and white'}, {'score': 0.003, 'label': 'photorealist'}, {'score': 0.0, 'label': 'painting'}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This audio classification pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"zero-shot-audio-classification"`. + + See the list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-audio-classification). + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + requires_backends(self, "audio") + # No specific FOR_XXX available yet + # self.check_model_type(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING) + + def __call__(self, audios: Union[str, List[str], "Image", List["Image"]], **kwargs): + """ + Assign labels to the audio(s) passed as inputs. + + Args: + audios (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): + The pipeline handles three types of audios: + + - A string containing a http link pointing to an audio + - A string containing a local path to an audio + - An audio loaded in PIL directly + + candidate_labels (`List[str]`): + The candidate labels for this audio + + hypothesis_template (`str`, *optional*, defaults to `"This is a photo of {}"`): + The sentence used in cunjunction with *candidate_labels* to attempt the audio classification by + replacing the placeholder with the candidate_labels. Then likelihood is estimated by using + logits_per_image + + Return: + A list of dictionaries containing result, one dictionary per proposed label. The dictionaries contain the + following keys: + + - **label** (`str`) -- The label identified by the model. It is one of the suggested `candidate_label`. + - **score** (`float`) -- The score attributed by the model for that label (between 0 and 1). + """ + return super().__call__(audios, **kwargs) + + def _sanitize_parameters(self, **kwargs): + preprocess_params = {} + if "candidate_labels" in kwargs: + preprocess_params["candidate_labels"] = kwargs["candidate_labels"] + if "hypothesis_template" in kwargs: + preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"] + + return preprocess_params, {}, {} + + def preprocess(self, audio, candidate_labels=None, hypothesis_template="This is a photo of {}."): + n = len(candidate_labels) + for i, candidate_label in enumerate(candidate_labels): + audio = load_audio(audio) + audios = self.image_processor(audios=[audio], return_tensors=self.framework) + sequence = hypothesis_template.format(candidate_label) + inputs = self.tokenizer(sequence, return_tensors=self.framework) + inputs["input_features"] = audios.input_features + yield {"is_last": i == n - 1, "candidate_label": candidate_label, **inputs} + + def _forward(self, model_inputs): + is_last = model_inputs.pop("is_last") + candidate_label = model_inputs.pop("candidate_label") + outputs = self.model(**model_inputs) + + # Clip does crossproduct scoring by default, so we're only + # interested in the results where audio and text and in the same + # batch position. 
+ diag = torch.diagonal if self.framework == "pt" else tf.linalg.diag_part + logits_per_image = diag(outputs.logits_per_image) + + model_outputs = { + "is_last": is_last, + "candidate_label": candidate_label, + "logits_per_image": logits_per_image, + } + return model_outputs + + def postprocess(self, model_outputs): + candidate_labels = [outputs["candidate_label"] for outputs in model_outputs] + if self.framework == "pt": + logits = torch.cat([output["logits_per_image"] for output in model_outputs]) + probs = logits.softmax(dim=0) + scores = probs.tolist() + else: + raise ValueError("`tf` framework not supported.") + + result = [ + {"score": score, "label": candidate_label} + for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0]) + ] + return result diff --git a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py new file mode 100644 index 000000000000..a5c1e9ecd44e --- /dev/null +++ b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py @@ -0,0 +1,229 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers import is_audio_available +from transformers.pipelines import pipeline +from transformers.testing_utils import nested_simplify, require_audio, require_tf, require_torch, slow + +from .test_pipelines_common import ANY, PipelineTestCaseMeta + + +if is_audio_available(): + pass +else: + + class Audio: + @staticmethod + def open(*args, **kwargs): + pass + + +@require_audio +class ZeroShotAudioClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): + # Deactivating auto tests since we don't have a good MODEL_FOR_XX mapping, + # and only CLAP would be there for now. 
+ # model_mapping = {CLAPConfig: CLAPModel} + + # def get_test_pipeline(self, model, tokenizer, processor): + # if tokenizer is None: + # # Side effect of no Fast Tokenizer class for these model, so skipping + # # But the slow tokenizer test should still run as they're quite small + # self.skipTest("No tokenizer available") + # return + # # return None, None + + # audio_classifier = ZeroShotAudioClassificationPipeline( + # model=model, tokenizer=tokenizer, feature_extractor=processor + # ) + + # # test with a raw waveform + # audio = Audio.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + # audio2 = Audio.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + # return audio_classifier, [audio, audio2] + + # def run_pipeline_test(self, pipe, examples): + # audio = Audio.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + # outputs = pipe(audio, candidate_labels=["A", "B"]) + # self.assertEqual(outputs, {"text": ANY(str)}) + + # # Batching + # outputs = pipe([audio] * 3, batch_size=2, candidate_labels=["A", "B"]) + + @require_torch + def test_small_model_pt(self): + audio_classifier = pipeline( + model="hf-internal-testing/tiny-random-clap-zero-shot-audio-classification", + ) + audio = Audio.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + output = audio_classifier(audio, candidate_labels=["a", "b", "c"]) + + self.assertEqual( + nested_simplify(output), + [{"score": 0.333, "label": "a"}, {"score": 0.333, "label": "b"}, {"score": 0.333, "label": "c"}], + ) + + output = audio_classifier([audio] * 5, candidate_labels=["A", "B", "C"], batch_size=2) + self.assertEqual( + nested_simplify(output), + # Pipeline outputs are supposed to be deterministic and + # So we could in theory have real values "A", "B", "C" instead + # of ANY(str). + # However it seems that in this particular case, the floating + # scores are so close, we enter floating error approximation + # and the order is not guaranteed anymore with batching. + [ + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + ], + ) + + @require_tf + def test_small_model_tf(self): + audio_classifier = pipeline( + model="hf-internal-testing/tiny-random-clap-zero-shot-audio-classification", framework="tf" + ) + audio = Audio.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + output = audio_classifier(audio, candidate_labels=["a", "b", "c"]) + + self.assertEqual( + nested_simplify(output), + [{"score": 0.333, "label": "a"}, {"score": 0.333, "label": "b"}, {"score": 0.333, "label": "c"}], + ) + + output = audio_classifier([audio] * 5, candidate_labels=["A", "B", "C"], batch_size=2) + self.assertEqual( + nested_simplify(output), + # Pipeline outputs are supposed to be deterministic and + # So we could in theory have real values "A", "B", "C" instead + # of ANY(str). 
+ # However it seems that in this particular case, the floating + # scores are so close, we enter floating error approximation + # and the order is not guaranteed anymore with batching. + [ + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + ], + ) + + @slow + @require_torch + def test_large_model_pt(self): + audio_classifier = pipeline( + task="zero-shot-audio-classification", + model="openai/clap-vit-base-patch32", + ) + # This is an audio of 2 cats with remotes and no planes + audio = Audio.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + output = audio_classifier(audio, candidate_labels=["cat", "plane", "remote"]) + + self.assertEqual( + nested_simplify(output), + [ + {"score": 0.511, "label": "remote"}, + {"score": 0.485, "label": "cat"}, + {"score": 0.004, "label": "plane"}, + ], + ) + + output = audio_classifier([audio] * 5, candidate_labels=["cat", "plane", "remote"], batch_size=2) + self.assertEqual( + nested_simplify(output), + [ + [ + {"score": 0.511, "label": "remote"}, + {"score": 0.485, "label": "cat"}, + {"score": 0.004, "label": "plane"}, + ], + ] + * 5, + ) + + @slow + @require_tf + def test_large_model_tf(self): + audio_classifier = pipeline( + task="zero-shot-audio-classification", model="openai/clap-vit-base-patch32", framework="tf" + ) + # This is an audio of 2 cats with remotes and no planes + audio = Audio.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + output = audio_classifier(audio, candidate_labels=["cat", "plane", "remote"]) + self.assertEqual( + nested_simplify(output), + [ + {"score": 0.511, "label": "remote"}, + {"score": 0.485, "label": "cat"}, + {"score": 0.004, "label": "plane"}, + ], + ) + + output = audio_classifier([audio] * 5, candidate_labels=["cat", "plane", "remote"], batch_size=2) + self.assertEqual( + nested_simplify(output), + [ + [ + {"score": 0.511, "label": "remote"}, + {"score": 0.485, "label": "cat"}, + {"score": 0.004, "label": "plane"}, + ], + ] + * 5, + ) From cad3f4277719b3951206bb1ea6cb42872ea1bb95 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Wed, 8 Feb 2023 15:30:09 +0000 Subject: [PATCH 085/197] update zeroshot classification pipeline --- .../zero_shot_audio_classification.py | 72 ++++++++++++++----- 1 file changed, 54 insertions(+), 18 deletions(-) diff --git a/src/transformers/pipelines/zero_shot_audio_classification.py b/src/transformers/pipelines/zero_shot_audio_classification.py index 6b616ee46716..d9a81c16d32b 100644 --- a/src/transformers/pipelines/zero_shot_audio_classification.py +++ b/src/transformers/pipelines/zero_shot_audio_classification.py @@ -7,8 +7,9 @@ requires_backends, ) from .base import PIPELINE_INIT_ARGS, ChunkPipeline - - +from .audio_classification import ffmpeg_read +import requests +import numpy as np if is_torch_available(): import torch @@ -27,15 +28,15 @@ class ZeroShotAudioClassificationPipeline(ChunkPipeline): ```python >>> from transformers import pipeline - >>> classifier = 
pipeline(model="openai/clap-vit-large-patch14") + >>> classifier = pipeline(model="laion-ai/clap-hsat-tiny") >>> classifier( - ... "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", + ... "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac", ... candidate_labels=["animals", "humans", "landscape"], ... ) [{'score': 0.965, 'label': 'animals'}, {'score': 0.03, 'label': 'humans'}, {'score': 0.005, 'label': 'landscape'}] >>> classifier( - ... "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", + ... "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac", ... candidate_labels=["black and white", "photorealist", "painting"], ... ) [{'score': 0.996, 'label': 'black and white'}, {'score': 0.003, 'label': 'photorealist'}, {'score': 0.0, 'label': 'painting'}] @@ -54,20 +55,27 @@ def __init__(self, **kwargs): super().__init__(**kwargs) requires_backends(self, "audio") + + if self.framework != "pt": + raise ValueError(f"The {self.__class__} is only available in PyTorch.") # No specific FOR_XXX available yet # self.check_model_type(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING) - def __call__(self, audios: Union[str, List[str], "Image", List["Image"]], **kwargs): + def __call__( + self, + audios: Union[np.ndarray, bytes, str], + **kwargs, + ): """ Assign labels to the audio(s) passed as inputs. Args: - audios (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): - The pipeline handles three types of audios: + audios (`str`, `List[str]`, `np.array` or `List[np.array]`): + The pipeline handles three types of inputs: - A string containing a http link pointing to an audio - A string containing a local path to an audio - - An audio loaded in PIL directly + - An audio loaded in numpy candidate_labels (`List[str]`): The candidate labels for this audio @@ -75,7 +83,7 @@ def __call__(self, audios: Union[str, List[str], "Image", List["Image"]], **kwar hypothesis_template (`str`, *optional*, defaults to `"This is a photo of {}"`): The sentence used in cunjunction with *candidate_labels* to attempt the audio classification by replacing the placeholder with the candidate_labels. Then likelihood is estimated by using - logits_per_image + logits_per_audio Return: A list of dictionaries containing result, one dictionary per proposed label. 
The dictionaries contain the @@ -95,11 +103,39 @@ def _sanitize_parameters(self, **kwargs): return preprocess_params, {}, {} - def preprocess(self, audio, candidate_labels=None, hypothesis_template="This is a photo of {}."): + def preprocess(self, audio, candidate_labels=None, hypothesis_template="This is a recording of {}."): + n = len(candidate_labels) + for i, candidate_label in enumerate(candidate_labels): + audio = ffmpeg_read(audio) + audios = self.feature_extractor(audios=[audio], return_tensors=self.framework) + sequence = hypothesis_template.format(candidate_label) + inputs = self.tokenizer(sequence, return_tensors=self.framework) + inputs["input_features"] = audios.input_features + yield {"is_last": i == n - 1, "candidate_label": candidate_label, **inputs} + + def preprocess(self, audio, candidate_labels=None, hypothesis_template="This is a recording of {}."): + if isinstance(audio, str): + if audio.startswith("http://") or audio.startswith("https://"): + # We need to actually check for a real protocol, otherwise it's impossible to use a local file + # like http_huggingface_co.png + audio = requests.get(audio).content + else: + with open(audio, "rb") as f: + audio = f.read() + + if isinstance(audio, bytes): + audio = ffmpeg_read(audio, self.feature_extractor.sampling_rate) + + if not isinstance(audio, np.ndarray): + raise ValueError("We expect a numpy ndarray as input") + if len(audio.shape) != 1: + raise ValueError("We expect a single channel audio input for AutomaticSpeechRecognitionPipeline") + n = len(candidate_labels) for i, candidate_label in enumerate(candidate_labels): - audio = load_audio(audio) - audios = self.image_processor(audios=[audio], return_tensors=self.framework) + audios = self.feature_extractor( + audio, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt" + ) sequence = hypothesis_template.format(candidate_label) inputs = self.tokenizer(sequence, return_tensors=self.framework) inputs["input_features"] = audios.input_features @@ -110,23 +146,23 @@ def _forward(self, model_inputs): candidate_label = model_inputs.pop("candidate_label") outputs = self.model(**model_inputs) - # Clip does crossproduct scoring by default, so we're only + # CLAP does crossproduct scoring by default, so we're only # interested in the results where audio and text and in the same # batch position. 
- diag = torch.diagonal if self.framework == "pt" else tf.linalg.diag_part - logits_per_image = diag(outputs.logits_per_image) + diag = torch.diagonal + logits_per_audio = diag(outputs.logits_per_audio) model_outputs = { "is_last": is_last, "candidate_label": candidate_label, - "logits_per_image": logits_per_image, + "logits_per_audio": logits_per_audio, } return model_outputs def postprocess(self, model_outputs): candidate_labels = [outputs["candidate_label"] for outputs in model_outputs] if self.framework == "pt": - logits = torch.cat([output["logits_per_image"] for output in model_outputs]) + logits = torch.cat([output["logits_per_audio"] for output in model_outputs]) probs = logits.softmax(dim=0) scores = probs.tolist() else: From f0d2194aeb0a55afe635763fb0c639d370f96b59 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Wed, 8 Feb 2023 15:33:43 +0000 Subject: [PATCH 086/197] fixup --- src/transformers/__init__.py | 2 +- .../feature_extraction_sequence_utils.py | 25 ++++++++++--------- .../zero_shot_audio_classification.py | 25 +++++++------------ 3 files changed, 23 insertions(+), 29 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 7142c884c5c9..45e61b4b3154 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -541,10 +541,10 @@ "TranslationPipeline", "VideoClassificationPipeline", "VisualQuestionAnsweringPipeline", + "ZeroShotAudioClassificationPipeline", "ZeroShotClassificationPipeline", "ZeroShotImageClassificationPipeline", "ZeroShotObjectDetectionPipeline", - "ZeroShotAudioClassificationPipeline", "pipeline", ], "processing_utils": ["ProcessorMixin"], diff --git a/src/transformers/feature_extraction_sequence_utils.py b/src/transformers/feature_extraction_sequence_utils.py index 8a23cf9727a5..8128f194aa72 100644 --- a/src/transformers/feature_extraction_sequence_utils.py +++ b/src/transformers/feature_extraction_sequence_utils.py @@ -312,9 +312,9 @@ def _truncate( max_length: maximum length of the returned list and optionally padding length (see below) pad_to_multiple_of (optional) : - Integer if set will pad the sequence to a multiple of the provided value. - This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability - `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. + Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to + enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta), or on TPUs + which benefit from having sequence lengths be a multiple of 128. truncation (optional): Activates truncation to cut input sequences longer than `max_length` to `max_length`. """ @@ -482,12 +482,12 @@ def get_mel_filter_banks( mel_scale: str = "htk", ) -> np.array: """ - Create a frequency bin conversion matrix used to obtain the Mel Spectrogram. This is called - a *mel filter bank*, and various implementation exist, which differ in the number of filters, the shape of the - filters, the way the filters are spaced, the bandwidth of the filters, and the manner in which the spectrum is - warped. The goal of these features is to approximate the non-linear human perception of the variation in pitch - with respect to the frequency. This code is heavily inspired from the *torchaudio* implementation, see [here](https://pytorch.org/audio/stable/transforms.html) - for more details. + Create a frequency bin conversion matrix used to obtain the Mel Spectrogram. 
This is called a *mel filter + bank*, and various implementation exist, which differ in the number of filters, the shape of the filters, the + way the filters are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The + goal of these features is to approximate the non-linear human perception of the variation in pitch with respect + to the frequency. This code is heavily inspired from the *torchaudio* implementation, see + [here](https://pytorch.org/audio/stable/transforms.html) for more details. Note: @@ -593,7 +593,8 @@ def _stft(self, frames, window): def _power_to_db(self, mel_spectrogram, a_min=1e-10, ref=1.0): """ - Convert a mel spectrogram from power to db scale, this function is the numpy implementation of librosa.power_to_lb. + Convert a mel spectrogram from power to db scale, this function is the numpy implementation of + librosa.power_to_lb. """ log_spec = 10 * np.log10(np.clip(mel_spectrogram, a_min=a_min, a_max=None)) log_spec -= 10.0 * np.log10(np.maximum(a_min, ref)) @@ -611,8 +612,8 @@ def _fram_wave(self, waveform: np.array, center: bool = True): The window length (self.window_length) defines how much of the signal is contained in each frame, while the hop length defines the step between the beginning of each new frame. - #TODO @Arthur **This method does not support batching yet as we are mainly focus on inference. If you want this to be added - feel free to open an issue and ping @arthurzucker on Github** + #TODO @Arthur **This method does not support batching yet as we are mainly focus on inference. If you want this + to be added feel free to open an issue and ping @arthurzucker on Github** Args: waveform (`np.array`) of shape (sample_length,): diff --git a/src/transformers/pipelines/zero_shot_audio_classification.py b/src/transformers/pipelines/zero_shot_audio_classification.py index d9a81c16d32b..1b5df5f3c97e 100644 --- a/src/transformers/pipelines/zero_shot_audio_classification.py +++ b/src/transformers/pipelines/zero_shot_audio_classification.py @@ -1,4 +1,7 @@ -from typing import List, Union +from typing import Union + +import numpy as np +import requests from ..utils import ( add_end_docstrings, @@ -6,10 +9,10 @@ logging, requires_backends, ) -from .base import PIPELINE_INIT_ARGS, ChunkPipeline from .audio_classification import ffmpeg_read -import requests -import numpy as np +from .base import PIPELINE_INIT_ARGS, ChunkPipeline + + if is_torch_available(): import torch @@ -55,7 +58,7 @@ def __init__(self, **kwargs): super().__init__(**kwargs) requires_backends(self, "audio") - + if self.framework != "pt": raise ValueError(f"The {self.__class__} is only available in PyTorch.") # No specific FOR_XXX available yet @@ -103,16 +106,6 @@ def _sanitize_parameters(self, **kwargs): return preprocess_params, {}, {} - def preprocess(self, audio, candidate_labels=None, hypothesis_template="This is a recording of {}."): - n = len(candidate_labels) - for i, candidate_label in enumerate(candidate_labels): - audio = ffmpeg_read(audio) - audios = self.feature_extractor(audios=[audio], return_tensors=self.framework) - sequence = hypothesis_template.format(candidate_label) - inputs = self.tokenizer(sequence, return_tensors=self.framework) - inputs["input_features"] = audios.input_features - yield {"is_last": i == n - 1, "candidate_label": candidate_label, **inputs} - def preprocess(self, audio, candidate_labels=None, hypothesis_template="This is a recording of {}."): if isinstance(audio, str): if audio.startswith("http://") or 
audio.startswith("https://"): @@ -130,7 +123,7 @@ def preprocess(self, audio, candidate_labels=None, hypothesis_template="This is raise ValueError("We expect a numpy ndarray as input") if len(audio.shape) != 1: raise ValueError("We expect a single channel audio input for AutomaticSpeechRecognitionPipeline") - + n = len(candidate_labels) for i, candidate_label in enumerate(candidate_labels): audios = self.feature_extractor( From 754bed1f414d70d703a765e1065962e451b43513 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Wed, 8 Feb 2023 15:39:10 +0000 Subject: [PATCH 087/197] fix copies --- src/transformers/models/clap/modeling_clap.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 022d7529d59c..3117e3601e1b 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -153,7 +153,7 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) -# Copied from transformers.models.clip.modeling_clip.clip_loss with clip->clap +# Copied from transformers.models.clip.modeling_clip.clip_loss with clip->clap, image->audio def clap_loss(similarity: torch.Tensor) -> torch.Tensor: caption_loss = contrastive_loss(similarity) audio_loss = contrastive_loss(similarity.t()) @@ -220,7 +220,7 @@ class CLAPAudioModelOutput(ModelOutput): @dataclass -# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->CLAP,vision->audio,Vision->Audio,audio->audio +# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->CLAP, vision->audio, Vision->Audio, image->audio class CLAPOutput(ModelOutput): """ Args: From 5f0e467d8e1785499b77566c71bf785cea053ccc Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 8 Feb 2023 16:36:53 +0000 Subject: [PATCH 088/197] all CI tests pass --- .../models/clap/configuration_clap.py | 7 +- src/transformers/models/clap/modeling_clap.py | 65 +++++++++++++----- tests/models/clap/test_modeling_clap.py | 68 +++++++++++-------- 3 files changed, 92 insertions(+), 48 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 79330dfee9f3..d32c7a64365d 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -213,7 +213,7 @@ class CLAPAudioConfig(PretrainedConfig): >>> configuration = model.config ```""" - model_type = "clap_vision_model" + model_type = "clap_audio_model" def __init__( self, @@ -222,7 +222,7 @@ def __init__( spec_size=256, hidden_act="gelu", patch_size=4, - patch_stride=(4, 4), + patch_stride=[4, 4], num_classes=527, hidden_size=96, projection_hidden_size=768, @@ -242,6 +242,7 @@ def __init__( qkv_bias=True, mlp_ratio=4.0, aff_block_r=4, + num_hidden_layers=4, enable_patch_fusion=False, projection_hidden_act="relu", layer_norm_eps=1e-5, @@ -257,7 +258,7 @@ def __init__( self.num_classes = num_classes self.hidden_size = hidden_size self.depths = depths - self.num_hidden_layers = len(depths) + self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.window_size = window_size self.enable_fusion = enable_fusion diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 3117e3601e1b..3d6878717594 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ 
b/src/transformers/models/clap/modeling_clap.py @@ -216,7 +216,35 @@ class CLAPAudioModelOutput(ModelOutput): embedding: torch.FloatTensor = None attentions: Optional[Tuple[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class CLAPAudioModelOutputWithProjection(ModelOutput): + """ + CLAPAudio model output to mimic the output of the original implementation. + + Args: + framewise_output (`torch.FloatTensor` of shape `(batch_size, num_frames, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + clipwise_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + fine_grained_embedding (`torch.FloatTensor` of shape `(batch_size, num_frames, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + embedding (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + """ + projection_output: Optional[torch.FloatTensor] = None + framewise_output: torch.FloatTensor = None + clipwise_output: torch.FloatTensor = None + fine_grained_embedding: torch.FloatTensor = None + embedding: torch.FloatTensor = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None @dataclass @@ -1080,7 +1108,14 @@ def custom_forward(*inputs): hidden_states = torch.flatten(hidden_states, 1) if not return_dict: - return (framewise_output, torch.sigmoid(hidden_states), fine_grained_latent_output, latent_output) + return ( + framewise_output, + torch.sigmoid(hidden_states), + fine_grained_latent_output, + latent_output, + all_self_attentions, + all_reshaped_hidden_states, + ) return CLAPAudioModelOutput( framewise_output=framewise_output, @@ -1779,7 +1814,6 @@ def _init_weights(self, module): module.position_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02) module.token_type_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02) elif isinstance(module, CLAPTextSelfAttention): - factor = self.config.initializer_factor in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor nn.init.normal_(module.query.weight, std=in_proj_std) nn.init.normal_(module.key.weight, std=in_proj_std) @@ -1796,32 +1830,28 @@ def _init_weights(self, module): CLAPAudioOutput, ), ): - factor = self.config.initializer_factor in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor nn.init.normal_(module.dense.weight, std=in_proj_std) elif isinstance(module, CLAPProjectionLayer): - factor = self.config.initializer_factor in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor nn.init.normal_(module.linear1.weight, std=in_proj_std) nn.init.normal_(module.linear2.weight, std=in_proj_std) elif isinstance(module, CLAPAudioPatchEmbed): - factor = 
self.config.initializer_factor in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor nn.init.normal_(module.proj.weight, std=in_proj_std) elif isinstance(module, CLAPAudioSelfAttention): - factor = self.config.initializer_factor in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor nn.init.normal_(module.query.weight, std=in_proj_std) nn.init.normal_(module.key.weight, std=in_proj_std) nn.init.normal_(module.value.weight, std=in_proj_std) elif isinstance(module, CLAPAudioPatchMerging): - factor = self.config.initializer_factor in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor nn.init.normal_(module.reduction.weight, std=in_proj_std) elif isinstance(module, CLAPAudioEncoder): - factor = self.config.initializer_factor in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor nn.init.normal_(module.head.weight, std=in_proj_std) + elif isinstance(module, CLAPFusionBlock): + nn.init.normal_(module.linear.weight, std=factor * 0.02) if isinstance(module, nn.LayerNorm): module.bias.data.zero_() @@ -1829,11 +1859,13 @@ def _init_weights(self, module): if isinstance(module, nn.Linear) and module.bias is not None: module.bias.data.zero_() if isinstance(module, nn.Conv2d): - factor = self.config.initializer_factor in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor nn.init.normal_(module.weight, std=in_proj_std) if module.bias is not None: module.bias.data.zero_() + if isinstance(module, CLAPModel): + nn.init.normal_(module.logit_scale_a, std=factor * 0.02) + nn.init.normal_(module.logit_scale_t, std=factor * 0.02) def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, CLAPTextEncoder): @@ -2246,7 +2278,7 @@ def forward( return_dict=return_dict, ) - audio_embeds = audio_outputs[-1] if not return_dict else audio_outputs.embedding + audio_embeds = audio_outputs[-3] if not return_dict else audio_outputs.embedding audio_embeds = self.audio_projection(audio_embeds) text_embeds = text_outputs[1] @@ -2422,19 +2454,20 @@ def forward( return_dict=return_dict, ) - pooled_output = audio_outputs[-1] if not return_dict else audio_outputs.embedding + pooled_output = audio_outputs[-3] if not return_dict else audio_outputs.embedding audio_embeds = self.audio_projection(pooled_output) if not return_dict: - outputs = (audio_embeds, audio_outputs[0]) + audio_outputs[2:] - return tuple(output for output in outputs if output is not None) + outputs = (audio_embeds, *audio_outputs) + return outputs - return CLAPAudioModelOutput( + return CLAPAudioModelOutputWithProjection( projection_output=audio_embeds, - clipwise_output=audio_outputs.clipwise_output, framewise_output=audio_outputs.framewise_output, + clipwise_output=audio_outputs.clipwise_output, + fine_grained_embedding=audio_outputs.fine_grained_embedding, embedding=audio_outputs.embedding, - hidden_states=audio_outputs.hidden_states, attentions=audio_outputs.attentions, + hidden_states=audio_outputs.hidden_states, ) diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 591ba8d2ca6e..134a6f268e0a 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -62,7 +62,6 @@ if is_vision_available(): from PIL import Image - from transformers import CLAPProcessor if is_flax_available(): @@ -214,6 +213,43 @@ def 
test_model_common_attributes(self): x = model.get_output_embeddings() self.assertTrue(x is None or isinstance(x, nn.Linear)) + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [self.model_tester.hidden_size, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + @unittest.skip(reason="CLAPAudio does not output any loss term in the forward pass") + def test_retain_grad_hidden_states_attentions(self): + pass + def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() @@ -723,31 +759,5 @@ def prepare_img(): @require_vision @require_torch class CLAPModelIntegrationTest(unittest.TestCase): - @slow - def test_inference(self): - model_name = "laion-ai/base" - model = CLAPModel.from_pretrained(model_name).to(torch_device) - processor = CLAPProcessor.from_pretrained(model_name) - - image = prepare_img() - inputs = processor( - text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt" - ).to(torch_device) - - # forward pass - with torch.no_grad(): - outputs = model(**inputs) - - # verify the logits - self.assertEqual( - outputs.logits_per_audio.shape, - torch.Size((inputs.input_features.shape[0], inputs.input_ids.shape[0])), - ) - self.assertEqual( - outputs.logits_per_text.shape, - torch.Size((inputs.input_ids.shape[0], inputs.input_features.shape[0])), - ) - - expected_logits = torch.tensor([[24.5701, 19.3049]], device=torch_device) - - self.assertTrue(torch.allclose(outputs.logits_per_audio, expected_logits, atol=1e-3)) + # TODO! + pass From 47f714f48c0889c34a1a82bc8bef2c88e8956c51 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 8 Feb 2023 16:57:03 +0000 Subject: [PATCH 089/197] make fixup + fix docs --- docs/source/en/model_doc/clap.mdx | 12 ------------ tests/models/clap/test_modeling_clap.py | 1 - 2 files changed, 13 deletions(-) diff --git a/docs/source/en/model_doc/clap.mdx b/docs/source/en/model_doc/clap.mdx index c5918ca576da..f17eef1fcfaf 100644 --- a/docs/source/en/model_doc/clap.mdx +++ b/docs/source/en/model_doc/clap.mdx @@ -44,18 +44,6 @@ The original code can be found [here](https://github.com/LAION-AI/CLAP). 
[[autodoc]] CLAPAudioConfig -## CLAPTokenizer - -[[autodoc]] CLAPTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## CLAPTokenizerFast - -[[autodoc]] CLAPTokenizerFast - ## CLAPFeatureExtractor [[autodoc]] CLAPFeatureExtractor diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 134a6f268e0a..cfbc001605bf 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -63,7 +63,6 @@ from PIL import Image - if is_flax_available(): import jax.numpy as jnp From 7d3ede511e97a042dc9dedfe3804070d5e4104ea Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 8 Feb 2023 17:04:24 +0000 Subject: [PATCH 090/197] fix docs --- docs/source/en/model_doc/clap.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/clap.mdx b/docs/source/en/model_doc/clap.mdx index f17eef1fcfaf..9f2ced3f109e 100644 --- a/docs/source/en/model_doc/clap.mdx +++ b/docs/source/en/model_doc/clap.mdx @@ -57,7 +57,7 @@ The original code can be found [here](https://github.com/LAION-AI/CLAP). [[autodoc]] CLAPModel - forward - get_text_features - - get_image_features + - get_audio_features ## CLAPTextModel From 4e5db4b6e7143c4d34d0fbd6e13f7514102fc860 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 8 Feb 2023 17:12:41 +0000 Subject: [PATCH 091/197] fix docs --- docs/source/en/_toctree.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 8f477d79d463..2e9a479091e0 100755 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -481,6 +481,8 @@ sections: - local: model_doc/audio-spectrogram-transformer title: Audio Spectrogram Transformer + - local: model_doc/clap + title: CLAP - local: model_doc/hubert title: Hubert - local: model_doc/mctct From 5d23429f109343e690df050771b57f557a8a0e5e Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Wed, 8 Feb 2023 17:17:10 +0000 Subject: [PATCH 092/197] update tests pip;eline --- ...ipelines_zero_shot_audio_classification.py | 225 ++++++------------ 1 file changed, 69 insertions(+), 156 deletions(-) diff --git a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py index a5c1e9ecd44e..476e07cc2241 100644 --- a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py +++ b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py @@ -14,24 +14,15 @@ import unittest -from transformers import is_audio_available -from transformers.pipelines import pipeline -from transformers.testing_utils import nested_simplify, require_audio, require_tf, require_torch, slow - -from .test_pipelines_common import ANY, PipelineTestCaseMeta +from datasets import load_dataset +from transformers.pipelines import pipeline +from transformers.testing_utils import require_torch -if is_audio_available(): - pass -else: - - class Audio: - @staticmethod - def open(*args, **kwargs): - pass +from .test_pipelines_common import PipelineTestCaseMeta -@require_audio +@require_torch class ZeroShotAudioClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): # Deactivating auto tests since we don't have a good MODEL_FOR_XX mapping, # and only CLAP would be there for now. 
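For reference, the call pattern these tests exercise looks roughly like the sketch below. It assumes this patch series is applied; the checkpoint name, dataset, and candidate labels are taken from the large-model test later in this file, and the audio is assumed to be a 1-D numpy array of raw samples.

```python
from datasets import load_dataset
from transformers import pipeline

# Checkpoint and dataset used by the large-model test further down in this file.
audio_classifier = pipeline(task="zero-shot-audio-classification", model="ybelkada/clap-htsat-unfused")

dataset = load_dataset("ashraq/esc50")
audio = dataset["train"]["audio"][-1]["array"]  # 1-D numpy array of raw samples

# Returns a list of {"score": float, "label": str} dicts, one per candidate label.
predictions = audio_classifier(audio, candidate_labels=["Sound of a dog", "Sound of vacuum cleaner"])
```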
@@ -61,169 +52,91 @@ class ZeroShotAudioClassificationPipelineTests(unittest.TestCase, metaclass=Pipe # # Batching # outputs = pipe([audio] * 3, batch_size=2, candidate_labels=["A", "B"]) - @require_torch def test_small_model_pt(self): - audio_classifier = pipeline( - model="hf-internal-testing/tiny-random-clap-zero-shot-audio-classification", - ) - audio = Audio.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - output = audio_classifier(audio, candidate_labels=["a", "b", "c"]) - - self.assertEqual( - nested_simplify(output), - [{"score": 0.333, "label": "a"}, {"score": 0.333, "label": "b"}, {"score": 0.333, "label": "c"}], - ) - - output = audio_classifier([audio] * 5, candidate_labels=["A", "B", "C"], batch_size=2) - self.assertEqual( - nested_simplify(output), - # Pipeline outputs are supposed to be deterministic and - # So we could in theory have real values "A", "B", "C" instead - # of ANY(str). - # However it seems that in this particular case, the floating - # scores are so close, we enter floating error approximation - # and the order is not guaranteed anymore with batching. - [ - [ - {"score": 0.333, "label": ANY(str)}, - {"score": 0.333, "label": ANY(str)}, - {"score": 0.333, "label": ANY(str)}, - ], - [ - {"score": 0.333, "label": ANY(str)}, - {"score": 0.333, "label": ANY(str)}, - {"score": 0.333, "label": ANY(str)}, - ], - [ - {"score": 0.333, "label": ANY(str)}, - {"score": 0.333, "label": ANY(str)}, - {"score": 0.333, "label": ANY(str)}, - ], - [ - {"score": 0.333, "label": ANY(str)}, - {"score": 0.333, "label": ANY(str)}, - {"score": 0.333, "label": ANY(str)}, - ], - [ - {"score": 0.333, "label": ANY(str)}, - {"score": 0.333, "label": ANY(str)}, - {"score": 0.333, "label": ANY(str)}, - ], - ], - ) + pass - @require_tf - def test_small_model_tf(self): - audio_classifier = pipeline( - model="hf-internal-testing/tiny-random-clap-zero-shot-audio-classification", framework="tf" - ) - audio = Audio.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - output = audio_classifier(audio, candidate_labels=["a", "b", "c"]) + # @require_torch + # def test_small_model_pt(self): + # audio_classifier = pipeline( + # model="hf-internal-testing/tiny-random-clap-zero-shot-audio-classification", + # ) + # audio = Audio.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + # output = audio_classifier(audio, candidate_labels=["a", "b", "c"]) - self.assertEqual( - nested_simplify(output), - [{"score": 0.333, "label": "a"}, {"score": 0.333, "label": "b"}, {"score": 0.333, "label": "c"}], - ) + # self.assertEqual( + # nested_simplify(output), + # [{"score": 0.333, "label": "a"}, {"score": 0.333, "label": "b"}, {"score": 0.333, "label": "c"}], + # ) - output = audio_classifier([audio] * 5, candidate_labels=["A", "B", "C"], batch_size=2) - self.assertEqual( - nested_simplify(output), - # Pipeline outputs are supposed to be deterministic and - # So we could in theory have real values "A", "B", "C" instead - # of ANY(str). - # However it seems that in this particular case, the floating - # scores are so close, we enter floating error approximation - # and the order is not guaranteed anymore with batching. 
- [ - [ - {"score": 0.333, "label": ANY(str)}, - {"score": 0.333, "label": ANY(str)}, - {"score": 0.333, "label": ANY(str)}, - ], - [ - {"score": 0.333, "label": ANY(str)}, - {"score": 0.333, "label": ANY(str)}, - {"score": 0.333, "label": ANY(str)}, - ], - [ - {"score": 0.333, "label": ANY(str)}, - {"score": 0.333, "label": ANY(str)}, - {"score": 0.333, "label": ANY(str)}, - ], - [ - {"score": 0.333, "label": ANY(str)}, - {"score": 0.333, "label": ANY(str)}, - {"score": 0.333, "label": ANY(str)}, - ], - [ - {"score": 0.333, "label": ANY(str)}, - {"score": 0.333, "label": ANY(str)}, - {"score": 0.333, "label": ANY(str)}, - ], - ], - ) + # output = audio_classifier([audio] * 5, candidate_labels=["A", "B", "C"], batch_size=2) + # self.assertEqual( + # nested_simplify(output), + # # Pipeline outputs are supposed to be deterministic and + # # So we could in theory have real values "A", "B", "C" instead + # # of ANY(str). + # # However it seems that in this particular case, the floating + # # scores are so close, we enter floating error approximation + # # and the order is not guaranteed anymore with batching. + # [ + # [ + # {"score": 0.333, "label": ANY(str)}, + # {"score": 0.333, "label": ANY(str)}, + # {"score": 0.333, "label": ANY(str)}, + # ], + # [ + # {"score": 0.333, "label": ANY(str)}, + # {"score": 0.333, "label": ANY(str)}, + # {"score": 0.333, "label": ANY(str)}, + # ], + # [ + # {"score": 0.333, "label": ANY(str)}, + # {"score": 0.333, "label": ANY(str)}, + # {"score": 0.333, "label": ANY(str)}, + # ], + # [ + # {"score": 0.333, "label": ANY(str)}, + # {"score": 0.333, "label": ANY(str)}, + # {"score": 0.333, "label": ANY(str)}, + # ], + # [ + # {"score": 0.333, "label": ANY(str)}, + # {"score": 0.333, "label": ANY(str)}, + # {"score": 0.333, "label": ANY(str)}, + # ], + # ], + # ) - @slow + # @slow @require_torch def test_large_model_pt(self): audio_classifier = pipeline( task="zero-shot-audio-classification", - model="openai/clap-vit-base-patch32", + model="ybelkada/clap-htsat-unfused", ) # This is an audio of 2 cats with remotes and no planes - audio = Audio.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - output = audio_classifier(audio, candidate_labels=["cat", "plane", "remote"]) - - self.assertEqual( - nested_simplify(output), - [ - {"score": 0.511, "label": "remote"}, - {"score": 0.485, "label": "cat"}, - {"score": 0.004, "label": "plane"}, - ], - ) + dataset = load_dataset("ashraq/esc50") + audio = dataset["train"]["audio"][-1]["array"] + output = audio_classifier(audio, candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"]) - output = audio_classifier([audio] * 5, candidate_labels=["cat", "plane", "remote"], batch_size=2) - self.assertEqual( - nested_simplify(output), - [ - [ - {"score": 0.511, "label": "remote"}, - {"score": 0.485, "label": "cat"}, - {"score": 0.004, "label": "plane"}, - ], - ] - * 5, - ) - - @slow - @require_tf - def test_large_model_tf(self): - audio_classifier = pipeline( - task="zero-shot-audio-classification", model="openai/clap-vit-base-patch32", framework="tf" - ) - # This is an audio of 2 cats with remotes and no planes - audio = Audio.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - output = audio_classifier(audio, candidate_labels=["cat", "plane", "remote"]) self.assertEqual( - nested_simplify(output), + output, [ - {"score": 0.511, "label": "remote"}, - {"score": 0.485, "label": "cat"}, - {"score": 0.004, "label": "plane"}, + {"score": 0.9990969896316528, "label": "Sound of a dog"}, + {"score": 
0.0009030875517055392, "label": "Sound of vaccum cleaner"}, ], ) - output = audio_classifier([audio] * 5, candidate_labels=["cat", "plane", "remote"], batch_size=2) + output = audio_classifier([audio] * 5, candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"]) self.assertEqual( - nested_simplify(output), + output, [ [ - {"score": 0.511, "label": "remote"}, - {"score": 0.485, "label": "cat"}, - {"score": 0.004, "label": "plane"}, - ], + {"score": 0.9990969896316528, "label": "Sound of a dog"}, + {"score": 0.0009030875517055392, "label": "Sound of vaccum cleaner"}, + ] ] * 5, ) + # TODO batching will be supported in next PR, the base pipeline needs to be modified + # output = audio_classifier([audio] * 5, candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"], batch_size=5) From aaa68ce519986443dc94b709375a5b9a8d4ab9d7 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Wed, 8 Feb 2023 17:17:23 +0000 Subject: [PATCH 093/197] update zero shot pipeline --- src/transformers/pipelines/zero_shot_audio_classification.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/transformers/pipelines/zero_shot_audio_classification.py b/src/transformers/pipelines/zero_shot_audio_classification.py index 1b5df5f3c97e..9a839ee36893 100644 --- a/src/transformers/pipelines/zero_shot_audio_classification.py +++ b/src/transformers/pipelines/zero_shot_audio_classification.py @@ -7,7 +7,6 @@ add_end_docstrings, is_torch_available, logging, - requires_backends, ) from .audio_classification import ffmpeg_read from .base import PIPELINE_INIT_ARGS, ChunkPipeline @@ -57,8 +56,6 @@ class ZeroShotAudioClassificationPipeline(ChunkPipeline): def __init__(self, **kwargs): super().__init__(**kwargs) - requires_backends(self, "audio") - if self.framework != "pt": raise ValueError(f"The {self.__class__} is only available in PyTorch.") # No specific FOR_XXX available yet From c8e9c73cb10add43c4b9269594f518fc26965905 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Wed, 8 Feb 2023 17:17:42 +0000 Subject: [PATCH 094/197] update feature extraction clap --- src/transformers/models/clap/feature_extraction_clap.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 4b3114cd20ea..a673ad9f2b3c 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -160,8 +160,9 @@ def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[n magnitudes = np.abs(stft) ** 2 mel_spec = np.matmul(mel_filters.T, magnitudes) - log_mel_spec = self._power_to_db(mel_spec) - return log_mel_spec.T + log_mel_spec = self._power_to_db(mel_spec).T + log_mel_spec = np.asarray(log_mel_spec, np.float32) + return log_mel_spec @staticmethod # Copied from transformers.models.wav2vec2.feature_extraction_wav2vec2.Wav2Vec2FeatureExtractor.zero_mean_unit_var_norm @@ -266,8 +267,8 @@ def _get_input_mel(self, waveform: np.array, max_length, truncation, padding) -> def __call__( self, raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]], - truncation: str = "fusion", - padding: Optional[str] = "repeatpad", + truncation: str = None, + padding: Optional[str] = None, max_length: Optional[int] = None, sampling_rate: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, From c54fce3d4b37ca8ebdec2e8b8b4322a73826da73 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Wed, 8 Feb 
2023 17:17:52 +0000 Subject: [PATCH 095/197] update tokenization auto --- src/transformers/models/auto/tokenization_auto.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index cc91c11617f0..f39e447966fb 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -90,6 +90,13 @@ ), ("canine", ("CanineTokenizer", None)), ("chinese_clip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ( + "clap", + ( + "RobertaTokenizer", + "RobertaTokenizerFast" if is_tokenizers_available() else None, + ), + ), ( "clip", ( From 4debc3df79eef884253a624cd8dba9a414f58366 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Wed, 8 Feb 2023 17:31:42 +0000 Subject: [PATCH 096/197] use nested simplify --- ...t_pipelines_zero_shot_audio_classification.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py index 476e07cc2241..8f4d7cbc3187 100644 --- a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py +++ b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py @@ -17,7 +17,7 @@ from datasets import load_dataset from transformers.pipelines import pipeline -from transformers.testing_utils import require_torch +from transformers.testing_utils import nested_simplify, require_torch from .test_pipelines_common import PipelineTestCaseMeta @@ -120,21 +120,21 @@ def test_large_model_pt(self): output = audio_classifier(audio, candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"]) self.assertEqual( - output, + nested_simplify(output), [ - {"score": 0.9990969896316528, "label": "Sound of a dog"}, - {"score": 0.0009030875517055392, "label": "Sound of vaccum cleaner"}, + {"score": 0.999, "label": "Sound of a dog"}, + {"score": 0.001, "label": "Sound of vaccum cleaner"}, ], ) output = audio_classifier([audio] * 5, candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"]) self.assertEqual( - output, + nested_simplify(output), [ [ - {"score": 0.9990969896316528, "label": "Sound of a dog"}, - {"score": 0.0009030875517055392, "label": "Sound of vaccum cleaner"}, - ] + {"score": 0.999, "label": "Sound of a dog"}, + {"score": 0.001, "label": "Sound of vaccum cleaner"}, + ], ] * 5, ) From 0be1e66f97d916c8c23bc8d4a84f5f18f2905b92 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Wed, 8 Feb 2023 17:54:26 +0000 Subject: [PATCH 097/197] update pipeline tests --- .../test_pipelines_zero_shot_audio_classification.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py index 8f4d7cbc3187..f79f321830d9 100644 --- a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py +++ b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py @@ -56,6 +56,10 @@ class ZeroShotAudioClassificationPipelineTests(unittest.TestCase, metaclass=Pipe def test_small_model_pt(self): pass + + def test_small_model_tf(self): + pass + # @require_torch # def test_small_model_pt(self): # audio_classifier = pipeline( @@ -114,7 +118,7 @@ def test_large_model_pt(self): task="zero-shot-audio-classification", model="ybelkada/clap-htsat-unfused", ) - # This is an audio of 2 cats with remotes and no planes + # 
This is an audio of a dog dataset = load_dataset("ashraq/esc50") audio = dataset["train"]["audio"][-1]["array"] output = audio_classifier(audio, candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"]) @@ -140,3 +144,6 @@ def test_large_model_pt(self): ) # TODO batching will be supported in next PR, the base pipeline needs to be modified # output = audio_classifier([audio] * 5, candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"], batch_size=5) + + def test_large_model_tf(self): + pass \ No newline at end of file From c2e207bb9b05a750450b77bebf1b4d0a83b1624c Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Wed, 8 Feb 2023 18:57:49 +0100 Subject: [PATCH 098/197] Apply suggestions from code review Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- src/transformers/models/clap/modeling_clap.py | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 3d6878717594..d02d89d741ff 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2023 The OpenAI Team Authors and The HuggingFace Team. All rights reserved. +# Copyright 2023 The LAION-AI Team Authors and The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -60,9 +60,9 @@ def do_mixup(hidden_states, mixup_lambda): because the decision boundary becomes smooth. Args: - hidden_states: (`torch.FloatTensor` of shape (batch_size, seq_length, hidden_size)) + hidden_states (`torch.FloatTensor` of shape (batch_size, seq_length, hidden_size)) : Input hidden states - mixup_lambda: (`torch.FloatTensor`) + mixup_lambda (`torch.FloatTensor`): Mixing ratio sampled from the Beta distribution """ out = ( @@ -78,9 +78,9 @@ def interpolate(hidden_states, ratio): Interpolate data in time domain. This is used to compensate the resolution reduction in downsampling of a CNN. Args: - hidden_states: (`torch.FloatTensor` of shape (batch_size, time_steps, classes_num)) + hidden_states (`torch.FloatTensor` of shape (batch_size, time_steps, classes_num)): Input hidden states - ratio: (`int`) + ratio (`int`): The ratio of the length of the output to the length of the input. 
""" (batch_size, time_steps, classes_num) = hidden_states.shape @@ -96,9 +96,9 @@ def window_partition(hidden_states, window_size): num_channels)` Args: - hidden_states: (`torch.FloatTensor` of shape `(batch_size, height, width, num_channels)`) + hidden_states (`torch.FloatTensor` of shape `(batch_size, height, width, num_channels)`): Input hidden states - window_size: (`int`) + window_size (`int`): Window size """ batch_size, height, width, num_channels = hidden_states.shape @@ -301,9 +301,8 @@ def forward(self, hidden_states): return hidden_states keep_prob = 1 - self.drop_prob - shape = (hidden_states.shape[0],) + (1,) * ( - hidden_states.ndim - 1 - ) # work with diff dim tensors, not just 2D ConvNets + # work with diff dim tensors, not just 2D ConvNets + shape = (hidden_states.shape[0],) + (1,) * (hidden_states.ndim - 1) random_tensor = keep_prob + torch.rand(shape, dtype=hidden_states.dtype, device=hidden_states.device) random_tensor.floor_() # binarize @@ -988,7 +987,6 @@ def forward( always_partition: Optional[bool] = False, return_dict: Optional[bool] = True, ) -> Union[Tuple, CLAPAudioModelOutput]: - # print(input_features.shape, self.enable_fusion) input_features = input_features.transpose(1, 3) hidden_states = self.bn0(input_features) @@ -1001,7 +999,7 @@ def forward( hidden_states = self.reshape_wav2img(hidden_states) - _, _, frames_num, _ = hidden_states.shape + frames_num = hidden_states.shape[2] hidden_states = self.patch_embed(hidden_states, is_longer_list_idx) From e47c69672c7d6b9b72c390d7ac3ebae902983f57 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 8 Feb 2023 18:00:13 +0000 Subject: [PATCH 099/197] split in two lines --- src/transformers/models/clap/modeling_clap.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index d02d89d741ff..9bad1de15259 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -65,10 +65,10 @@ def do_mixup(hidden_states, mixup_lambda): mixup_lambda (`torch.FloatTensor`): Mixing ratio sampled from the Beta distribution """ - out = ( - hidden_states.transpose(0, -1) * mixup_lambda - + torch.flip(hidden_states, dims=[0]).transpose(0, -1) * (1 - mixup_lambda) - ).transpose(0, -1) + intermediate_hidden_states = hidden_states.transpose(0, -1) * mixup_lambda + flipped_hidden_states = torch.flip(hidden_states, dims=[0]).transpose(0, -1) * (1 - mixup_lambda) + out = intermediate_hidden_states + flipped_hidden_states + out = out.transpose(0, -1) return out From dbab96e8cfabeebd206940d82ad5fb2492ecb72a Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 8 Feb 2023 18:06:55 +0000 Subject: [PATCH 100/197] fixes --- src/transformers/models/clap/modeling_clap.py | 32 +++++++++---------- ...ipelines_zero_shot_audio_classification.py | 3 +- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 9bad1de15259..2a31180bf26c 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -114,13 +114,13 @@ def window_partition(hidden_states, window_size): def window_reverse(windows, window_size, height, width): """ Args: - windows: (`torch.FloatTensor` of shape `(num_windows * batch_size, window_size, window_size, num_channels)`) + windows (`torch.FloatTensor` of shape `(num_windows * batch_size, window_size, window_size, 
num_channels)`): Input windows - window_size: (`int`) + window_size (`int`): Window size - height: (`int`) + height (`int`): Height of the resized audio - width: (`int`) + width (`int`): Width of the resized audio """ batch_size = int(windows.shape[0] / (height * width / window_size / window_size)) @@ -150,7 +150,8 @@ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_l # contrastive loss function, adapted from # https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/Clip.html def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: - return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) + labels = torch.arange(len(logits), device=logits.device) + return nn.functional.cross_entropy(logits, labels) # Copied from transformers.models.clip.modeling_clip.clip_loss with clip->clap, image->audio @@ -302,7 +303,7 @@ def forward(self, hidden_states): keep_prob = 1 - self.drop_prob # work with diff dim tensors, not just 2D ConvNets - shape = (hidden_states.shape[0],) + (1,) * (hidden_states.ndim - 1) + shape = (hidden_states.shape[0],) + (1,) * (hidden_states.ndim - 1) random_tensor = keep_prob + torch.rand(shape, dtype=hidden_states.dtype, device=hidden_states.device) random_tensor.floor_() # binarize @@ -399,6 +400,7 @@ def __init__(self, config: CLAPAudioConfig): def forward(self, hidden_states, is_longer_idx=None): if self.enable_fusion: + # retrieve the last mel as we have transposed the input global_hidden_states = hidden_states[:, 0:1, :, :] # global processing @@ -428,19 +430,16 @@ def forward(self, hidden_states, is_longer_idx=None): local_hidden_states = local_hidden_states.permute((0, 2, 3, 1, 4)).contiguous().flatten(3) output_batch_size, output_num_channels, output_height, _ = local_hidden_states.size() - if local_hidden_states.size(-1) < output_width: + local_width = local_hidden_states.size(-1) + if local_width < output_width: + padded_hidden_states = torch.zeros( + (output_batch_size, output_num_channels, output_height, output_width - local_width) + ).to(global_hidden_states.device) + local_hidden_states = torch.cat( [ local_hidden_states, - torch.zeros( - ( - output_batch_size, - output_num_channels, - output_height, - output_width - local_hidden_states.size(-1), - ), - device=global_hidden_states.device, - ), + padded_hidden_states, ], dim=-1, ) @@ -987,7 +986,6 @@ def forward( always_partition: Optional[bool] = False, return_dict: Optional[bool] = True, ) -> Union[Tuple, CLAPAudioModelOutput]: - input_features = input_features.transpose(1, 3) hidden_states = self.bn0(input_features) hidden_states = hidden_states.transpose(1, 3) diff --git a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py index f79f321830d9..9eb684890728 100644 --- a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py +++ b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py @@ -56,7 +56,6 @@ class ZeroShotAudioClassificationPipelineTests(unittest.TestCase, metaclass=Pipe def test_small_model_pt(self): pass - def test_small_model_tf(self): pass @@ -146,4 +145,4 @@ def test_large_model_pt(self): # output = audio_classifier([audio] * 5, candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"], batch_size=5) def test_large_model_tf(self): - pass \ No newline at end of file + pass From 9c83ff7a11fafb50f939d0f25974e10356fe149e Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 8 Feb 2023 18:17:00 
+0000 Subject: [PATCH 101/197] refactor --- src/transformers/models/clap/modeling_clap.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 2a31180bf26c..73ce0a2ad572 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -929,9 +929,10 @@ def __init__(self, config): self.avgpool = nn.AdaptiveAvgPool1d(1) - SF = config.spec_size // (2 ** (len(config.depths) - 1)) // self.patch_embed.patch_stride[0] // self.freq_ratio + division_factor = ((2 ** (len(config.depths) - 1)) * self.patch_embed.patch_stride[0] * self.freq_ratio) + kernel_size = config.spec_size // division_factor self.tscam_conv = nn.Conv2d( - in_channels=self.num_features, out_channels=config.num_classes, kernel_size=(SF, 3), padding=(0, 1) + in_channels=self.num_features, out_channels=config.num_classes, kernel_size=(kernel_size, 3), padding=(0, 1) ) self.head = nn.Linear(config.num_classes, config.num_classes) From b5b929a7b2c567b8b578b54c34fa5ec14b164e1d Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 8 Feb 2023 18:31:08 +0000 Subject: [PATCH 102/197] clean up --- src/transformers/models/clap/modeling_clap.py | 79 +++++++++---------- tests/models/clap/test_modeling_clap.py | 2 +- 2 files changed, 39 insertions(+), 42 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 73ce0a2ad572..af2a0386ff63 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -225,6 +225,8 @@ class CLAPAudioModelOutputWithProjection(ModelOutput): CLAPAudio model output to mimic the output of the original implementation. Args: + audio_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`): + The audio embeddings obtained by applying the projection layer to the pooler_output. framewise_output (`torch.FloatTensor` of shape `(batch_size, num_frames, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. 
clipwise_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): @@ -239,7 +241,7 @@ class CLAPAudioModelOutputWithProjection(ModelOutput): Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + """ - projection_output: Optional[torch.FloatTensor] = None + audio_embeds: Optional[torch.FloatTensor] = None framewise_output: torch.FloatTensor = None clipwise_output: torch.FloatTensor = None fine_grained_embedding: torch.FloatTensor = None @@ -929,10 +931,13 @@ def __init__(self, config): self.avgpool = nn.AdaptiveAvgPool1d(1) - division_factor = ((2 ** (len(config.depths) - 1)) * self.patch_embed.patch_stride[0] * self.freq_ratio) + division_factor = (2 ** (len(config.depths) - 1)) * self.patch_embed.patch_stride[0] * self.freq_ratio kernel_size = config.spec_size // division_factor self.tscam_conv = nn.Conv2d( - in_channels=self.num_features, out_channels=config.num_classes, kernel_size=(kernel_size, 3), padding=(0, 1) + in_channels=self.num_features, + out_channels=config.num_classes, + kernel_size=(kernel_size, 3), + padding=(0, 1), ) self.head = nn.Linear(config.num_classes, config.num_classes) @@ -955,8 +960,6 @@ def reshape_wav2img(self, hidden_states): hidden_states, (hidden_states.shape[2], target_F), mode="bicubic", align_corners=True ) - # hidden_states = hidden_states.contiguous().view(hidden_states.shape[0], hidden_states.shape[1], hidden_states.shape[-1] * self.freq_ratio, hidden_states.shape[2] // self.freq_ratio) - hidden_states = hidden_states.permute(0, 1, 3, 2).contiguous() hidden_states = hidden_states.reshape( hidden_states.shape[0], @@ -1092,8 +1095,6 @@ def custom_forward(*inputs): latent_output = self.avgpool(torch.flatten(hidden_states, 2)) latent_output = torch.flatten(latent_output, 1) - # display the attention map, if needed - hidden_states = self.tscam_conv(hidden_states) hidden_states = torch.flatten(hidden_states, 2) # B, C, T @@ -1174,8 +1175,8 @@ def custom_forward(*inputs): CLAP_AUDIO_INPUTS_DOCSTRING = r""" Args: input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using - [`AutoFeatureExtractor`]. See [`CLAPFeatureExtractor.__call__`] for details. + Input audio features. This should be returnes by the [`CLAPFeatureExtractor`] class that you can also + retrieve from [`AutoFeatureExtractor`]. See [`CLAPFeatureExtractor.__call__`] for details. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. @@ -1209,8 +1210,8 @@ def custom_forward(*inputs): [What are position IDs?](../glossary#position-ids) input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using - [`AutoFeatureExtractor`]. See [`CLAPFeatureExtractor.__call__`] for details. + Input audio features. This should be returnes by the [`CLAPFeatureExtractor`] class that you can also + retrieve from [`AutoFeatureExtractor`]. See [`CLAPFeatureExtractor.__call__`] for details. return_loss (`bool`, *optional*): Whether or not to return the contrastive loss. 
output_attentions (`bool`, *optional*): @@ -1898,20 +1899,19 @@ def forward( Examples: ```python - >>> import requests + >>> from datasets import load_dataset >>> from transformers import AutoProcessor, CLAPAudioModel - >>> model = CLAPAudioModel.from_pretrained("laionai/clap-hsat-tiny") - >>> processor = AutoProcessor.from_pretrained("laionai/clap-hsat-tiny") + >>> dataset = load_dataset("ashraq/esc50") + >>> audio_sample = dataset["train"]["audio"][0]["array"] - >>> url = "http://audios.cocodataset.org/val2017/000000039769.jpg" - >>> audio = Image.open(requests.get(url, stream=True).raw) + >>> model = CLAPAudioModel.from_pretrained("laionai/clap-hsat-fused") + >>> processor = AutoProcessor.from_pretrained("laionai/clap-hsat-fused") - >>> inputs = processor(audios=audio, return_tensors="pt") + >>> inputs = processor(audios=audio_sample, return_tensors="pt") >>> outputs = model(**inputs) - >>> last_hidden_state = outputs.last_hidden_state - >>> pooled_output = outputs.pooler_output # pooled CLS states + >>> last_hidden_state = outputs.embedding ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -2157,8 +2157,8 @@ def get_text_features( ```python >>> from transformers import AutoTokenizer, CLAPModel - >>> model = CLAPModel.from_pretrained("laion-ai/clap-htst-unfused-base") - >>> tokenizer = AutoTokenizer.from_pretrained("laion-ai/clap-htst-unfused-base") + >>> model = CLAPModel.from_pretrained("laion-ai/clap-htsat-unfused") + >>> tokenizer = AutoTokenizer.from_pretrained("laion-ai/clap-htsat-unfused") >>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt") >>> text_features = model.get_text_features(**inputs) @@ -2234,23 +2234,22 @@ def forward( Examples: ```python - >>> from PIL import Image - >>> import requests + >>> from dataset import load_dataset >>> from transformers import AutoProcessor, CLAPModel + >>> dataset = load_dataset("ashraq/esc50") + >>> audio_sample = dataset["train"]["audio"][0]["array"] + >>> model = CLAPModel.from_pretrained("laion-ai/clap-htst-unfused-base") >>> processor = AutoProcessor.from_pretrained("laion-ai/clap-htst-unfused-base") - >>> url = "http://audios.cocodataset.org/val2017/000000039769.jpg" - >>> # TODO audio here + >>> input_text = ["Sound of a dog", "Sound of vaccum cleaner"] - >>> inputs = processor( - ... text=["a photo of a cat", "a photo of a dog"], audios=audio, return_tensors="pt", padding=True - ... ) + >>> inputs = processor(text=input_text, audio=audio_sample, return_tensors="pt", padding=True) >>> outputs = model(**inputs) >>> logits_per_audio = outputs.logits_per_audio # this is the audio-text similarity score - >>> probs = logits_per_audio.softmax(dim=1) # we can take the softmax to get the label probabilities + >>> probs = logits_per_audio.softmax(dim=-1) # we can take the softmax to get the label probabilities ```""" # Use CLAP model's config for some fields (if specified) instead of those of audio & text components. 
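As a side note on the `return_loss` argument documented above, the sketch below mirrors the symmetric contrastive loss implemented by the `contrastive_loss` / `clap_loss` helpers added earlier in this file. The logit values are invented, and passing `logits_per_text` (rather than `logits_per_audio`) as the similarity matrix is an assumption made only for illustration.

```python
import torch
from torch import nn

# Invented 2 x 2 similarity matrix; matched text/audio pairs sit on the diagonal.
logits_per_text = torch.tensor([[3.0, 0.1], [0.2, 2.5]])
labels = torch.arange(len(logits_per_text))  # each row's positive pair is its own index

caption_loss = nn.functional.cross_entropy(logits_per_text, labels)
audio_loss = nn.functional.cross_entropy(logits_per_text.t(), labels)
loss = (caption_loss + audio_loss) / 2.0  # symmetric average, as in `clap_loss`
```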
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -2351,8 +2350,8 @@ def forward( ```python >>> from transformers import AutoTokenizer, CLAPTextModelWithProjection - >>> model = CLAPTextModelWithProjection.from_pretrained("laion-ai/clap-htst-unfused-base") - >>> tokenizer = AutoTokenizer.from_pretrained("laion-ai/clap-htst-unfused-base") + >>> model = CLAPTextModelWithProjection.from_pretrained("laion-ai/clap-htsat-unfused") + >>> tokenizer = AutoTokenizer.from_pretrained("laion-ai/clap-htsat-unfused") >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") @@ -2423,18 +2422,16 @@ def forward( Examples: ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, CLAPAudioModelWithProjection - - >>> model = CLAPAudioModelWithProjection.from_pretrained("openai/clip-vit-base-patch32") - >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") + >>> from datasets import load_dataset + >>> from transformers import CLAPAudioModelWithProjection, CLAPProcessor - >>> url = "http://audios.cocodataset.org/val2017/000000039769.jpg" - >>> audio = Image.open(requests.get(url, stream=True).raw) + >>> model = CLAPAudioModelWithProjection.from_pretrained("laion-ai/clap-htsat-unfused") + >>> processor = CLAPProcessor.from_pretrained("laion-ai/clap-htsat-unfused") - >>> inputs = processor(audios=audio, return_tensors="pt") + >>> dataset = load_dataset("ashraq/esc50") + >>> audio_sample = dataset["train"]["audio"][0]["array"] + >>> inputs = processor(audio=audio_sample, return_tensors="pt") >>> outputs = model(**inputs) >>> audio_embeds = outputs.audio_embeds ```""" @@ -2460,7 +2457,7 @@ def forward( return outputs return CLAPAudioModelOutputWithProjection( - projection_output=audio_embeds, + audio_embeds=audio_embeds, framewise_output=audio_outputs.framewise_output, clipwise_output=audio_outputs.clipwise_output, fine_grained_embedding=audio_outputs.fine_grained_embedding, diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index cfbc001605bf..4d83681a0911 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -170,7 +170,7 @@ def create_and_check_model_with_projection(self, config, input_features): model.eval() with torch.no_grad(): result = model(input_features) - self.parent.assertEqual(result.projection_output.shape, (self.batch_size, self.hidden_size)) + self.parent.assertEqual(result.audio_embeds.shape, (self.batch_size, self.hidden_size)) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() From 5e3aa5f6c1a8ba87db289b442d9a8e2fece5e0e0 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 9 Feb 2023 09:19:05 +0000 Subject: [PATCH 103/197] add integration tests --- tests/models/clap/test_modeling_clap.py | 66 +++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 4 deletions(-) diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 4d83681a0911..d2809afe7e0f 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -29,7 +29,6 @@ is_flax_available, is_pt_flax_cross_test, require_torch, - require_vision, slow, torch_device, ) @@ -53,6 +52,7 @@ CLAPAudioModel, CLAPAudioModelWithProjection, CLAPModel, + CLAPProcessor, CLAPTextModel, CLAPTextModelWithProjection, ) @@ -755,8 +755,66 @@ def prepare_img(): return 
im -@require_vision +@slow @require_torch class CLAPModelIntegrationTest(unittest.TestCase): - # TODO! - pass + paddings = ["repeatpad", "repeat", "pad"] + + def test_integration_unfused(self): + EXPECTED_MEANS_UNFUSED = { + "repeatpad": 0.0024, + "pad": 0.0020, + "repeat": 0.0023, + } + + from datasets import load_dataset + + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + audio_sample = librispeech_dummy[-1] + + model_id = "ybelkada/clap-htsat-unfused" + + model = CLAPModel.from_pretrained(model_id).to(torch_device) + processor = CLAPProcessor.from_pretrained(model_id) + + for padding in self.paddings: + inputs = processor(audios=audio_sample["audio"]["array"], return_tensors="pt", padding=padding).to( + torch_device + ) + + audio_embed = model.get_audio_features(**inputs) + expected_mean = EXPECTED_MEANS_UNFUSED[padding] + + self.assertTrue( + torch.allclose(audio_embed.cpu().mean(), torch.tensor([expected_mean]), atol=1e-3, rtol=1e-3) + ) + + def test_integration_fused(self): + EXPECTED_MEANS_FUSED = { + "repeatpad": 0.00069, + "repeat": 0.00196, + "pad": -0.000379, + } + + from datasets import load_dataset + + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + audio_sample = librispeech_dummy[-1] + + model_id = "ybelkada/clap-htsat-fused" + + model = CLAPModel.from_pretrained(model_id).to(torch_device) + processor = CLAPProcessor.from_pretrained(model_id) + + for padding in self.paddings: + inputs = processor( + audios=audio_sample["audio"]["array"], return_tensors="pt", padding=padding, truncation="fusion" + ).to(torch_device) + inputs["is_longer"] = torch.tensor([False]) + + audio_embed = model.get_audio_features(**inputs) + expected_mean = EXPECTED_MEANS_FUSED[padding] + + self.assertTrue( + torch.allclose(audio_embed.cpu().mean(), torch.tensor([expected_mean]), atol=1e-3, rtol=1e-3) + ) From 2ea06b8d8dabe4fdee188fb8e83506e770ff2587 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 9 Feb 2023 09:32:02 +0000 Subject: [PATCH 104/197] update config docstring --- .../models/clap/configuration_clap.py | 93 ++++++++++++++----- 1 file changed, 68 insertions(+), 25 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index d32c7a64365d..4a95f24a0bee 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -84,7 +84,7 @@ class CLAPTextConfig(PretrainedConfig): Examples: - ```python + ```python >>> from transformers import CLAPTextConfig, CLAPTextModel >>> # Initializing a RoBERTa configuration @@ -172,31 +172,66 @@ class CLAPAudioConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: - hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. - intermediate_size (`int`, *optional*, defaults to 3072): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each attention layer in the Transformer encoder. 
+ window_size (`int`, *optional*, defaults to 8): + [description] + mel_bins (`int`, *optional*, defaults to 64): + [description] + spec_size (`int`, *optional*, defaults to 256): + [description] + hidden_act (`str`, *optional*, defaults to `"gelu"`): + [description] + patch_size (`int`, *optional*, defaults to 4): + [description] + patch_stride (`list`, *optional*, defaults to `[4, 4]`): + [description] + num_classes (`int`, *optional*, defaults to 527): + [description] + hidden_size (`int`, *optional*, defaults to 96): + [description] + projection_hidden_size (`int`, *optional*, defaults to 768): + [description] + depths (`list`, *optional*, defaults to `[2, 2, 6, 2]`): + [description] + num_attention_heads (`list`, *optional*, defaults to `[4, 8, 16, 32]`): + [description] + enable_fusion (`bool`, *optional*, defaults to `False`): + [description] + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + [description] + fusion_type (`[type]`, *optional*): + [description] image_size (`int`, *optional*, defaults to 224): - The size (resolution) of each image. - patch_size (`int`, *optional*, defaults to 32): - The size (resolution) of each patch. - hidden_act (`str` or `function`, *optional*, defaults to `"relu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"relu"`, - `"relu"`, `"selu"` and `"relu_new"` ``"relu"` are supported. layer_norm_eps (`float`, *optional*, defaults - to 1e-5): The epsilon used by the layer normalization layers. - dropout (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - initializer_factor (`float`, *optional*, defaults to 1): - A factor for initializing all weight matrices (should be kept to 1, used internally for initialization - testing). + [description] + input_channels (`int`, *optional*, defaults to 3): + [description] + patch_embed_input_channels (`int`, *optional*, defaults to 1): + [description] + flatten_patch_embeds (`bool`, *optional*, defaults to `True`): + [description] + patch_embeds_hidden_size (`int`, *optional*, defaults to 96): + [description] + enable_patch_layer_norm (`bool`, *optional*, defaults to `True`): + [description] + drop_path_rate (`float`, *optional*, defaults to 0.0): + [description] + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + [description] + qkv_bias (`bool`, *optional*, defaults to `True`): + [description] + mlp_ratio (`float`, *optional*, defaults to 4.0): + [description] + aff_block_r (`int`, *optional*, defaults to 4): + [description] + num_hidden_layers (`int`, *optional*, defaults to 4): + [description] + enable_patch_fusion (`bool`, *optional*, defaults to `False`): + [description] + projection_hidden_act (`str`, *optional*, defaults to `"relu"`): + [description] + layer_norm_eps (`[type]`, *optional*, defaults to `1e-5`): + [description] + initializer_factor (`float`, *optional*, defaults to 1.0): + [description] Example: @@ -318,6 +353,14 @@ class CLAPConfig(PretrainedConfig): Dimentionality of text and vision projection layers. logit_scale_init_value (`float`, *optional*, defaults to 2.6592): The inital value of the *logit_scale* paramter. 
Default is used as per the original CLAP implementation. + fusion_num_hidden_layers (`int`, *optional*, defaults to 2): + [description] + projection_dim (`int`, *optional*, defaults to 512): + [description] + projection_hidden_act (`str`, *optional*, defaults to `"relu"`): + [description] + initializer_factor (`float`, *optional*, defaults to 1.0): + [description] kwargs (*optional*): Dictionary of keyword arguments. From bc20ff4e9c0c825a894e9362e5466ff465d59fe8 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 9 Feb 2023 09:32:14 +0000 Subject: [PATCH 105/197] style --- .../pipelines/test_pipelines_zero_shot_audio_classification.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py index f79f321830d9..9eb684890728 100644 --- a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py +++ b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py @@ -56,7 +56,6 @@ class ZeroShotAudioClassificationPipelineTests(unittest.TestCase, metaclass=Pipe def test_small_model_pt(self): pass - def test_small_model_tf(self): pass @@ -146,4 +145,4 @@ def test_large_model_pt(self): # output = audio_classifier([audio] * 5, candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"], batch_size=5) def test_large_model_tf(self): - pass \ No newline at end of file + pass From 312dd92f425ad94d6f46c146330359662b5391f9 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 9 Feb 2023 09:35:34 +0000 Subject: [PATCH 106/197] update processor --- src/transformers/models/clap/configuration_clap.py | 2 +- tests/models/clap/test_processor_clap.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 4a95f24a0bee..3193d4422603 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -84,7 +84,7 @@ class CLAPTextConfig(PretrainedConfig): Examples: - ```python + ```python >>> from transformers import CLAPTextConfig, CLAPTextModel >>> # Initializing a RoBERTa configuration diff --git a/tests/models/clap/test_processor_clap.py b/tests/models/clap/test_processor_clap.py index 97904632f496..0179f442ddc0 100644 --- a/tests/models/clap/test_processor_clap.py +++ b/tests/models/clap/test_processor_clap.py @@ -35,7 +35,7 @@ @require_sentencepiece class CLAPProcessorTest(unittest.TestCase): def setUp(self): - self.checkpoint = "laionai/clap-tiny-hsat" + self.checkpoint = "ybelkada/clap-htsat-unfused" self.tmpdirname = tempfile.mkdtemp() def get_tokenizer(self, **kwargs): From 3cd99bf48deb7147f75b16e7ae89075025bf778f Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 9 Feb 2023 10:18:32 +0000 Subject: [PATCH 107/197] fix processor test --- tests/models/clap/test_processor_clap.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/models/clap/test_processor_clap.py b/tests/models/clap/test_processor_clap.py index 0179f442ddc0..63f8cfbc15fd 100644 --- a/tests/models/clap/test_processor_clap.py +++ b/tests/models/clap/test_processor_clap.py @@ -16,7 +16,7 @@ import tempfile import unittest -from transformers import RobertaTokenizer +from transformers import RobertaTokenizer, RobertaTokenizerFast from transformers.testing_utils import require_sentencepiece, require_torchaudio from transformers.utils import is_torchvision_available @@ -57,7 
+57,7 @@ def test_save_load_pretrained_default(self): processor = CLAPProcessor.from_pretrained(self.tmpdirname) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) - self.assertIsInstance(processor.tokenizer, RobertaTokenizer) + self.assertIsInstance(processor.tokenizer, RobertaTokenizerFast) self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) self.assertIsInstance(processor.feature_extractor, CLAPFeatureExtractor) @@ -74,7 +74,7 @@ def test_save_load_pretrained_additional_features(self): ) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, RobertaTokenizer) + self.assertIsInstance(processor.tokenizer, RobertaTokenizerFast) self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) self.assertIsInstance(processor.feature_extractor, CLAPFeatureExtractor) @@ -88,7 +88,7 @@ def test_feature_extractor(self): raw_speech = floats_list((3, 1000)) input_feat_extract = feature_extractor(raw_speech, return_tensors="np") - input_processor = processor(raw_speech, return_tensors="np") + input_processor = processor(audios=raw_speech, return_tensors="np") for key in input_feat_extract.keys(): self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) @@ -128,7 +128,7 @@ def test_model_input_names(self): processor = CLAPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) self.assertListEqual( - processor.model_input_names, + processor.model_input_names[2:], feature_extractor.model_input_names, msg="`processor` and `feature_extractor` model input names do not match", ) From b6b1dd069aaba54da82cfc9d29a310951a83cdbf Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 9 Feb 2023 10:23:03 +0000 Subject: [PATCH 108/197] fix feat extractor tests --- .../clap/test_feature_extraction_clap.py | 50 +------------------ 1 file changed, 2 insertions(+), 48 deletions(-) diff --git a/tests/models/clap/test_feature_extraction_clap.py b/tests/models/clap/test_feature_extraction_clap.py index 08a9b9578155..86dc5848934b 100644 --- a/tests/models/clap/test_feature_extraction_clap.py +++ b/tests/models/clap/test_feature_extraction_clap.py @@ -15,15 +15,13 @@ import itertools -import os import random -import tempfile import unittest import numpy as np from transformers import is_speech_available -from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_torchaudio +from transformers.testing_utils import require_torch, require_torchaudio from transformers.utils.import_utils import is_torch_available from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin @@ -121,36 +119,6 @@ class CLAPFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Tes def setUp(self): self.feat_extract_tester = CLAPFeatureExtractionTester(self) - def test_feat_extract_from_and_save_pretrained(self): - feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) - - with tempfile.TemporaryDirectory() as tmpdirname: - saved_file = feat_extract_first.save_pretrained(tmpdirname)[0] - check_json_file_has_correct_format(saved_file) - feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname) - - dict_first = feat_extract_first.to_dict() - dict_second = feat_extract_second.to_dict() - mel_1 = dict_first.pop("mel_filters") - mel_2 = dict_second.pop("mel_filters") - 
self.assertTrue(np.allclose(mel_1, mel_2)) - self.assertEqual(dict_first, dict_second) - - def test_feat_extract_to_json_file(self): - feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) - - with tempfile.TemporaryDirectory() as tmpdirname: - json_file_path = os.path.join(tmpdirname, "feat_extract.json") - feat_extract_first.to_json_file(json_file_path) - feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path) - - dict_first = feat_extract_first.to_dict() - dict_second = feat_extract_second.to_dict() - mel_1 = dict_first.pop("mel_filters") - mel_2 = dict_second.pop("mel_filters") - self.assertTrue(np.allclose(mel_1, mel_2)) - self.assertEqual(dict_first, dict_second) - def test_call(self): # Tests that all call wrap to encode_plus and batch_encode_plus feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) @@ -160,9 +128,7 @@ def test_call(self): # Test feature size input_features = feature_extractor(np_speech_inputs, padding="max_length", return_tensors="np").input_features - self.assertTrue(input_features.ndim == 3) - self.assertTrue(input_features.shape[-1] == feature_extractor.nb_max_frames) - self.assertTrue(input_features.shape[-2] == feature_extractor.feature_size) + self.assertTrue(input_features.ndim == 4) # Test not batched input encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features @@ -175,18 +141,6 @@ def test_call(self): for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - # Test truncation required - speech_inputs = [floats_list((1, x))[0] for x in range(200, (feature_extractor.n_samples + 500), 200)] - np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] - - speech_inputs_truncated = [x[: feature_extractor.n_samples] for x in speech_inputs] - np_speech_inputs_truncated = [np.asarray(speech_input) for speech_input in speech_inputs_truncated] - - encoded_sequences_1 = feature_extractor(np_speech_inputs, return_tensors="np").input_features - encoded_sequences_2 = feature_extractor(np_speech_inputs_truncated, return_tensors="np").input_features - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - def test_double_precision_pad(self): import torch From 1ce1a251614fca885c967c14f3bffbdee24ec2cb Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 9 Feb 2023 10:26:56 +0000 Subject: [PATCH 109/197] update docs --- .../feature_extraction_sequence_utils.py | 22 ++--- .../models/clap/feature_extraction_clap.py | 81 ++++++++----------- .../zero_shot_audio_classification.py | 4 +- 3 files changed, 46 insertions(+), 61 deletions(-) diff --git a/src/transformers/feature_extraction_sequence_utils.py b/src/transformers/feature_extraction_sequence_utils.py index 8128f194aa72..478d1925f43c 100644 --- a/src/transformers/feature_extraction_sequence_utils.py +++ b/src/transformers/feature_extraction_sequence_utils.py @@ -375,13 +375,14 @@ def hz_to_mel(freq: float, mel_scale: str = "htk") -> float: """Convert Hz to Mels. Args: - freqs (float): + freqs (`float`): Frequencies in Hz - mel_scale (str, *optional*): - Scale to use: `htk` or `slaney`. (Default: `htk`) + mel_scale (`str`, *optional*, defaults to `"htk"`): + Scale to use, `htk` or `slaney`. 
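For reference, the `htk` conversion documented here follows the standard formula `mel = 2595 * log10(1 + freq / 700)`, while the `slaney` scale is linear below 1 kHz and logarithmic above it. Below is a minimal, self-contained sketch of the HTK variant and its inverse; it is illustrative only and not the library implementation:

```python
import numpy as np


def hz_to_mel_htk(freq: float) -> float:
    # standard HTK mel-scale conversion: 2595 * log10(1 + f / 700)
    return 2595.0 * np.log10(1.0 + freq / 700.0)


def mel_to_hz_htk(mels: float) -> float:
    # inverse of the HTK conversion
    return 700.0 * (10.0 ** (mels / 2595.0) - 1.0)


print(mel_to_hz_htk(hz_to_mel_htk(1000.0)))  # round-trips back to ~1000.0 Hz
```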
Returns: - mels (float): Frequency in Mels + mels (`float`): + Frequency in Mels """ if mel_scale not in ["slaney", "htk"]: @@ -411,13 +412,13 @@ def mel_to_hz(mels: np.array, mel_scale: str = "htk") -> np.array: """Convert mel bin numbers to frequencies. Args: - mels (np.array): + mels (`np.array`): Mel frequencies - mel_scale (str, *optional*, `"htk"`): + mel_scale (`str`, *optional*, `"htk"`): Scale to use: `htk` or `slaney`. Returns: - freqs (np.array): + freqs (`np.array`): Mels converted in Hz """ @@ -449,6 +450,8 @@ def create_triangular_filterbank( ) -> np.array: """Create a triangular filter bank. + #TODO this is the part that should be very well detailed + Args: all_freqs (`np.array`): STFT freq points of size (`n_freqs`). @@ -558,7 +561,7 @@ def get_mel_filter_banks( def _stft(self, frames, window): """ Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same - results as `torch.stft`. #TODO @Arthur batching this could alloz more usage, good first issue. + results as `torch.stft`. #TODO @Arthur batching this could allow more usage, good first issue. Args: frames (`np.array` of dimension `(num_frames, self.n_fft)`): @@ -624,7 +627,7 @@ def _fram_wave(self, waveform: np.array, center: bool = True): Return: framed_waveform (`np.array` of shape (waveform.shape // self.hop_length , self.n_fft)): - The framed waveforms that can be fed `np.fft`. + The framed waveforms that can be fed to `np.fft`. """ # TODO: test if stereo audio works??? frames = [] @@ -652,5 +655,4 @@ def _fram_wave(self, waveform: np.array, center: bool = True): frames.append(frame) frames = np.stack(frames, 0) - return frames diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index a673ad9f2b3c..dd9789bb454d 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -58,25 +58,24 @@ class CLAPFeatureExtractor(SequenceFeatureExtractor): return_attention_mask (`bool`, *optional*, False): Whether or not the model should return the attention masks coresponding to the input. frequency_min (`float`, *optional*, 0): - The lowest frequency of interest. The STFT TODO (not sure) will not be computed for values below this. + The lowest frequency of interest. The STFT will not be computed for values below this. frequency_max (`float`, *optional*, 14_000): - The highest frequency of interest. The STFT TODO (not sure) will not be computed for values above this. + The highest frequency of interest. The STFT will not be computed for values above this. top_db (`float`, *optional*): The highest decibel value used to convert the mel spectrogram to the log scale. For more details see the `SequenceFeatureExtractor._power_to_db` function truncation (`str`, *optional*, `"fusions"`): Truncation pattern for long audio inputs. Two patterns are available: - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and a - downsampled version of the entire mel spectrogram. These 4 spectrogram will have a dimension of - `n_fft, feature_size`. TODO check this + downsampled version of the entire mel spectrogram. If `config.fusion` is set to True, shorter audios also need to to return 4 mels, which will just be a copy of the original mel obtained from the padded audio. - `rand_trunc` will select a random crop of the mel spectrogram. 
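To make the interaction between these `truncation` modes and the `padding` strategies described next more concrete, here is a minimal usage sketch. The random waveform, the assumed 48 kHz sampling rate, and the default constructor arguments are placeholders for illustration, not values taken from a released checkpoint:

```python
import numpy as np
from transformers import CLAPFeatureExtractor

# placeholder: ~12 seconds of mono audio, assuming the default 48 kHz sampling rate
waveform = np.random.randn(12 * 48_000).astype(np.float32)

feature_extractor = CLAPFeatureExtractor()
inputs = feature_extractor(waveform, truncation="fusion", padding="repeatpad", return_tensors="np")
# with "fusion", inputs longer than the max length yield 3 random crops + 1 downsampled full mel
print(inputs["input_features"].shape)
```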
- padding (`str`, *optional*, `"repeatpad"`): + padding (`str`, *optional*): Padding pattern for shorter audio inputs. Three patterns were originaly implemented: - - `repeatpad`: - - `repeat`: - - `pad`: + - `repeatpad`: the audio is repeated, and then padded to fit the `max_length`. + - `repeat`: the audio is repeated and then cut to fit the `max_length` + - `pad`: the audio is padded. """ model_input_names = ["input_features", "is_longer"] @@ -151,8 +150,14 @@ def to_dict(self) -> Dict[str, Any]: def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[np.array] = None) -> np.ndarray: """ - Compute the log-Mel spectrogram of the provided audio using the `hanning` window. Two different banks of filters were used: - - self. + Compute the log-Mel spectrogram of the provided `waveform` using the `hanning` window. In CLAP, two different + banks of filters are used depending on the truncation pattern: + - `self.mel_filters`: they correspond to the defaults parameters of `torchaduio` which can be obtained from + calling `torchaudio.transforms.MelSpectrogram().mel_scale.fb`. These filters are used when `truncation` + is set to `fuison`. + - `self.mel_filteres_slanney` : they correspond to the defaults parameters of `torchlibrosa` which used + `librosa.filters.mel` when computing the mel spectrogram. These filters were only used in the original + implementation when the truncation mode is not `"fusion"`. """ window = np.hanning(self.n_fft + 1)[:-1] frames = self._fram_wave(waveform) @@ -164,29 +169,6 @@ def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[n log_mel_spec = np.asarray(log_mel_spec, np.float32) return log_mel_spec - @staticmethod - # Copied from transformers.models.wav2vec2.feature_extraction_wav2vec2.Wav2Vec2FeatureExtractor.zero_mean_unit_var_norm - def zero_mean_unit_var_norm( - input_values: List[np.ndarray], attention_mask: List[np.ndarray], padding_value: float = 0.0 - ) -> List[np.ndarray]: - """ - Every array in the list is normalized to have zero mean and unit variance - """ - if attention_mask is not None: - attention_mask = np.array(attention_mask, np.int32) - normed_input_values = [] - - for vector, length in zip(input_values, attention_mask.sum(-1)): - normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7) - if length < normed_slice.shape[0]: - normed_slice[length:] = padding_value - - normed_input_values.append(normed_slice) - else: - normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values] - - return normed_input_values - def _random_mel_fusion(self, mel, total_frames, chunk_frames): ranges = np.array_split(list(range(0, total_frames - chunk_frames + 1)), 3) if len(ranges[1]) == 0: @@ -196,29 +178,31 @@ def _random_mel_fusion(self, mel, total_frames, chunk_frames): # if the audio is too short, we just use the first chunk ranges[2] = [0] # randomly choose index for each part - idx_front = np.random.choice(ranges[0]) # 172 - idx_middle = np.random.choice(ranges[1]) # 508 - idx_back = np.random.choice(ranges[2]) # 1039 - # select mel + idx_front = np.random.choice(ranges[0]) + idx_middle = np.random.choice(ranges[1]) + idx_back = np.random.choice(ranges[2]) + mel_chunk_front = mel[idx_front : idx_front + chunk_frames, :] mel_chunk_middle = mel[idx_middle : idx_middle + chunk_frames, :] mel_chunk_back = mel[idx_back : idx_back + chunk_frames, :] - mel_shrink = np_bilinear_resize(mel, chunk_frames, self.feature_size) # current flags are probalby wrong + 
mel_shrink = np_bilinear_resize(mel, chunk_frames, self.feature_size) mel_fusion = np.stack([mel_chunk_front, mel_chunk_middle, mel_chunk_back, mel_shrink], axis=0) return mel_fusion def _get_input_mel(self, waveform: np.array, max_length, truncation, padding) -> np.array: """ - Possible cases : - - wave > max_length - - rand_trun - - fusion - - wave < max_length - - repeat - - fusion - - TODO the max length should be 10x the sampling rate of the provided audio. + Extracts the mel spectrogram and prepares it for the mode based on the `truncation` and `padding` arguments. + Four different path are possible: + - `truncation="fusion"` and the length of the waveform is greater than the max length: the mel spectrogram + will be computed on the entire audio. 3 random crops and a dowsampled version of the full mel spectrogram + are then stacked together. They will later be used for `feature_fusion`. + - `truncation="rand_trunc"` and the length of the waveform is smaller than the max length: the audio is + padded based on `padding`. + - `truncation="fusion"` and the length of the waveform is smaller than the max length: the audio is padded + based on `padding`, and is repeated `4` times. + - `truncation="rand_trunc"` and the length of the waveform is greater than the max length: the mel + spectrogram will be computed on a random crop of the waveform. """ if waveform.shape[0] > max_length: @@ -284,8 +268,7 @@ def __call__( truncation (`str`, *optional*): Truncation pattern for long audio inputs. Two patterns are available: - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and - a downsampled version of the entire mel spectrogram. These 4 spectrogram will have a dimension of - `n_fft, feature_size`. TODO check this + a downsampled version of the entire mel spectrogram. If `config.fusion` is set to True, shorter audios also need to to return 4 mels, which will just be a copy of the original mel obtained from the padded audio. - `rand_trunc` will select a random crop of the mel spectrogram. diff --git a/src/transformers/pipelines/zero_shot_audio_classification.py b/src/transformers/pipelines/zero_shot_audio_classification.py index 9a839ee36893..7da8766c69ca 100644 --- a/src/transformers/pipelines/zero_shot_audio_classification.py +++ b/src/transformers/pipelines/zero_shot_audio_classification.py @@ -80,7 +80,7 @@ def __call__( candidate_labels (`List[str]`): The candidate labels for this audio - hypothesis_template (`str`, *optional*, defaults to `"This is a photo of {}"`): + hypothesis_template (`str`, *optional*, defaults to `"This is a sound of {}"`): The sentence used in cunjunction with *candidate_labels* to attempt the audio classification by replacing the placeholder with the candidate_labels. 
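As an end-to-end illustration of this pipeline, here is a hedged sketch using the checkpoint referenced elsewhere in this PR's tests; it assumes the task is registered under the `"zero-shot-audio-classification"` alias, and the random waveform is only a placeholder for a real recording:

```python
import numpy as np
from transformers import pipeline

audio_classifier = pipeline(
    task="zero-shot-audio-classification", model="ybelkada/clap-htsat-unfused"
)

# placeholder waveform; in practice load a real recording (e.g. with `datasets` or `librosa`)
audio = np.random.randn(48_000).astype(np.float32)

outputs = audio_classifier(audio, candidate_labels=["Sound of a dog", "Sound of vacuum cleaner"])
# expected result: a list of {"score": float, "label": str} dicts, one per candidate label
print(outputs)
```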
Then likelihood is estimated by using logits_per_audio @@ -103,7 +103,7 @@ def _sanitize_parameters(self, **kwargs): return preprocess_params, {}, {} - def preprocess(self, audio, candidate_labels=None, hypothesis_template="This is a recording of {}."): + def preprocess(self, audio, candidate_labels=None, hypothesis_template="This is a sound of {}."): if isinstance(audio, str): if audio.startswith("http://") or audio.startswith("https://"): # We need to actually check for a real protocol, otherwise it's impossible to use a local file From d3b2d941410ad8c124a840539ab0900775a9cdcd Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 9 Feb 2023 12:00:11 +0100 Subject: [PATCH 110/197] Apply suggestions from code review Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- README.md | 1 - README_es.md | 1 - README_hd.md | 1 - README_ja.md | 1 - README_ko.md | 1 - README_zh-hans.md | 1 - README_zh-hant.md | 1 - 7 files changed, 7 deletions(-) diff --git a/README.md b/README.md index eee0c9bc3efe..775e50ccc6f7 100644 --- a/README.md +++ b/README.md @@ -295,7 +295,6 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . -1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. diff --git a/README_es.md b/README_es.md index d10f769e0638..5d32c70769f4 100644 --- a/README_es.md +++ b/README_es.md @@ -288,7 +288,6 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. 
Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . -1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. diff --git a/README_hd.md b/README_hd.md index 338710693c29..31e697cb26c2 100644 --- a/README_hd.md +++ b/README_hd.md @@ -260,7 +260,6 @@ conda install -c huggingface transformers 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google रिसर्च से) साथ में दिया गया पेपर [कैनाइन: प्री-ट्रेनिंग ए एफिशिएंट टोकनाइजेशन-फ्री एनकोडर फॉर लैंग्वेज रिप्रेजेंटेशन]( https://arxiv.org/abs/2103.06874) जोनाथन एच क्लार्क, डैन गैरेट, यूलिया टर्क, जॉन विएटिंग द्वारा। 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . -1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI से) साथ वाला पेपर [लर्निंग ट्रांसफरेबल विजुअल मॉडल फ्रॉम नेचुरल लैंग्वेज सुपरविजन](https://arxiv.org /abs/2103.00020) एलेक रैडफोर्ड, जोंग वूक किम, क्रिस हैलासी, आदित्य रमेश, गेब्रियल गोह, संध्या अग्रवाल, गिरीश शास्त्री, अमांडा एस्केल, पामेला मिश्किन, जैक क्लार्क, ग्रेचेन क्रुएगर, इल्या सुत्स्केवर द्वारा। 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. 
**[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (सेल्सफोर्स से) साथ में पेपर [प्रोग्राम सिंथेसिस के लिए एक संवादात्मक प्रतिमान](https://arxiv.org/abs/2203.13474) एरिक निजकैंप, बो पैंग, हिरोआकी हयाशी, लिफू तू, हुआन वांग, यिंगबो झोउ, सिल्वियो सावरेस, कैमिंग जिओंग रिलीज। diff --git a/README_ja.md b/README_ja.md index 564e96595510..6ddcffe11595 100644 --- a/README_ja.md +++ b/README_ja.md @@ -322,7 +322,6 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research から) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting から公開された研究論文: [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys から) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou から公開された研究論文: [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . -1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI から) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever から公開された研究論文: [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen から) Timo Lüddecke and Alexander Ecker から公開された研究論文: [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (Salesforce から) Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong から公開された研究論文: [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) diff --git a/README_ko.md b/README_ko.md index a219b90e4cb2..6075dfdb3c75 100644 --- a/README_ko.md +++ b/README_ko.md @@ -237,7 +237,6 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research 에서) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 의 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 논문과 함께 발표했습니다. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys 에서) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 의 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 논문과 함께 발표했습니다. 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . -1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. 
**[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI 에서) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 의 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 논문과 함께 발표했습니다. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen 에서) Timo Lüddecke and Alexander Ecker 의 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 논문과 함께 발표했습니다. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (Salesforce 에서) Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 의 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index d618688014dc..bdd7da05fc1a 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -261,7 +261,6 @@ conda install -c huggingface transformers 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (来自 OFA-Sys) 伴随论文 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 由 An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 发布。 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . -1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (来自 University of Göttingen) 伴随论文 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 由 Timo Lüddecke and Alexander Ecker 发布。 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 9f707a80e580..5a40b1656287 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -273,7 +273,6 @@ conda install -c huggingface transformers 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. 
**[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . -1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. From 7ed1747cbaf7bf7f6d15387e134cc48ec3c40440 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 9 Feb 2023 11:03:02 +0000 Subject: [PATCH 111/197] fix readmes --- README.md | 3 ++- README_es.md | 3 ++- README_hd.md | 3 ++- README_ja.md | 3 ++- README_ko.md | 3 ++- README_zh-hans.md | 3 ++- README_zh-hant.md | 3 ++- docs/source/en/index.mdx | 2 +- 8 files changed, 15 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 775e50ccc6f7..abe0cb728794 100644 --- a/README.md +++ b/README.md @@ -294,7 +294,8 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. -1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . +1. 
**[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. +1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. diff --git a/README_es.md b/README_es.md index 5d32c70769f4..4ea342470fd3 100644 --- a/README_es.md +++ b/README_es.md @@ -287,7 +287,8 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. -1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . +1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. +1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. 
**[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. diff --git a/README_hd.md b/README_hd.md index 31e697cb26c2..9721fff14aff 100644 --- a/README_hd.md +++ b/README_hd.md @@ -259,7 +259,8 @@ conda install -c huggingface transformers 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (इनरिया/फेसबुक/सोरबोन से) साथ में कागज [CamemBERT: एक टेस्टी फ्रेंच लैंग्वेज मॉडल](https:// arxiv.org/abs/1911.03894) लुई मार्टिन*, बेंजामिन मुलर*, पेड्रो जेवियर ऑर्टिज़ सुआरेज़*, योआन ड्यूपॉन्ट, लॉरेंट रोमरी, एरिक विलेमोन्टे डे ला क्लर्जरी, जैमे सेडाह और बेनोइट सगोट द्वारा। 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google रिसर्च से) साथ में दिया गया पेपर [कैनाइन: प्री-ट्रेनिंग ए एफिशिएंट टोकनाइजेशन-फ्री एनकोडर फॉर लैंग्वेज रिप्रेजेंटेशन]( https://arxiv.org/abs/2103.06874) जोनाथन एच क्लार्क, डैन गैरेट, यूलिया टर्क, जॉन विएटिंग द्वारा। 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. -1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . +1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI से) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. द्वाराअनुसंधान पत्र [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) के साथ जारी किया गया +1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI से) साथ वाला पेपर [लर्निंग ट्रांसफरेबल विजुअल मॉडल फ्रॉम नेचुरल लैंग्वेज सुपरविजन](https://arxiv.org /abs/2103.00020) एलेक रैडफोर्ड, जोंग वूक किम, क्रिस हैलासी, आदित्य रमेश, गेब्रियल गोह, संध्या अग्रवाल, गिरीश शास्त्री, अमांडा एस्केल, पामेला मिश्किन, जैक क्लार्क, ग्रेचेन क्रुएगर, इल्या सुत्स्केवर द्वारा। 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. 
**[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (सेल्सफोर्स से) साथ में पेपर [प्रोग्राम सिंथेसिस के लिए एक संवादात्मक प्रतिमान](https://arxiv.org/abs/2203.13474) एरिक निजकैंप, बो पैंग, हिरोआकी हयाशी, लिफू तू, हुआन वांग, यिंगबो झोउ, सिल्वियो सावरेस, कैमिंग जिओंग रिलीज। diff --git a/README_ja.md b/README_ja.md index 6ddcffe11595..cbe060de91c7 100644 --- a/README_ja.md +++ b/README_ja.md @@ -321,7 +321,8 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne から) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot から公開された研究論文: [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research から) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting から公開された研究論文: [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys から) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou から公開された研究論文: [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) -1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . +1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI から) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. から公開された研究論文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) +1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI から) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever から公開された研究論文: [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen から) Timo Lüddecke and Alexander Ecker から公開された研究論文: [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (Salesforce から) Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong から公開された研究論文: [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) diff --git a/README_ko.md b/README_ko.md index 6075dfdb3c75..949857d07177 100644 --- a/README_ko.md +++ b/README_ko.md @@ -236,7 +236,8 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne 에서) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 의 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 논문과 함께 발표했습니다. 1. 
**[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research 에서) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 의 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 논문과 함께 발표했습니다. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys 에서) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 의 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 논문과 함께 발표했습니다. -1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . +1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI 에서 제공)은 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.의 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687)논문과 함께 발표했습니다. +1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI 에서) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 의 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 논문과 함께 발표했습니다. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen 에서) Timo Lüddecke and Alexander Ecker 의 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 논문과 함께 발표했습니다. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (Salesforce 에서) Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 의 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index bdd7da05fc1a..752daf27f1b8 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -260,7 +260,8 @@ conda install -c huggingface transformers 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (来自 Inria/Facebook/Sorbonne) 伴随论文 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 由 Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 发布。 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (来自 OFA-Sys) 伴随论文 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 由 An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 发布。 -1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . +1. 
**[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (来自 LAION-AI) 伴随论文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) 由 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov 发布。 +1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (来自 University of Göttingen) 伴随论文 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 由 Timo Lüddecke and Alexander Ecker 发布。 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 5a40b1656287..20113ca06ebb 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -272,7 +272,8 @@ conda install -c huggingface transformers 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. -1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . +1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. +1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. 
**[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 6f6a46a911ce..ba4f1b224694 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -73,7 +73,7 @@ The documentation is organized into five sections: 1. **[CamemBERT](model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. -1. **[CLAP](model_doc/clap)** (from ) released with the paper []() by . +1. **[CLAP](model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[clap](model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. From 6b2a6dbe28ccf8b13b2132ee0faf920225a2bbd8 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 9 Feb 2023 11:04:00 +0000 Subject: [PATCH 112/197] fix tips --- docs/source/en/model_doc/clap.mdx | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docs/source/en/model_doc/clap.mdx b/docs/source/en/model_doc/clap.mdx index 9f2ced3f109e..7a756e3375ee 100644 --- a/docs/source/en/model_doc/clap.mdx +++ b/docs/source/en/model_doc/clap.mdx @@ -23,10 +23,6 @@ The abstract from the paper is the following: *Contrastive learning has shown remarkable success in the field of multimodal representation learning. In this paper, we propose a pipeline of contrastive language-audio pretraining to develop an audio representation by combining audio data with natural language descriptions. 
To accomplish this target, we first release LAION-Audio-630K, a large collection of 633,526 audio-text pairs from different data sources. Second, we construct a contrastive language-audio pretraining model by considering different audio encoders and text encoders. We incorporate the feature fusion mechanism and keyword-to-caption augmentation into the model design to further enable the model to process audio inputs of variable lengths and enhance the performance. Third, we perform comprehensive experiments to evaluate our model across three tasks: text-to-audio retrieval, zero-shot audio classification, and supervised audio classification. The results demonstrate that our model achieves superior performance in text-to-audio retrieval task. In audio classification tasks, the model achieves state-of-the-art performance in the zeroshot setting and is able to obtain performance comparable to models' results in the non-zero-shot setting. LAION-Audio-6* -Tips: - -- TODOS - This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) and [Arthur Zucker](https://huggingface.co/ArtZucker) . The original code can be found [here](https://github.com/LAION-AI/CLAP). From b6fa932afb8783f5c4aec534c3bc556a5f8e09ae Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Thu, 9 Feb 2023 15:04:49 +0100 Subject: [PATCH 113/197] Update src/transformers/models/auto/configuration_auto.py --- src/transformers/models/auto/configuration_auto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 027054ed716c..7cc636e16281 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -378,7 +378,7 @@ ("camembert", "CamemBERT"), ("canine", "CANINE"), ("chinese_clip", "Chinese-CLIP"), - ("clap", "clap"), + ("clap", "CLAP"), ("clip", "CLIP"), ("clipseg", "CLIPSeg"), ("codegen", "CodeGen"), From 62ffdf7b4873544b00ee81bd8e7c793fda04fce4 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 9 Feb 2023 14:08:40 +0000 Subject: [PATCH 114/197] update doc and remove todo -> properly explained --- .../feature_extraction_sequence_utils.py | 22 +++++------ src/transformers/image_transforms.py | 39 ++++++++++++------- .../models/clap/feature_extraction_clap.py | 4 +- 3 files changed, 38 insertions(+), 27 deletions(-) diff --git a/src/transformers/feature_extraction_sequence_utils.py b/src/transformers/feature_extraction_sequence_utils.py index 478d1925f43c..368c1777089b 100644 --- a/src/transformers/feature_extraction_sequence_utils.py +++ b/src/transformers/feature_extraction_sequence_utils.py @@ -450,7 +450,6 @@ def create_triangular_filterbank( ) -> np.array: """Create a triangular filter bank. - #TODO this is the part that should be very well detailed Args: all_freqs (`np.array`): @@ -495,15 +494,11 @@ def get_mel_filter_banks( Note: Different banks of MEL filters were introduced in the litterature. 
The following variation are supported: - - MFCC FB-20: introduced in 1980 by Davis and Mermelstein [4]; Davis and Mermelstein assume sampling - frequency of 10 kHz; speech bandwidth [0, 4600] Hz - - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) described in Young, 1995 [5]; Young uses a - filter bank of 24 filters for speech bandwidth [0, 8000] Hz (sampling rate ≥ 16 kHz) - - MFCC FB-40: from the Auditory Toolbox for MATLAB [6] written by Slaney in 1998; Slaney assumes - sampling rate of 16 kHz, and speech bandwidth [133, 6854] Hz - - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris, 2004 [3]; Skowronski and - Harris assume sampling rate of 12.5 kHz and speech bandwidth [0, 6250] Hz - + - MFCC FB-20: introduced in 1980 by Davis and Mermelstein, it assumes a sampling frequency of 10 kHz and a speech bandwidth of `[0, 4600]` Hz + - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) (1995) uses a filter bank of 24 filters for a speech bandwidth `[0, 8000]` Hz (sampling rate ≥ 16 kHz). + - MFCC FB-40: from the Auditory Toolbox for MATLAB written by Slaney in 1998, assumes a sampling rate of 16 kHz, and speech bandwidth [133, 6854] Hz. This version also includes an area normalization. + - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris (2004), assumes sampling rate of 12.5 kHz and speech bandwidth [0, 6250] Hz + The default parameters of `torchaudio`'s mel filterbanks implement the `"htk"` filers while `torchlibrosa` uses the `"slaney"` implementation. Args: n_freqs (`int`): @@ -598,6 +593,12 @@ def _power_to_db(self, mel_spectrogram, a_min=1e-10, ref=1.0): """ Convert a mel spectrogram from power to db scale, this function is the numpy implementation of librosa.power_to_lb. + + Note: + The motivation behind applying the log function on the mel spectrgram is that humans do not hear loudness on a linear scale. + Generally to double the percieved volume of a sound we need to put 8 times as much energy into it. This means that large variations + in energy may not sound all that different if the sound is loud to begin with. This compression operation makes the mel features match + more closely what humans actually hear. """ log_spec = 10 * np.log10(np.clip(mel_spectrogram, a_min=a_min, a_max=None)) log_spec -= 10.0 * np.log10(np.maximum(a_min, ref)) @@ -629,7 +630,6 @@ def _fram_wave(self, waveform: np.array, center: bool = True): framed_waveform (`np.array` of shape (waveform.shape // self.hop_length , self.n_fft)): The framed waveforms that can be fed to `np.fft`. """ - # TODO: test if stereo audio works??? frames = [] for i in range(0, waveform.shape[0] + 1, self.hop_length): half_window = (self.n_fft - 1) // 2 + 1 diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 50dcc4459970..9c1b087c7331 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -710,18 +710,29 @@ def convert_to_rgb(image: ImageInput) -> ImageInput: return image -def bilinear_interpolation(image, y, x): +def bilinear_interpolation(image: np.ndarray, y:float, x:float): + # fmt: off """ A bilinear interpolation of the estimated values of the `image` at non integer indexes `y` and `x`. 
- Original Image at Original Image at - x_1, y_1 x_1, y_2 - +---+ +---+ | +-|-------------------------------|-+ | +---+ +---+ - | | | Pixel at (x,y) where | | x and y non integers | | +---+ | | | | | | +---+ | - +---+ +---+ | +-|-------------------------------|-+ | +---+ +---+ + + Original Image at Original Image at + x_1, y_1 x_1, y_2 + +---+ +---+ + | +-|-------------------------------|-+ | + +---+ +---+ + | | + | Pixel at (x,y) where | + | x and y non integers | + | +---+ | + | | | | + | +---+ | + +---+ +---+ + | +-|-------------------------------|-+ | + +---+ +---+ - Original Image at Original Image at - x_1, y_2 x_2, y_2 + Original Image at Original Image at + x_1, y_2 x_2, y_2 The estimated value of the pixel is computed using the following equation : @@ -732,6 +743,7 @@ def bilinear_interpolation(image, y, x): For more details about bilinear interplation, see [on the wikipedia page](https://en.wikipedia.org/wiki/Bilinear_interpolation) """ + # fmt: on height = image.shape[0] width = image.shape[1] @@ -755,16 +767,15 @@ def bilinear_interpolation(image, y, x): return new_pixel -def np_bilinear_resize(image, new_height, new_width): +def np_bilinear_resize(image:np.ndarray, new_height:int, new_width:int): """ Taken from `[here](https://stackoverflow.com/questions/70024313/resize-using-bilinear-interpolation-in-python)` - with the torchvision.transforms.Resize(size=[chunk_frames, self.feature_size]) This function is not optimal in - terms of performances, but has the same results as the `torchvision.transforms.resize` function when called with + this is the equivalent of the `torchvision.transforms.Resize(size=[chunk_frames, self.feature_size])`. This function is not optimal in + terms of performances, but has the same results as `torchvision` counterpart when called with the default `bilinear` interpolation. """ - new_image = np.zeros( - (new_height, new_width), image.dtype - ) # new_image = [[0 for _ in range(new_width)] for _ in range(new_height)] + # new_image = [[0 for _ in range(new_width)] for _ in range(new_height)] + new_image = np.zeros((new_height, new_width), image.dtype) orig_height = image.shape[0] orig_width = image.shape[1] diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index dd9789bb454d..ddc4531993d3 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -52,7 +52,7 @@ class CLAPFeatureExtractor(SequenceFeatureExtractor): chunk_length_s (`int`, defaults to 10): The maximum input lenght of the model in seconds. This is used to pad the audio. n_fft (`int`, defaults to 400): - Size of the Fourier transform. TODO will properly explain this + Size of the Fourier transform. This should be the length of a single frame in samples. 400 means that the fourrier transform is computed on 400 samples. padding_value (`float`, *optional*, defaults to 0.0): Padding value used to pad the audio. Should correspond to silences. return_attention_mask (`bool`, *optional*, False): @@ -155,7 +155,7 @@ def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[n - `self.mel_filters`: they correspond to the defaults parameters of `torchaduio` which can be obtained from calling `torchaudio.transforms.MelSpectrogram().mel_scale.fb`. These filters are used when `truncation` is set to `fuison`. 
- - `self.mel_filteres_slanney` : they correspond to the defaults parameters of `torchlibrosa` which used + - `self.mel_filteres_slaney` : they correspond to the defaults parameters of `torchlibrosa` which used `librosa.filters.mel` when computing the mel spectrogram. These filters were only used in the original implementation when the truncation mode is not `"fusion"`. """ From 8868e3c1606a3066d3d4dd33d0be092ffa1b79ba Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 9 Feb 2023 14:12:11 +0000 Subject: [PATCH 115/197] fix idx and typo --- docs/source/en/index.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index ba4f1b224694..535a819f779a 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -259,7 +259,7 @@ Flax), PyTorch, and/or TensorFlow. | CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | | Chinese-CLIP | ❌ | ❌ | ✅ | ❌ | ❌ | -| clap | ❌ | ❌ | ✅ | ❌ | ❌ | +| CLAP | ❌ | ❌ | ✅ | ❌ | ❌ | | CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | | CLIPSeg | ❌ | ❌ | ✅ | ❌ | ❌ | | CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ | From 9b9bd4bfd5e1932ae036abb50f538708742e68e6 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 9 Feb 2023 14:12:22 +0000 Subject: [PATCH 116/197] typoe --- src/transformers/feature_extraction_sequence_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/feature_extraction_sequence_utils.py b/src/transformers/feature_extraction_sequence_utils.py index 368c1777089b..722eac056c0f 100644 --- a/src/transformers/feature_extraction_sequence_utils.py +++ b/src/transformers/feature_extraction_sequence_utils.py @@ -595,7 +595,7 @@ def _power_to_db(self, mel_spectrogram, a_min=1e-10, ref=1.0): librosa.power_to_lb. Note: - The motivation behind applying the log function on the mel spectrgram is that humans do not hear loudness on a linear scale. + The motivation behind applying the log function on the mel spectrogram is that humans do not hear loudness on a linear scale. Generally to double the percieved volume of a sound we need to put 8 times as much energy into it. This means that large variations in energy may not sound all that different if the sound is loud to begin with. This compression operation makes the mel features match more closely what humans actually hear. @@ -627,7 +627,7 @@ def _fram_wave(self, waveform: np.array, center: bool = True): waveform on the left and on the right. Return: - framed_waveform (`np.array` of shape (waveform.shape // self.hop_length , self.n_fft)): + framed_waveform (`np.array` of shape (`waveform.shape // self.hop_length , self.n_fft)`): The framed waveforms that can be fed to `np.fft`. """ frames = [] From 9460eaa361942f1dc1b42d823174e9542543e8b3 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 9 Feb 2023 14:52:22 +0000 Subject: [PATCH 117/197] cleanup config --- .../models/clap/configuration_clap.py | 56 +++++++++---------- 1 file changed, 25 insertions(+), 31 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 3193d4422603..5a733afc0d8a 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -164,8 +164,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], class CLAPAudioConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`CLAPAudioModel`]. 
It is used to instantiate a - CLAP vision encoder according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLAP + CLAP audio encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the audio encoder of the CLAP [laion-ai/base](https://huggingface.co/laion-ai/base) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the @@ -174,12 +174,15 @@ class CLAPAudioConfig(PretrainedConfig): Args: window_size (`int`, *optional*, defaults to 8): [description] - mel_bins (`int`, *optional*, defaults to 64): - [description] + num_mel_bins (`int`, *optional*, defaults to 64): + Number of mel features used per frames. Should correspond to the value used in the `CLAPProcessor` class. spec_size (`int`, *optional*, defaults to 256): - [description] + Desired input size of the spectrogram that the model supports. It can be different from the output of the + `CLAPFeatureExtractor`, in which case the input features will be resized. Corresponds to the `image_size` + of the audio models. hidden_act (`str`, *optional*, defaults to `"gelu"`): - [description] + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. patch_size (`int`, *optional*, defaults to 4): [description] patch_stride (`list`, *optional*, defaults to `[4, 4]`): @@ -195,15 +198,12 @@ class CLAPAudioConfig(PretrainedConfig): num_attention_heads (`list`, *optional*, defaults to `[4, 8, 16, 32]`): [description] enable_fusion (`bool`, *optional*, defaults to `False`): - [description] + Whether or not to enable patch fusion. This is the main contribution of the authors, and should give the + best results. Patch fusion will #TODO describe what it does hidden_dropout_prob (`float`, *optional*, defaults to 0.1): [description] fusion_type (`[type]`, *optional*): [description] - image_size (`int`, *optional*, defaults to 224): - [description] - input_channels (`int`, *optional*, defaults to 3): - [description] patch_embed_input_channels (`int`, *optional*, defaults to 1): [description] flatten_patch_embeds (`bool`, *optional*, defaults to `True`): @@ -224,14 +224,14 @@ class CLAPAudioConfig(PretrainedConfig): [description] num_hidden_layers (`int`, *optional*, defaults to 4): [description] - enable_patch_fusion (`bool`, *optional*, defaults to `False`): - [description] projection_hidden_act (`str`, *optional*, defaults to `"relu"`): - [description] + The non-linear activation function (function or string) in the projection layer. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. layer_norm_eps (`[type]`, *optional*, defaults to `1e-5`): - [description] + The epsilon used by the layer normalization layers. initializer_factor (`float`, *optional*, defaults to 1.0): - [description] + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). 
Example: @@ -253,7 +253,7 @@ class CLAPAudioConfig(PretrainedConfig): def __init__( self, window_size=8, - mel_bins=64, + num_mel_bins=64, spec_size=256, hidden_act="gelu", patch_size=4, @@ -266,8 +266,6 @@ def __init__( enable_fusion=False, hidden_dropout_prob=0.1, fusion_type=None, - image_size=224, - input_channels=3, patch_embed_input_channels=1, flatten_patch_embeds=True, patch_embeds_hidden_size=96, @@ -278,7 +276,6 @@ def __init__( mlp_ratio=4.0, aff_block_r=4, num_hidden_layers=4, - enable_patch_fusion=False, projection_hidden_act="relu", layer_norm_eps=1e-5, initializer_factor=1.0, @@ -286,7 +283,7 @@ def __init__( ): super().__init__(**kwargs) self.window_size = window_size - self.mel_bins = mel_bins + self.num_mel_bins = num_mel_bins self.spec_size = spec_size self.patch_size = patch_size self.patch_stride = patch_stride @@ -301,8 +298,6 @@ def __init__( self.hidden_act = hidden_act self.hidden_dropout_prob = hidden_dropout_prob self.projection_hidden_size = projection_hidden_size - self.image_size = image_size - self.input_channels = input_channels self.flatten_patch_embeds = flatten_patch_embeds self.patch_embeds_hidden_size = patch_embeds_hidden_size self.enable_patch_layer_norm = enable_patch_layer_norm @@ -312,7 +307,6 @@ def __init__( self.mlp_ratio = mlp_ratio self.patch_embed_input_channels = patch_embed_input_channels self.aff_block_r = aff_block_r - self.enable_patch_fusion = enable_patch_fusion self.layer_norm_eps = layer_norm_eps self.initializer_factor = initializer_factor self.projection_hidden_act = projection_hidden_act @@ -321,7 +315,7 @@ def __init__( def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - # get the vision config dict if we are loading from CLAPConfig + # get the audio config dict if we are loading from CLAPConfig if config_dict.get("model_type") == "clap": config_dict = config_dict["audio_config"] @@ -337,8 +331,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], class CLAPConfig(PretrainedConfig): r""" [`CLAPConfig`] is the configuration class to store the configuration of a [`CLAPModel`]. It is used to instantiate - a CLAP model according to the specified arguments, defining the text model and vision model configs. Instantiating - a configuration with the defaults will yield a similar configuration to that of the CLAP + a CLAP model according to the specified arguments, defining the text model and audio model configs. Instantiating a + configuration with the defaults will yield a similar configuration to that of the CLAP [laion-ai/base](https://huggingface.co/laion-ai/base) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the @@ -350,7 +344,7 @@ class CLAPConfig(PretrainedConfig): audio_config (`dict`, *optional*): Dictionary of configuration options used to initialize [`CLAPAudioConfig`]. projection_dim (`int`, *optional*, defaults to 512): - Dimentionality of text and vision projection layers. + Dimentionality of text and audio projection layers. logit_scale_init_value (`float`, *optional*, defaults to 2.6592): The inital value of the *logit_scale* paramter. Default is used as per the original CLAP implementation. 
fusion_num_hidden_layers (`int`, *optional*, defaults to 2): @@ -383,9 +377,9 @@ class CLAPConfig(PretrainedConfig): >>> # Initializing a CLAPText and CLAPAudioConfig configuration >>> config_text = CLAPTextConfig() - >>> config_vision = CLAPAudioConfig() + >>> config_audio = CLAPAudioConfig() - >>> config = CLAPConfig.from_text_audio_configs(config_text, config_vision) + >>> config = CLAPConfig.from_text_audio_configs(config_text, config_audio) ```""" model_type = "clap" @@ -443,7 +437,7 @@ def __init__( @classmethod def from_text_audio_configs(cls, text_config: CLAPTextConfig, audio_config: CLAPAudioConfig, **kwargs): r""" - Instantiate a [`CLAPConfig`] (or a derived class) from clap text model configuration and clap vision model + Instantiate a [`CLAPConfig`] (or a derived class) from clap text model configuration and clap audio model configuration. Returns: From 82f4ede09aa72bf2da3405a6292787f363a90c49 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 9 Feb 2023 14:52:56 +0000 Subject: [PATCH 118/197] cleanup tests, styles and doc --- .../feature_extraction_sequence_utils.py | 28 ++++--- .../models/clap/feature_extraction_clap.py | 3 +- src/transformers/models/clap/modeling_clap.py | 72 +++++++++-------- tests/models/clap/test_modeling_clap.py | 14 ++-- ...ipelines_zero_shot_audio_classification.py | 81 +------------------ 5 files changed, 70 insertions(+), 128 deletions(-) diff --git a/src/transformers/feature_extraction_sequence_utils.py b/src/transformers/feature_extraction_sequence_utils.py index 722eac056c0f..31306ef9e135 100644 --- a/src/transformers/feature_extraction_sequence_utils.py +++ b/src/transformers/feature_extraction_sequence_utils.py @@ -494,11 +494,16 @@ def get_mel_filter_banks( Note: Different banks of MEL filters were introduced in the litterature. The following variation are supported: - - MFCC FB-20: introduced in 1980 by Davis and Mermelstein, it assumes a sampling frequency of 10 kHz and a speech bandwidth of `[0, 4600]` Hz - - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) (1995) uses a filter bank of 24 filters for a speech bandwidth `[0, 8000]` Hz (sampling rate ≥ 16 kHz). - - MFCC FB-40: from the Auditory Toolbox for MATLAB written by Slaney in 1998, assumes a sampling rate of 16 kHz, and speech bandwidth [133, 6854] Hz. This version also includes an area normalization. - - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris (2004), assumes sampling rate of 12.5 kHz and speech bandwidth [0, 6250] Hz - The default parameters of `torchaudio`'s mel filterbanks implement the `"htk"` filers while `torchlibrosa` uses the `"slaney"` implementation. + - MFCC FB-20: introduced in 1980 by Davis and Mermelstein, it assumes a sampling frequency of 10 kHz + and a speech bandwidth of `[0, 4600]` Hz + - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) (1995) uses a filter bank of 24 filters for a + speech bandwidth `[0, 8000]` Hz (sampling rate ≥ 16 kHz). + - MFCC FB-40: from the Auditory Toolbox for MATLAB written by Slaney in 1998, assumes a sampling rate + of 16 kHz, and speech bandwidth [133, 6854] Hz. This version also includes an area normalization. + - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris (2004), assumes sampling + rate of 12.5 kHz and speech bandwidth [0, 6250] Hz + The default parameters of `torchaudio`'s mel filterbanks implement the `"htk"` filers while `torchlibrosa` + uses the `"slaney"` implementation. 
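The `"htk"` / `"slaney"` distinction mentioned above boils down to two different frequency-to-mel mappings. The sketch below is illustrative only: the constants are the standard HTK and Slaney conventions (as popularized by librosa), not values taken from this patch, so treat it as a reference for intuition rather than the exact `get_mel_filter_banks` code.

```python
import numpy as np


def hz_to_mel_htk(freq_hz):
    # HTK-style mel scale, matching torchaudio's default filter banks.
    return 2595.0 * np.log10(1.0 + np.asarray(freq_hz, dtype=np.float64) / 700.0)


def hz_to_mel_slaney(freq_hz):
    # Slaney-style mel scale (librosa's default, relied on by torchlibrosa):
    # linear below 1 kHz, logarithmic above.
    freq_hz = np.asarray(freq_hz, dtype=np.float64)
    f_sp = 200.0 / 3.0
    min_log_hz = 1000.0
    logstep = np.log(6.4) / 27.0
    linear = freq_hz / f_sp
    log_part = min_log_hz / f_sp + np.log(np.maximum(freq_hz, min_log_hz) / min_log_hz) / logstep
    return np.where(freq_hz >= min_log_hz, log_part, linear)


freqs = np.array([440.0, 1000.0, 8000.0])
print(hz_to_mel_htk(freqs))     # ~549.6 mel for 440 Hz
print(hz_to_mel_slaney(freqs))  # the two scales only agree roughly
```

Both scales are then turned into triangular filters over the FFT bins, which is why the feature extractor keeps the two filter-bank variants around side by side.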
Args: n_freqs (`int`): @@ -593,12 +598,13 @@ def _power_to_db(self, mel_spectrogram, a_min=1e-10, ref=1.0): """ Convert a mel spectrogram from power to db scale, this function is the numpy implementation of librosa.power_to_lb. - - Note: - The motivation behind applying the log function on the mel spectrogram is that humans do not hear loudness on a linear scale. - Generally to double the percieved volume of a sound we need to put 8 times as much energy into it. This means that large variations - in energy may not sound all that different if the sound is loud to begin with. This compression operation makes the mel features match - more closely what humans actually hear. + + Note: + The motivation behind applying the log function on the mel spectrogram is that humans do not hear loudness + on a linear scale. Generally to double the percieved volume of a sound we need to put 8 times as much + energy into it. This means that large variations in energy may not sound all that different if the sound is + loud to begin with. This compression operation makes the mel features match more closely what humans + actually hear. """ log_spec = 10 * np.log10(np.clip(mel_spectrogram, a_min=a_min, a_max=None)) log_spec -= 10.0 * np.log10(np.maximum(a_min, ref)) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index ddc4531993d3..d12207650761 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -52,7 +52,8 @@ class CLAPFeatureExtractor(SequenceFeatureExtractor): chunk_length_s (`int`, defaults to 10): The maximum input lenght of the model in seconds. This is used to pad the audio. n_fft (`int`, defaults to 400): - Size of the Fourier transform. This should be the length of a single frame in samples. 400 means that the fourrier transform is computed on 400 samples. + Size of the Fourier transform. This should be the length of a single frame in samples. 400 means that the + fourrier transform is computed on 400 samples. padding_value (`float`, *optional*, defaults to 0.0): Padding value used to pad the audio. Should correspond to silences. return_attention_mask (`bool`, *optional*, False): diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index af2a0386ff63..e0900c109730 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2023 The LAION-AI Team Authors and The HuggingFace Team. All rights reserved. +# Copyright 2023 The LAION-AI Team and The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
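The compression step that the `_power_to_db` note above motivates amounts to a couple of numpy calls. Below is a condensed sketch of the same conversion, using the two lines shown in the hunk (no clipping to a `top_db` ceiling, which the shown lines do not include):

```python
import numpy as np


def power_to_db(mel_spectrogram, a_min=1e-10, ref=1.0):
    # Clamp to a_min so silent bins never hit log(0), then express the power
    # relative to `ref` on a decibel scale (10 * log10(power / ref)).
    log_spec = 10.0 * np.log10(np.clip(mel_spectrogram, a_min=a_min, a_max=None))
    log_spec -= 10.0 * np.log10(np.maximum(a_min, ref))
    return log_spec


# Doubling the power only adds about 3 dB at each step, which is the
# perceptual compression the docstring is referring to.
print(power_to_db(np.array([1.0, 2.0, 4.0, 8.0])))  # [0.  3.01  6.02  9.03]
```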
@@ -373,13 +373,11 @@ def __init__(self, config: CLAPAudioConfig): self.num_patches = self.grid_size[0] * self.grid_size[1] self.flatten = config.flatten_patch_embeds - self.enable_patch_fusion = config.enable_patch_fusion self.enable_fusion = config.enable_fusion - self.fusion_type = config.fusion_type padding = ((patch_size[0] - patch_stride[0]) // 2, (patch_size[1] - patch_stride[1]) // 2) - scale_factor = 4 if (self.enable_fusion) and (self.fusion_type == "channel_map") else 1 + scale_factor = 4 if (self.enable_fusion) and (config.fusion_type == "channel_map") else 1 self.proj = nn.Conv2d( config.patch_embed_input_channels * scale_factor, @@ -390,7 +388,7 @@ def __init__(self, config: CLAPAudioConfig): ) self.norm = nn.LayerNorm(config.patch_embeds_hidden_size) if config.enable_patch_layer_norm else nn.Identity() - if self.enable_patch_fusion: + if self.enable_fusion: self.fusion_model = CLAPAudioAFFBlock(config) self.mel_conv2d = nn.Conv2d( config.patch_embed_input_channels, @@ -899,10 +897,10 @@ def __init__(self, config): grid_size = self.patch_embed.grid_size self.patch_stride = self.patch_embed.patch_stride self.spec_size = config.spec_size - self.freq_ratio = self.spec_size // config.mel_bins + self.freq_ratio = self.spec_size // config.num_mel_bins self.num_features = int(config.hidden_size * 2 ** (self.num_layers - 1)) - self.freq_ratio = config.spec_size // config.mel_bins + self.freq_ratio = config.spec_size // config.num_mel_bins dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))] @@ -925,7 +923,7 @@ def __init__(self, config): self.gradient_checkpointing = False - self.bn0 = nn.BatchNorm2d(config.mel_bins) + self.bn0 = nn.BatchNorm2d(config.num_mel_bins) self.norm = nn.LayerNorm(self.num_features) self.depths = config.depths @@ -941,43 +939,53 @@ def __init__(self, config): ) self.head = nn.Linear(config.num_classes, config.num_classes) - def reshape_wav2img(self, hidden_states): - _, _, time_steps, freq_steps = hidden_states.shape + def reshape_mel2img(self, normalixed_input_features): + """ + The input is 4 normalized log mel spectrograms. It is reshape to the common shape of images. Each channel + should represent 1 of the 4 crops of the spectrogram. For more details, refer to the `CLAPFeatureExtracor`. 
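To make the crop-folding in `reshape_mel2img` concrete, here is a standalone sketch of the same interpolate/permute/reshape chain on a dummy batch. It assumes the config defaults from this patch (`spec_size=256`, `num_mel_bins=64`, hence `freq_ratio = 4`); the point is the shapes, not the exact module.

```python
import torch
import torch.nn as nn

spec_size, freq_ratio = 256, 4  # defaults: freq_ratio = spec_size // num_mel_bins = 256 // 64
batch, channels, time_steps, num_mel_bins = 2, 1, 1001, 64

features = torch.randn(batch, channels, time_steps, num_mel_bins)

# Stretch the time axis up to spec_size * freq_ratio = 1024 frames.
features = nn.functional.interpolate(
    features, (spec_size * freq_ratio, num_mel_bins), mode="bicubic", align_corners=True
)

# Fold the long time axis into the frequency axis four frames at a time,
# producing the square "image" the Swin-style audio encoder expects.
features = features.permute(0, 1, 3, 2).contiguous()
features = features.reshape(batch, channels, num_mel_bins, freq_ratio, -1)
features = features.permute(0, 1, 3, 2, 4).contiguous()
features = features.reshape(batch, channels, num_mel_bins * freq_ratio, -1)

print(features.shape)  # torch.Size([2, 1, 256, 256])
```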
+ """ + _, _, time_steps, freq_steps = normalixed_input_features.shape target_T = int(self.spec_size * self.freq_ratio) target_F = self.spec_size // self.freq_ratio if time_steps > target_T or freq_steps > target_F: - raise ValueError("the wav size should less than or equal to the swin input size") + raise ValueError("the wav size should be less than or equal to the swin input size") # to avoid bicubic zero error if time_steps < target_T: - hidden_states = nn.functional.interpolate( - hidden_states, (target_T, hidden_states.shape[3]), mode="bicubic", align_corners=True + normalixed_input_features = nn.functional.interpolate( + normalixed_input_features, + (target_T, normalixed_input_features.shape[3]), + mode="bicubic", + align_corners=True, ) if freq_steps < target_F: - hidden_states = nn.functional.interpolate( - hidden_states, (hidden_states.shape[2], target_F), mode="bicubic", align_corners=True + normalixed_input_features = nn.functional.interpolate( + normalixed_input_features, + (normalixed_input_features.shape[2], target_F), + mode="bicubic", + align_corners=True, ) - hidden_states = hidden_states.permute(0, 1, 3, 2).contiguous() - hidden_states = hidden_states.reshape( - hidden_states.shape[0], - hidden_states.shape[1], - hidden_states.shape[2], + normalixed_input_features = normalixed_input_features.permute(0, 1, 3, 2).contiguous() + normalixed_input_features = normalixed_input_features.reshape( + normalixed_input_features.shape[0], + normalixed_input_features.shape[1], + normalixed_input_features.shape[2], self.freq_ratio, - hidden_states.shape[3] // self.freq_ratio, + normalixed_input_features.shape[3] // self.freq_ratio, ) - hidden_states = hidden_states.permute(0, 1, 3, 2, 4).contiguous() - hidden_states = hidden_states.reshape( - hidden_states.shape[0], - hidden_states.shape[1], - hidden_states.shape[2] * hidden_states.shape[3], - hidden_states.shape[4], + normalixed_input_features = normalixed_input_features.permute(0, 1, 3, 2, 4).contiguous() + normalixed_input_features = normalixed_input_features.reshape( + normalixed_input_features.shape[0], + normalixed_input_features.shape[1], + normalixed_input_features.shape[2] * normalixed_input_features.shape[3], + normalixed_input_features.shape[4], ) - return hidden_states + return normalixed_input_features def forward( self, @@ -991,15 +999,15 @@ def forward( return_dict: Optional[bool] = True, ) -> Union[Tuple, CLAPAudioModelOutput]: input_features = input_features.transpose(1, 3) - hidden_states = self.bn0(input_features) - hidden_states = hidden_states.transpose(1, 3) + normalixed_input_features = self.bn0(input_features) + normalixed_input_features = normalixed_input_features.transpose(1, 3) is_longer_list_idx = None if self.enable_fusion: is_longer_list = is_longer.to(input_features.device) is_longer_list_idx = torch.where(is_longer_list == 0)[0] - hidden_states = self.reshape_wav2img(hidden_states) + hidden_states = self.reshape_mel2img(normalixed_input_features) frames_num = hidden_states.shape[2] @@ -1013,7 +1021,7 @@ def forward( if output_hidden_states: batch_size, _, hidden_size = hidden_states.shape - # rearrange b (h w) c -> b c h w + # rearrange batch_size (height width) channels -> batch_size channel height width reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size) reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2) all_hidden_states += (hidden_states,) diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 
d2809afe7e0f..8015e279a124 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -78,7 +78,7 @@ def __init__( parent, batch_size=12, image_size=60, - mel_bins=16, + num_mel_bins=16, window_size=4, spec_size=64, patch_size=2, @@ -102,7 +102,7 @@ def __init__( self.parent = parent self.batch_size = batch_size self.image_size = image_size - self.mel_bins = mel_bins + self.num_mel_bins = num_mel_bins self.window_size = window_size self.patch_size = patch_size self.num_channels = num_channels @@ -125,7 +125,7 @@ def __init__( self.scope = scope def prepare_config_and_inputs(self): - input_features = floats_tensor([self.batch_size, 1, self.hidden_size, self.mel_bins]) + input_features = floats_tensor([self.batch_size, 1, self.hidden_size, self.num_mel_bins]) config = self.get_config() return config, input_features @@ -134,7 +134,7 @@ def get_config(self): return CLAPAudioConfig( image_size=self.image_size, patch_size=self.patch_size, - mel_bins=self.mel_bins, + num_mel_bins=self.num_mel_bins, window_size=self.window_size, num_channels=self.num_channels, hidden_size=self.hidden_size, @@ -199,7 +199,7 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - @unittest.skip(reason="CLAP does not use inputs_embeds") + @unittest.skip(reason="CLAPAudioModel does not use inputs_embeds") def test_inputs_embeds(self): pass @@ -245,7 +245,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) - @unittest.skip(reason="CLAPAudio does not output any loss term in the forward pass") + @unittest.skip(reason="CLAPAudioModel does not output any loss term in the forward pass") def test_retain_grad_hidden_states_attentions(self): pass @@ -269,9 +269,11 @@ def test_model_with_projection(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_with_projection(*config_and_inputs) + @unittest.skip(reason="CLAPAudioModel does not output any loss term in the forward pass") def test_training(self): pass + @unittest.skip(reason="CLAPAudioModel does not output any loss term in the forward pass") def test_training_gradient_checkpointing(self): pass diff --git a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py index 9eb684890728..5acd07ffab4f 100644 --- a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py +++ b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py @@ -1,4 +1,4 @@ -# Copyright 2021 The HuggingFace Team. All rights reserved. +# Copyright 2023 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ from datasets import load_dataset from transformers.pipelines import pipeline -from transformers.testing_utils import nested_simplify, require_torch +from transformers.testing_utils import nested_simplify, require_torch, slow from .test_pipelines_common import PipelineTestCaseMeta @@ -28,30 +28,6 @@ class ZeroShotAudioClassificationPipelineTests(unittest.TestCase, metaclass=Pipe # and only CLAP would be there for now. 
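For context, the pipeline exercised by these tests is meant to be used roughly as follows. This is a hedged sketch: the `"zero-shot-audio-classification"` task alias is inferred from the test file name, the checkpoint name comes from the archive list earlier in this series, and both may still change before release.

```python
import numpy as np

from transformers import pipeline

# Assumed task alias and checkpoint name; adjust to whatever finally ships.
audio_classifier = pipeline(
    task="zero-shot-audio-classification",
    model="laion-ai/clap-htsat-unfused",
)

# Any 1-D waveform works as input; one second of silence at an assumed 48 kHz
# sampling rate stands in for real audio here.
waveform = np.zeros(48_000, dtype=np.float32)

outputs = audio_classifier(waveform, candidate_labels=["dog barking", "vacuum cleaner"])
# Expected output format, mirroring the assertions in the commented-out test above:
# [{"score": ..., "label": "dog barking"}, {"score": ..., "label": "vacuum cleaner"}]
print(outputs)
```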
# model_mapping = {CLAPConfig: CLAPModel} - # def get_test_pipeline(self, model, tokenizer, processor): - # if tokenizer is None: - # # Side effect of no Fast Tokenizer class for these model, so skipping - # # But the slow tokenizer test should still run as they're quite small - # self.skipTest("No tokenizer available") - # return - # # return None, None - - # audio_classifier = ZeroShotAudioClassificationPipeline( - # model=model, tokenizer=tokenizer, feature_extractor=processor - # ) - - # # test with a raw waveform - # audio = Audio.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - # audio2 = Audio.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - # return audio_classifier, [audio, audio2] - - # def run_pipeline_test(self, pipe, examples): - # audio = Audio.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - # outputs = pipe(audio, candidate_labels=["A", "B"]) - # self.assertEqual(outputs, {"text": ANY(str)}) - - # # Batching - # outputs = pipe([audio] * 3, batch_size=2, candidate_labels=["A", "B"]) @require_torch def test_small_model_pt(self): pass @@ -59,58 +35,7 @@ def test_small_model_pt(self): def test_small_model_tf(self): pass - # @require_torch - # def test_small_model_pt(self): - # audio_classifier = pipeline( - # model="hf-internal-testing/tiny-random-clap-zero-shot-audio-classification", - # ) - # audio = Audio.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - # output = audio_classifier(audio, candidate_labels=["a", "b", "c"]) - - # self.assertEqual( - # nested_simplify(output), - # [{"score": 0.333, "label": "a"}, {"score": 0.333, "label": "b"}, {"score": 0.333, "label": "c"}], - # ) - - # output = audio_classifier([audio] * 5, candidate_labels=["A", "B", "C"], batch_size=2) - # self.assertEqual( - # nested_simplify(output), - # # Pipeline outputs are supposed to be deterministic and - # # So we could in theory have real values "A", "B", "C" instead - # # of ANY(str). - # # However it seems that in this particular case, the floating - # # scores are so close, we enter floating error approximation - # # and the order is not guaranteed anymore with batching. 
- # [ - # [ - # {"score": 0.333, "label": ANY(str)}, - # {"score": 0.333, "label": ANY(str)}, - # {"score": 0.333, "label": ANY(str)}, - # ], - # [ - # {"score": 0.333, "label": ANY(str)}, - # {"score": 0.333, "label": ANY(str)}, - # {"score": 0.333, "label": ANY(str)}, - # ], - # [ - # {"score": 0.333, "label": ANY(str)}, - # {"score": 0.333, "label": ANY(str)}, - # {"score": 0.333, "label": ANY(str)}, - # ], - # [ - # {"score": 0.333, "label": ANY(str)}, - # {"score": 0.333, "label": ANY(str)}, - # {"score": 0.333, "label": ANY(str)}, - # ], - # [ - # {"score": 0.333, "label": ANY(str)}, - # {"score": 0.333, "label": ANY(str)}, - # {"score": 0.333, "label": ANY(str)}, - # ], - # ], - # ) - - # @slow + @slow @require_torch def test_large_model_pt(self): audio_classifier = pipeline( From d90a9f289670b2564a4b583c74b3455327fc8d2a Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 9 Feb 2023 14:54:26 +0000 Subject: [PATCH 119/197] ignore docstyle on image transform --- src/transformers/image_transforms.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 9c1b087c7331..7269c31078af 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -710,8 +710,8 @@ def convert_to_rgb(image: ImageInput) -> ImageInput: return image -def bilinear_interpolation(image: np.ndarray, y:float, x:float): - # fmt: off +def bilinear_interpolation(image: np.ndarray, y: float, x: float): + # docstyle-ignore """ A bilinear interpolation of the estimated values of the `image` at non integer indexes `y` and `x`. @@ -743,7 +743,6 @@ def bilinear_interpolation(image: np.ndarray, y:float, x:float): For more details about bilinear interplation, see [on the wikipedia page](https://en.wikipedia.org/wiki/Bilinear_interpolation) """ - # fmt: on height = image.shape[0] width = image.shape[1] @@ -767,15 +766,15 @@ def bilinear_interpolation(image: np.ndarray, y:float, x:float): return new_pixel -def np_bilinear_resize(image:np.ndarray, new_height:int, new_width:int): +def np_bilinear_resize(image: np.ndarray, new_height: int, new_width: int): """ Taken from `[here](https://stackoverflow.com/questions/70024313/resize-using-bilinear-interpolation-in-python)` - this is the equivalent of the `torchvision.transforms.Resize(size=[chunk_frames, self.feature_size])`. This function is not optimal in - terms of performances, but has the same results as `torchvision` counterpart when called with - the default `bilinear` interpolation. + this is the equivalent of the `torchvision.transforms.Resize(size=[chunk_frames, self.feature_size])`. This + function is not optimal in terms of performances, but has the same results as `torchvision` counterpart when called + with the default `bilinear` interpolation. 
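Since `bilinear_interpolation` and `np_bilinear_resize` above are meant to reproduce `torchvision`'s default bilinear behaviour in plain numpy, a tiny worked example helps sanity-check the formula. This is a generic restatement of standard bilinear interpolation, not the patch's exact helper:

```python
import numpy as np


def bilinear_at(image, y, x):
    # Weighted average of the four integer-coordinate neighbours of (y, x).
    y0, x0 = int(np.floor(y)), int(np.floor(x))
    y1, x1 = min(y0 + 1, image.shape[0] - 1), min(x0 + 1, image.shape[1] - 1)
    dy, dx = y - y0, x - x0
    top = (1 - dx) * image[y0, x0] + dx * image[y0, x1]
    bottom = (1 - dx) * image[y1, x0] + dx * image[y1, x1]
    return (1 - dy) * top + dy * bottom


image = np.arange(16, dtype=np.float32).reshape(4, 4)
# Exactly between the four pixels 6, 7, 10 and 11 the result is their average, 8.5.
print(bilinear_at(image, 1.5, 2.5))
```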
""" # new_image = [[0 for _ in range(new_width)] for _ in range(new_height)] - new_image = np.zeros((new_height, new_width), image.dtype) + new_image = np.zeros((new_height, new_width), image.dtype) orig_height = image.shape[0] orig_width = image.shape[1] From ff25a450d83b08c4b25acdfa7e616d91ce412393 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 9 Feb 2023 15:41:42 +0000 Subject: [PATCH 120/197] add conversion script --- .../convert_clap_original_pytorch_to_hf.py | 168 ++++++++---------- src/transformers/models/clap/modeling_clap.py | 10 +- 2 files changed, 78 insertions(+), 100 deletions(-) diff --git a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py b/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py index b1be783c9814..258d3870eb73 100644 --- a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py +++ b/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py @@ -14,128 +14,101 @@ # limitations under the License. import argparse +import re import torch -from clap import load +from CLAP import create_model -from transformers import CLAPConfig, CLAPModel +from transformers import AutoFeatureExtractor, CLAPConfig, CLAPModel -def copy_attn_layer(hf_attn_layer, pt_attn_layer): - q_proj, k_proj, v_proj = pt_attn_layer.in_proj_weight.chunk(3, dim=0) - q_proj_bias, k_proj_bias, v_proj_bias = pt_attn_layer.in_proj_bias.chunk(3, dim=0) +KEYS_TO_MODIFY_MAPPING = { + "text_branch": "text_model", + "audio_branch": "audio_model.audio_encoder", + "attn": "attention.self", + "self.proj": "output.dense", + "attention.self_mask": "attn_mask", + "mlp.fc1": "intermediate.dense", + "mlp.fc2": "output.dense", + "norm1": "layernorm_before", + "norm2": "layernorm_after", + # "bn0": "batch_norm", +} - out_proj_weights = pt_attn_layer.out_proj.weight - out_proj_bias = pt_attn_layer.out_proj.bias +processor = AutoFeatureExtractor.from_pretrained("ArthurZ/clap", truncation="rand_trunc") - hf_attn_layer.q_proj.weight.data = q_proj - hf_attn_layer.q_proj.bias.data = q_proj_bias - hf_attn_layer.k_proj.weight.data = k_proj - hf_attn_layer.k_proj.bias.data = k_proj_bias +def init_clap(checkpoint_path, enable_fusion=False): + model, model_cfg = create_model( + "HTSAT-tiny", + "roberta", + checkpoint_path, + precision="fp32", + device="cuda:0" if torch.cuda.is_available() else "cpu", + enable_fusion=enable_fusion, + fusion_type="aff_2d" if enable_fusion else None, + ) + return model, model_cfg - hf_attn_layer.v_proj.weight.data = v_proj - hf_attn_layer.v_proj.bias.data = v_proj_bias - hf_attn_layer.out_proj.weight = out_proj_weights - hf_attn_layer.out_proj.bias = out_proj_bias +def rename_state_dict(state_dict): + model_state_dict = {} + sequential_layers_pattern = r".*sequential.(\d+).*" + text_projection_pattern = r".*_projection.(\d+).*" -def copy_mlp(hf_mlp, pt_mlp): - copy_linear(hf_mlp.fc1, pt_mlp.c_fc) - copy_linear(hf_mlp.fc2, pt_mlp.c_proj) + for key, value in state_dict.items(): + # check if any key needs to be modified + for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): + if key_to_modify in key: + key = key.replace(key_to_modify, new_key) + if re.match(sequential_layers_pattern, key): + # replace sequential layers with list + sequential_layer = re.match(sequential_layers_pattern, key).group(1) -def copy_linear(hf_linear, pt_linear): - hf_linear.weight = pt_linear.weight - hf_linear.bias = pt_linear.bias + key = key.replace(f"sequential.{sequential_layer}.", f"layers.{int(sequential_layer)//3}.linear.") + elif 
re.match(text_projection_pattern, key): + projecton_layer = int(re.match(text_projection_pattern, key).group(1)) + # Because in CLAP they use `nn.Sequential`... + transformers_projection_layer = 1 if projecton_layer == 0 else 2 -def copy_layer(hf_layer, pt_layer): - # copy layer norms - copy_linear(hf_layer.layer_norm1, pt_layer.ln_1) - copy_linear(hf_layer.layer_norm2, pt_layer.ln_2) + key = key.replace(f"_projection.{projecton_layer}.", f"_projection.linear{transformers_projection_layer}.") - # copy MLP - copy_mlp(hf_layer.mlp, pt_layer.mlp) + if "audio" and "qkv" in key: + # split qkv into query key and value + mixed_qkv = value + qkv_dim = mixed_qkv.size(0) // 3 - # copy attn - copy_attn_layer(hf_layer.self_attn, pt_layer.attn) + query_layer = mixed_qkv[:qkv_dim] + key_layer = mixed_qkv[qkv_dim : qkv_dim * 2] + value_layer = mixed_qkv[qkv_dim * 2 :] + model_state_dict[key.replace("qkv", "query")] = query_layer + model_state_dict[key.replace("qkv", "key")] = key_layer + model_state_dict[key.replace("qkv", "value")] = value_layer + else: + model_state_dict[key] = value -def copy_layers(hf_layers, pt_layers): - for hf_layer, pt_layer in zip(hf_layers, pt_layers): - copy_layer(hf_layer, pt_layer) + return model_state_dict -def copy_encoder(hf_encoder, pt_model): - # copy embeds - hf_encoder.embeddings.token_embedding.weight = pt_model.token_embedding.weight - hf_encoder.embeddings.position_embedding.weight.data = pt_model.positional_embedding +def convert_clap_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path, enable_fusion=False): + clap_model, clap_model_cfg = init_clap(checkpoint_path, enable_fusion=enable_fusion) - # copy layer norm - copy_linear(hf_encoder.final_layer_norm, pt_model.ln_final) + clap_model.eval() + state_dict = clap_model.state_dict() + state_dict = rename_state_dict(state_dict) - # copy hidden layers - copy_layers(hf_encoder.encoder.layers, pt_model.transformer.resblocks) + transformers_config = CLAPConfig() + transformers_config.audio_config.enable_fusion = enable_fusion + model = CLAPModel(transformers_config) + model.load_state_dict(state_dict, strict=False) -def copy_text_model_and_projection(hf_model, pt_model): - # copy projection - hf_model.text_projection.weight.data = pt_model.text_projection.data.T - - # copy text encoder - copy_encoder(hf_model.text_model, pt_model) - - -def copy_vison_model_and_projection(hf_model, pt_model): - # copy projection - hf_model.visual_projection.weight.data = pt_model.visual.proj.data.T - - # copy layer norms - copy_linear(hf_model.vision_model.pre_layrnorm, pt_model.visual.ln_pre) - copy_linear(hf_model.vision_model.post_layernorm, pt_model.visual.ln_post) - - # copy embeds - hf_model.vision_model.embeddings.patch_embedding.weight.data = pt_model.visual.conv1.weight.data - hf_model.vision_model.embeddings.class_embedding = pt_model.visual.class_embedding - hf_model.vision_model.embeddings.position_embedding.weight.data = pt_model.visual.positional_embedding.data - - # copy encoder - copy_layers(hf_model.vision_model.encoder.layers, pt_model.visual.transformer.resblocks) - - -@torch.no_grad() -def convert_clap_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None): - """ - Copy/paste/tweak model's weights to transformers design. 
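The qkv handling in `rename_state_dict` above is the one non-obvious remapping: the original checkpoint stores a single fused attention projection, which has to be sliced into separate query/key/value tensors. A minimal sketch of that slicing on a dummy weight, mirroring the indexing used in the script:

```python
import torch

hidden_size = 8
# Fused projection as stored in the original checkpoint: (3 * hidden_size, hidden_size).
mixed_qkv = torch.randn(3 * hidden_size, hidden_size)

# First third feeds the query projection, second third the key, last third the value.
qkv_dim = mixed_qkv.size(0) // 3
query_weight = mixed_qkv[:qkv_dim]
key_weight = mixed_qkv[qkv_dim : qkv_dim * 2]
value_weight = mixed_qkv[qkv_dim * 2 :]

assert query_weight.shape == key_weight.shape == value_weight.shape == (hidden_size, hidden_size)
```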
- """ - if config_path is not None: - config = CLAPConfig.from_pretrained(config_path) - else: - config = CLAPConfig(projection_dim=512, text_config={}, vision_config={}) - - hf_model = CLAPModel(config).eval() - - pt_model, _ = load(checkpoint_path, device="cpu", jit=False) - pt_model = pt_model.eval() - - copy_text_model_and_projection(hf_model, pt_model) - copy_vison_model_and_projection(hf_model, pt_model) - hf_model.logit_scale = pt_model.logit_scale - - input_ids = torch.arange(0, 77).unsqueeze(0) - pixel_values = torch.randn(1, 3, 224, 224) - - hf_logits_per_image, hf_logits_per_text = hf_model( - input_ids=input_ids, pixel_values=pixel_values, return_dict=True - )[1:3] - pt_logits_per_image, pt_logits_per_text = pt_model(pixel_values, input_ids) - - assert torch.allclose(hf_logits_per_image, pt_logits_per_image, atol=1e-3) - assert torch.allclose(hf_logits_per_text, pt_logits_per_text, atol=1e-3) - - hf_model.save_pretrained(pytorch_dump_folder_path) + model.save_pretrained(pytorch_dump_folder_path) + transformers_config.save_pretrained(pytorch_dump_folder_path) if __name__ == "__main__": @@ -143,6 +116,7 @@ def convert_clap_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_pa parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint") parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") + parser.add_argument("--enable_fusion", action="store_true", help="Whether to enable fusion or not") args = parser.parse_args() - convert_clap_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path) + convert_clap_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.enable_fusion) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index e0900c109730..653e7f381cef 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -43,10 +43,11 @@ logger = logging.get_logger(__name__) -_CHECKPOINT_FOR_DOC = "laion-ai/clap-htst-unfused-base" +_CHECKPOINT_FOR_DOC = "laion-ai/clap-htsat-fused" CLAP_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "laion-ai/clap-htst-unfused-base", + "laion-ai/clap-htsat-fused", + "laion-ai/clap-htsat-unfused", # See all clap models at https://huggingface.co/models?filter=clap ] @@ -354,7 +355,10 @@ def forward(self, hidden_states, residual): class CLAPAudioPatchEmbed(nn.Module): - """2D Image to Patch Embedding""" + """ + This module converts the hidden states reshaped as an image to patch embeddings ready to be passed to the + Transformer block. + """ def __init__(self, config: CLAPAudioConfig): super().__init__() From a6c3cc274b772bfa72d2b455ca4e23473566ecaa Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 9 Feb 2023 15:44:15 +0000 Subject: [PATCH 121/197] remove the `clap` indx in favor of `CLAP` --- README.md | 1 - README_es.md | 1 - README_hd.md | 1 - README_ja.md | 1 - README_ko.md | 1 - README_zh-hans.md | 1 - README_zh-hant.md | 1 - docs/source/en/index.mdx | 1 - 8 files changed, 8 deletions(-) diff --git a/README.md b/README.md index abe0cb728794..bf43005f0294 100644 --- a/README.md +++ b/README.md @@ -295,7 +295,6 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. 
**[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. -1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. diff --git a/README_es.md b/README_es.md index 4ea342470fd3..4cefc4a5a411 100644 --- a/README_es.md +++ b/README_es.md @@ -288,7 +288,6 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. -1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. 
**[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. diff --git a/README_hd.md b/README_hd.md index 9721fff14aff..7dcf200ae9a3 100644 --- a/README_hd.md +++ b/README_hd.md @@ -260,7 +260,6 @@ conda install -c huggingface transformers 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google रिसर्च से) साथ में दिया गया पेपर [कैनाइन: प्री-ट्रेनिंग ए एफिशिएंट टोकनाइजेशन-फ्री एनकोडर फॉर लैंग्वेज रिप्रेजेंटेशन]( https://arxiv.org/abs/2103.06874) जोनाथन एच क्लार्क, डैन गैरेट, यूलिया टर्क, जॉन विएटिंग द्वारा। 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI से) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. द्वाराअनुसंधान पत्र [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) के साथ जारी किया गया -1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI से) साथ वाला पेपर [लर्निंग ट्रांसफरेबल विजुअल मॉडल फ्रॉम नेचुरल लैंग्वेज सुपरविजन](https://arxiv.org /abs/2103.00020) एलेक रैडफोर्ड, जोंग वूक किम, क्रिस हैलासी, आदित्य रमेश, गेब्रियल गोह, संध्या अग्रवाल, गिरीश शास्त्री, अमांडा एस्केल, पामेला मिश्किन, जैक क्लार्क, ग्रेचेन क्रुएगर, इल्या सुत्स्केवर द्वारा। 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (सेल्सफोर्स से) साथ में पेपर [प्रोग्राम सिंथेसिस के लिए एक संवादात्मक प्रतिमान](https://arxiv.org/abs/2203.13474) एरिक निजकैंप, बो पैंग, हिरोआकी हयाशी, लिफू तू, हुआन वांग, यिंगबो झोउ, सिल्वियो सावरेस, कैमिंग जिओंग रिलीज। diff --git a/README_ja.md b/README_ja.md index cbe060de91c7..4fe47d6854fe 100644 --- a/README_ja.md +++ b/README_ja.md @@ -322,7 +322,6 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research から) Jonathan H. 
Clark, Dan Garrette, Iulia Turc, John Wieting から公開された研究論文: [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys から) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou から公開された研究論文: [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI から) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. から公開された研究論文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) -1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI から) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever から公開された研究論文: [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen から) Timo Lüddecke and Alexander Ecker から公開された研究論文: [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (Salesforce から) Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong から公開された研究論文: [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) diff --git a/README_ko.md b/README_ko.md index 949857d07177..6a724ab538d5 100644 --- a/README_ko.md +++ b/README_ko.md @@ -237,7 +237,6 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research 에서) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 의 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 논문과 함께 발표했습니다. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys 에서) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 의 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 논문과 함께 발표했습니다. 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI 에서 제공)은 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.의 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687)논문과 함께 발표했습니다. -1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI 에서) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 의 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 논문과 함께 발표했습니다. 1. 
**[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen 에서) Timo Lüddecke and Alexander Ecker 의 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 논문과 함께 발표했습니다. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (Salesforce 에서) Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 의 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index 752daf27f1b8..95117cb7f976 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -261,7 +261,6 @@ conda install -c huggingface transformers 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (来自 OFA-Sys) 伴随论文 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 由 An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 发布。 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (来自 LAION-AI) 伴随论文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) 由 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov 发布。 -1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (来自 University of Göttingen) 伴随论文 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 由 Timo Lüddecke and Alexander Ecker 发布。 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 20113ca06ebb..c6bef27f2150 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -273,7 +273,6 @@ conda install -c huggingface transformers 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. 1. 
**[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. -1. **[clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 535a819f779a..d810fe98dbda 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -74,7 +74,6 @@ The documentation is organized into five sections: 1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. 1. **[CLAP](model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. -1. **[clap](model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 
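The conversion entry point reworked at the top of this section now writes both the converted weights and the Transformers configuration into the same dump folder, and gains an `--enable_fusion` switch for the fused checkpoints. The sketch below illustrates, under stated assumptions, how such a dump folder could be reloaded with the classes this PR introduces; the local path is a placeholder, the class names follow the `CLAP*` spelling used at this point in the series, and loading a processor from the `laion-ai/clap-htsat-unfused` checkpoint is an assumption rather than something the conversion script sets up.

```python
# Illustrative only: assumes convert_clap_checkpoint(...) has already been run
# with --pytorch_dump_folder_path ./clap_converted (the path is a placeholder).
from transformers import CLAPConfig, CLAPModel, CLAPProcessor

dump_folder = "./clap_converted"

# The script saves these two pieces side by side:
#   model.save_pretrained(pytorch_dump_folder_path)
#   transformers_config.save_pretrained(pytorch_dump_folder_path)
config = CLAPConfig.from_pretrained(dump_folder)
model = CLAPModel.from_pretrained(dump_folder)

# The processor (feature extractor + tokenizer) is not produced by the
# conversion script; pulling it from a published checkpoint is an assumption.
processor = CLAPProcessor.from_pretrained("laion-ai/clap-htsat-unfused")
```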
From 01bb8707213778c77d94c3de179dc0445c36b64d Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 9 Feb 2023 15:51:19 +0000 Subject: [PATCH 122/197] update __init --- src/transformers/models/clap/__init__.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/transformers/models/clap/__init__.py b/src/transformers/models/clap/__init__.py index c60aed8857db..c536021636d7 100644 --- a/src/transformers/models/clap/__init__.py +++ b/src/transformers/models/clap/__init__.py @@ -1,7 +1,3 @@ -# flake8: noqa -# There's no way to ignore "F401 '...' imported but unused" warnings in this -# module, but to preserve other warnings. So, don't check this module at all. - # Copyright 2023 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -54,12 +50,10 @@ CLAP_PRETRAINED_CONFIG_ARCHIVE_MAP, CLAPAudioConfig, CLAPConfig, - CLAPOnnxConfig, CLAPTextConfig, ) from .feature_extraction_clap import CLAPFeatureExtractor from .processing_clap import CLAPProcessor - from .tokenization_clap import CLAPTokenizer try: if not is_torch_available(): From 1b2c5144d8370923ca800783d5b96ee76161682d Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 9 Feb 2023 16:13:47 +0000 Subject: [PATCH 123/197] nits --- src/transformers/models/clap/__init__.py | 18 +- .../zero_shot_audio_classification.py | 2 +- .../clap/test_feature_extraction_clap.py | 11 +- tests/models/clap/test_modeling_clap.py | 168 ++---------------- tests/models/clap/test_processor_clap.py | 11 +- utils/documentation_tests.txt | 2 + 6 files changed, 18 insertions(+), 194 deletions(-) diff --git a/src/transformers/models/clap/__init__.py b/src/transformers/models/clap/__init__.py index c536021636d7..40bd1de92782 100644 --- a/src/transformers/models/clap/__init__.py +++ b/src/transformers/models/clap/__init__.py @@ -46,14 +46,6 @@ ] if TYPE_CHECKING: - from .configuration_clap import ( - CLAP_PRETRAINED_CONFIG_ARCHIVE_MAP, - CLAPAudioConfig, - CLAPConfig, - CLAPTextConfig, - ) - from .feature_extraction_clap import CLAPFeatureExtractor - from .processing_clap import CLAPProcessor try: if not is_torch_available(): @@ -61,15 +53,7 @@ except OptionalDependencyNotAvailable: pass else: - from .modeling_clap import ( - CLAP_PRETRAINED_MODEL_ARCHIVE_LIST, - CLAPAudioModel, - CLAPAudioModelWithProjection, - CLAPModel, - CLAPPreTrainedModel, - CLAPTextModel, - CLAPTextModelWithProjection, - ) + pass else: import sys diff --git a/src/transformers/pipelines/zero_shot_audio_classification.py b/src/transformers/pipelines/zero_shot_audio_classification.py index 7da8766c69ca..4bd927f9346a 100644 --- a/src/transformers/pipelines/zero_shot_audio_classification.py +++ b/src/transformers/pipelines/zero_shot_audio_classification.py @@ -119,7 +119,7 @@ def preprocess(self, audio, candidate_labels=None, hypothesis_template="This is if not isinstance(audio, np.ndarray): raise ValueError("We expect a numpy ndarray as input") if len(audio.shape) != 1: - raise ValueError("We expect a single channel audio input for AutomaticSpeechRecognitionPipeline") + raise ValueError("We expect a single channel audio input for ZeroShotAudioClassificationPipeline") n = len(candidate_labels) for i, candidate_label in enumerate(candidate_labels): diff --git a/tests/models/clap/test_feature_extraction_clap.py b/tests/models/clap/test_feature_extraction_clap.py index 86dc5848934b..50f86d046e0a 100644 --- a/tests/models/clap/test_feature_extraction_clap.py +++ b/tests/models/clap/test_feature_extraction_clap.py @@ -1,5 +1,5 @@ # 
coding=utf-8 -# Copyright 2022 HuggingFace Inc. +# Copyright 2023 HuggingFace Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -20,16 +20,13 @@ import numpy as np -from transformers import is_speech_available +from transformers import CLAPFeatureExtractor from transformers.testing_utils import require_torch, require_torchaudio from transformers.utils.import_utils import is_torch_available from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin -if is_speech_available(): - from transformers import CLAPFeatureExtractor - if is_torch_available(): import torch @@ -114,7 +111,7 @@ def _flatten(list_of_lists): @require_torchaudio # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest with Whisper->CLAP class CLAPFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = CLAPFeatureExtractor if is_speech_available() else None + feature_extraction_class = CLAPFeatureExtractor def setUp(self): self.feat_extract_tester = CLAPFeatureExtractionTester(self) @@ -268,5 +265,3 @@ def integration_test_rand_trunc(self): input_speech, return_tensors="pt", truncation="rand_trunc", padding=padding ).input_features self.assertTrue(torch.allclose(input_features[0, 0, :30], EXPECTED_VALUES, atol=1e-4)) - - # TODO test fusion with a longer audio diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 8015e279a124..637c1a65592c 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -21,18 +21,11 @@ import unittest import numpy as np -import requests - -import transformers -from transformers import CLAPAudioConfig, CLAPConfig, CLAPTextConfig -from transformers.testing_utils import ( - is_flax_available, - is_pt_flax_cross_test, - require_torch, - slow, - torch_device, -) -from transformers.utils import is_torch_available, is_vision_available +from datasets import load_dataset + +from transformers import CLAPAudioConfig, CLAPConfig, CLAPProcessor, CLAPTextConfig +from transformers.testing_utils import require_torch, slow, torch_device +from transformers.utils import is_torch_available from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( @@ -52,26 +45,12 @@ CLAPAudioModel, CLAPAudioModelWithProjection, CLAPModel, - CLAPProcessor, CLAPTextModel, CLAPTextModelWithProjection, ) from transformers.models.clap.modeling_clap import CLAP_PRETRAINED_MODEL_ARCHIVE_LIST -if is_vision_available(): - from PIL import Image - - -if is_flax_available(): - import jax.numpy as jnp - - from transformers.modeling_flax_pytorch_utils import ( - convert_pytorch_state_dict_to_flax, - load_flax_weights_in_pytorch_model, - ) - - class CLAPAudioModelTester: def __init__( self, @@ -423,13 +402,15 @@ def test_model_with_projection(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_with_projection(*config_and_inputs) + @unittest.skip(reason="CLAPTextModel does not output any loss term in the forward pass") def test_training(self): pass + @unittest.skip(reason="CLAPTextModel does not output any loss term in the forward pass") def test_training_gradient_checkpointing(self): pass - @unittest.skip(reason="CLAP does not use inputs_embeds") + @unittest.skip(reason="CLAPTextModel does not use inputs_embeds") def test_inputs_embeds(self): pass @@ 
-468,8 +449,8 @@ def __init__(self, parent, text_kwargs=None, audio_kwargs=None, is_training=True self.is_training = is_training def prepare_config_and_inputs(self): - text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - audio_config, input_features = self.audio_model_tester.prepare_config_and_inputs() + _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + _, input_features = self.audio_model_tester.prepare_config_and_inputs() config = self.get_config() @@ -625,124 +606,6 @@ def test_load_audio_text_config(self): text_config = CLAPTextConfig.from_pretrained(tmp_dir_name) self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) - # overwrite from common since FlaxCLAPModel returns nested output - # which is not supported in the common test - @is_pt_flax_cross_test - def test_equivalence_pt_to_flax(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - with self.subTest(model_class.__name__): - # load PyTorch class - pt_model = model_class(config).eval() - # Flax models don't use the `use_cache` option and cache is not returned as a default. - # So we disable `use_cache` here for PyTorch model. - pt_model.config.use_cache = False - - fx_model_class_name = "Flax" + model_class.__name__ - - if not hasattr(transformers, fx_model_class_name): - return - - fx_model_class = getattr(transformers, fx_model_class_name) - - # load Flax class - fx_model = fx_model_class(config, dtype=jnp.float32) - # make sure only flax inputs are forward that actually exist in function args - fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys() - - # prepare inputs - pt_inputs = self._prepare_for_class(inputs_dict, model_class) - - # remove function args that don't exist in Flax - pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys} - - fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model) - fx_model.params = fx_state - - with torch.no_grad(): - pt_outputs = pt_model(**pt_inputs).to_tuple() - - # convert inputs to Flax - fx_inputs = {k: np.array(v) for k, v in pt_inputs.items() if torch.is_tensor(v)} - fx_outputs = fx_model(**fx_inputs).to_tuple() - self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch") - for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs[:4]): - self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) - - with tempfile.TemporaryDirectory() as tmpdirname: - pt_model.save_pretrained(tmpdirname) - fx_model_loaded = fx_model_class.from_pretrained(tmpdirname, from_pt=True) - - fx_outputs_loaded = fx_model_loaded(**fx_inputs).to_tuple() - self.assertEqual( - len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch" - ) - for fx_output_loaded, pt_output in zip(fx_outputs_loaded[:4], pt_outputs[:4]): - self.assert_almost_equals(fx_output_loaded, pt_output.numpy(), 4e-2) - - # overwrite from common since FlaxCLAPModel returns nested output - # which is not supported in the common test - @is_pt_flax_cross_test - def test_equivalence_flax_to_pt(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - with self.subTest(model_class.__name__): - # load corresponding PyTorch class - pt_model = model_class(config).eval() - - # So we disable `use_cache` here for PyTorch model. 
- pt_model.config.use_cache = False - - fx_model_class_name = "Flax" + model_class.__name__ - - if not hasattr(transformers, fx_model_class_name): - # no flax model exists for this class - return - - fx_model_class = getattr(transformers, fx_model_class_name) - - # load Flax class - fx_model = fx_model_class(config, dtype=jnp.float32) - # make sure only flax inputs are forward that actually exist in function args - fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys() - - pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params) - - # make sure weights are tied in PyTorch - pt_model.tie_weights() - - # prepare inputs - pt_inputs = self._prepare_for_class(inputs_dict, model_class) - - # remove function args that don't exist in Flax - pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys} - - with torch.no_grad(): - pt_outputs = pt_model(**pt_inputs).to_tuple() - - fx_inputs = {k: np.array(v) for k, v in pt_inputs.items() if torch.is_tensor(v)} - - fx_outputs = fx_model(**fx_inputs).to_tuple() - self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch") - - for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs[:4]): - self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) - - with tempfile.TemporaryDirectory() as tmpdirname: - fx_model.save_pretrained(tmpdirname) - pt_model_loaded = model_class.from_pretrained(tmpdirname, from_flax=True) - - with torch.no_grad(): - pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple() - - self.assertEqual( - len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch" - ) - for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs_loaded[:4]): - self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) - @slow def test_model_from_pretrained(self): for model_name in CLAP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: @@ -750,13 +613,6 @@ def test_model_from_pretrained(self): self.assertIsNotNone(model) -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - @slow @require_torch class CLAPModelIntegrationTest(unittest.TestCase): @@ -769,8 +625,6 @@ def test_integration_unfused(self): "repeat": 0.0023, } - from datasets import load_dataset - librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") audio_sample = librispeech_dummy[-1] @@ -798,8 +652,6 @@ def test_integration_fused(self): "pad": -0.000379, } - from datasets import load_dataset - librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") audio_sample = librispeech_dummy[-1] diff --git a/tests/models/clap/test_processor_clap.py b/tests/models/clap/test_processor_clap.py index 63f8cfbc15fd..5e035f846e1e 100644 --- a/tests/models/clap/test_processor_clap.py +++ b/tests/models/clap/test_processor_clap.py @@ -16,21 +16,12 @@ import tempfile import unittest -from transformers import RobertaTokenizer, RobertaTokenizerFast +from transformers import CLAPFeatureExtractor, CLAPProcessor, RobertaTokenizer, RobertaTokenizerFast from transformers.testing_utils import require_sentencepiece, require_torchaudio -from transformers.utils import is_torchvision_available from .test_feature_extraction_clap import floats_list -if is_torchvision_available(): - from transformers import CLAPFeatureExtractor, CLAPProcessor - - -TRANSCRIBE = 50358 
-NOTIMESTAMPS = 50362 - - @require_torchaudio @require_sentencepiece class CLAPProcessorTest(unittest.TestCase): diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt index 40349c10aaf0..962ac403aea7 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -40,6 +40,8 @@ src/transformers/models/bloom/configuration_bloom.py src/transformers/models/camembert/configuration_camembert.py src/transformers/models/canine/configuration_canine.py src/transformers/models/canine/modeling_canine.py +src/transformers/models/clap/configuration_clap.py +src/transformers/models/clap/modeling_clap.py src/transformers/models/clip/configuration_clip.py src/transformers/models/clipseg/modeling_clipseg.py src/transformers/models/codegen/configuration_codegen.py From c805bae96a15d92231bec4335868d01403fce6e0 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Thu, 9 Feb 2023 17:19:46 +0100 Subject: [PATCH 124/197] Update src/transformers/pipelines/__init__.py --- src/transformers/pipelines/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index 1a6002f27bd4..27b18b007d79 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -311,7 +311,6 @@ "default": { "model": { "pt": ("laion-ai/clap-hsat-tiny", "f4881ba"), - "tf": ("laion-ai/clip-hsat-tiny", "f4881ba"), } }, "type": "multimodal", From 640f6f909bcfaf41df22b1fdff46337ff062ba7a Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 9 Feb 2023 16:28:28 +0000 Subject: [PATCH 125/197] fix bug --- src/transformers/models/clap/modeling_clap.py | 4 ++-- tests/models/clap/test_modeling_clap.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 653e7f381cef..8a0890d832a4 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -519,7 +519,7 @@ def forward( head_mask: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = False, ) -> Tuple[torch.Tensor]: - batch_size, dim, num_channels = hidden_states.shape + batch_size, dim, _ = hidden_states.shape mixed_query_layer = self.query(hidden_states) key_layer = self.transpose_for_scores(self.key(hidden_states)) @@ -1009,7 +1009,7 @@ def forward( is_longer_list_idx = None if self.enable_fusion: is_longer_list = is_longer.to(input_features.device) - is_longer_list_idx = torch.where(is_longer_list == 0)[0] + is_longer_list_idx = torch.where(is_longer_list == 1)[0] hidden_states = self.reshape_mel2img(normalixed_input_features) diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 8015e279a124..f250a8748a09 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -812,7 +812,6 @@ def test_integration_fused(self): inputs = processor( audios=audio_sample["audio"]["array"], return_tensors="pt", padding=padding, truncation="fusion" ).to(torch_device) - inputs["is_longer"] = torch.tensor([False]) audio_embed = model.get_audio_features(**inputs) expected_mean = EXPECTED_MEANS_FUSED[padding] From 177a7050938dbf85bd5b85ffb962d168e8908d0b Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 9 Feb 2023 16:40:53 +0000 Subject: [PATCH 126/197] clarifiy config --- src/transformers/models/clap/__init__.py | 1 - 
.../pipelines/zero_shot_audio_classification.py | 16 +++++++--------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/clap/__init__.py b/src/transformers/models/clap/__init__.py index 40bd1de92782..673e2f2f0cac 100644 --- a/src/transformers/models/clap/__init__.py +++ b/src/transformers/models/clap/__init__.py @@ -46,7 +46,6 @@ ] if TYPE_CHECKING: - try: if not is_torch_available(): raise OptionalDependencyNotAvailable() diff --git a/src/transformers/pipelines/zero_shot_audio_classification.py b/src/transformers/pipelines/zero_shot_audio_classification.py index 4bd927f9346a..072606acf1fd 100644 --- a/src/transformers/pipelines/zero_shot_audio_classification.py +++ b/src/transformers/pipelines/zero_shot_audio_classification.py @@ -29,19 +29,17 @@ class ZeroShotAudioClassificationPipeline(ChunkPipeline): ```python >>> from transformers import pipeline + >>> from datasets import load_dataset - >>> classifier = pipeline(model="laion-ai/clap-hsat-tiny") - >>> classifier( - ... "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac", - ... candidate_labels=["animals", "humans", "landscape"], - ... ) - [{'score': 0.965, 'label': 'animals'}, {'score': 0.03, 'label': 'humans'}, {'score': 0.005, 'label': 'landscape'}] + >>> dataset = load_dataset("ashraq/esc50") + >>> audio = next(iter(dataset["train"]["audio"]))["array"] + >>> classifier = pipeline(task="zero-shot-audio-classification", model="laion-ai/clap-hsat-tiny") >>> classifier( - ... "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac", - ... candidate_labels=["black and white", "photorealist", "painting"], + ... audio, + ... candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"], ... ) - [{'score': 0.996, 'label': 'black and white'}, {'score': 0.003, 'label': 'photorealist'}, {'score': 0.0, 'label': 'painting'}] + [{'score': 0.999727189540863, 'label': 'Sound of a dog'}, {'score': 0.0002727957325987518, 'label': 'Sound of vaccum cleaner'}] ``` Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) From 57fd58c0382f14732c90181c03d221c007a7ec6b Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 9 Feb 2023 16:44:19 +0000 Subject: [PATCH 127/197] fix copy --- src/transformers/models/clap/modeling_clap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 8a0890d832a4..1c9e31dcceee 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -519,7 +519,7 @@ def forward( head_mask: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = False, ) -> Tuple[torch.Tensor]: - batch_size, dim, _ = hidden_states.shape + batch_size, dim, num_channels = hidden_states.shape mixed_query_layer = self.query(hidden_states) key_layer = self.transpose_for_scores(self.key(hidden_states)) From 0874aba7a5c1fda3237a3361d113a9f555e53130 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 9 Feb 2023 16:51:06 +0000 Subject: [PATCH 128/197] fix init --- src/transformers/models/clap/__init__.py | 25 ++++++++++++++++++++++-- src/transformers/pipelines/__init__.py | 3 ++- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/clap/__init__.py b/src/transformers/models/clap/__init__.py index 673e2f2f0cac..36a3ec94237f 100644 --- a/src/transformers/models/clap/__init__.py +++ b/src/transformers/models/clap/__init__.py @@ -1,3 +1,7 @@ 
+# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + # Copyright 2023 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -21,7 +25,6 @@ "CLAP_PRETRAINED_CONFIG_ARCHIVE_MAP", "CLAPAudioConfig", "CLAPConfig", - "CLAPOnnxConfig", "CLAPTextConfig", ], "feature_extraction_clap": ["CLAPFeatureExtractor"], @@ -46,13 +49,31 @@ ] if TYPE_CHECKING: + from .configuration_clap import ( + CLAP_PRETRAINED_CONFIG_ARCHIVE_MAP, + CLAPAudioConfig, + CLAPConfig, + CLAPTextConfig, + ) + from .feature_extraction_clap import CLAPFeatureExtractor + from .processing_clap import CLAPProcessor + from .tokenization_clap import CLAPTokenizer + try: if not is_torch_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: pass else: - pass + from .modeling_clap import ( + CLAP_PRETRAINED_MODEL_ARCHIVE_LIST, + CLAPAudioModel, + CLAPAudioModelWithProjection, + CLAPModel, + CLAPPreTrainedModel, + CLAPTextModel, + CLAPTextModelWithProjection, + ) else: import sys diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index 23cf6da2da06..b91ad6c2a3b7 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -78,10 +78,11 @@ ) from .video_classification import VideoClassificationPipeline from .visual_question_answering import VisualQuestionAnsweringPipeline +from .zero_shot_audio_classification import ZeroShotAudioClassificationPipeline from .zero_shot_classification import ZeroShotClassificationArgumentHandler, ZeroShotClassificationPipeline from .zero_shot_image_classification import ZeroShotImageClassificationPipeline from .zero_shot_object_detection import ZeroShotObjectDetectionPipeline -from .zero_shot_audio_classification import ZeroShotAudioClassificationPipeline + if is_tf_available(): import tensorflow as tf From 31d3204130ed399e95afd761ddfce3125055a526 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Fri, 10 Feb 2023 09:51:55 +0100 Subject: [PATCH 129/197] Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/feature_extraction_sequence_utils.py | 2 +- src/transformers/models/clap/configuration_clap.py | 2 +- src/transformers/models/clap/modeling_clap.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/feature_extraction_sequence_utils.py b/src/transformers/feature_extraction_sequence_utils.py index 31306ef9e135..3e52f56326ed 100644 --- a/src/transformers/feature_extraction_sequence_utils.py +++ b/src/transformers/feature_extraction_sequence_utils.py @@ -522,7 +522,7 @@ def get_mel_filter_banks( Scale to use: `htk` or `slaney`. (Default: `htk`) Returns: - Tensor: Triangular filter banks (fb matrix) of size (`n_freqs`, `n_mels`) meaning number of frequencies to + `numpy.ndarray`: Triangular filter banks (fb matrix) of size (`n_freqs`, `n_mels`) meaning number of frequencies to highlight/apply to x the number of filterbanks. Each column is a filterbank so that assuming there is a matrix A of size (..., `n_freqs`), the applied result would be `A * melscale_fbanks(A.size(-1), ...)`. 
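The `get_mel_filter_banks` docstring adjusted just above only states the return contract: a (`n_freqs`, `n_mels`) filter-bank matrix whose columns are triangular filters, applied to any array of shape (..., `n_freqs`) by a matrix product. A minimal NumPy sketch of that shape convention follows; the sizes and the random stand-in values are assumptions for illustration, not taken from the patch.

```python
import numpy as np

# Stand-in for the matrix returned by get_mel_filter_banks: shape (n_freqs, n_mels).
# Random values are used here purely to exercise the shapes; the real return
# value contains triangular mel filters.
n_freqs, n_mels = 201, 64  # illustrative sizes
filter_banks = np.random.rand(n_freqs, n_mels).astype(np.float32)

# A power spectrogram with shape (..., n_freqs), here (num_frames, n_freqs).
num_frames = 100
spectrogram = np.random.rand(num_frames, n_freqs).astype(np.float32)

# "A * melscale_fbanks(A.size(-1), ...)" in the docstring is a matrix product:
# every output column is the spectrogram weighted by one mel filter.
mel_spectrogram = spectrogram @ filter_banks
assert mel_spectrogram.shape == (num_frames, n_mels)
```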
diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 5a733afc0d8a..bc8b6b4b96cd 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -43,7 +43,7 @@ class CLAPTextConfig(PretrainedConfig): Args: vocab_size (`int`, *optional*, defaults to 30522): Vocabulary size of the RoBERTa model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`CLAPTextModel`] or [`TFCLAPTextModel`]. + `inputs_ids` passed when calling [`CLAPTextModel`]. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (`int`, *optional*, defaults to 12): diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 1c9e31dcceee..116ac86cdfd1 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -297,7 +297,7 @@ class CLAPDropPath(nn.Module): """ def __init__(self, drop_prob=None): - super(CLAPDropPath, self).__init__() + super().__init__() self.drop_prob = drop_prob def forward(self, hidden_states): @@ -321,7 +321,7 @@ class CLAPAudioAFFBlock(nn.Module): """ def __init__(self, config: CLAPAudioConfig): - super(CLAPAudioAFFBlock, self).__init__() + super().__init__() channels = config.patch_embeds_hidden_size downsize_ratio = config.aff_block_r inter_channels = int(channels // downsize_ratio) From 948eb0ae089a96605bded8ebcee9cb41131d63b5 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 10 Feb 2023 08:57:43 +0000 Subject: [PATCH 130/197] fix model output --- src/transformers/models/clap/modeling_clap.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 116ac86cdfd1..19f249f064fd 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -207,9 +207,15 @@ class CLAPAudioModelOutput(ModelOutput): embedding (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. """ framewise_output: torch.FloatTensor = None @@ -237,9 +243,15 @@ class CLAPAudioModelOutputWithProjection(ModelOutput): embedding (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. 
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. """ audio_embeds: Optional[torch.FloatTensor] = None From f75c02a4a48dfc5a410c08151e0548f234f78385 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 10 Feb 2023 08:59:58 +0000 Subject: [PATCH 131/197] fix comment --- src/transformers/models/clap/modeling_clap.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 19f249f064fd..3261c5271328 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1073,7 +1073,7 @@ def custom_forward(*inputs): if output_hidden_states and output_hidden_states_before_downsampling: batch_size, _, hidden_size = hidden_states_before_downsampling.shape - # rearrange b (h w) c -> b c h w + # rearrange batch_size (height width) channels -> batch_size channel height width # here we use the original (not downsampled) height and width reshaped_hidden_state = hidden_states_before_downsampling.view( batch_size, *(output_dimensions[0], output_dimensions[1]), hidden_size @@ -1083,7 +1083,7 @@ def custom_forward(*inputs): all_reshaped_hidden_states += (reshaped_hidden_state,) elif output_hidden_states and not output_hidden_states_before_downsampling: batch_size, _, hidden_size = hidden_states.shape - # rearrange b (h w) c -> b c h w + # rearrange batch_size (height width) channels -> batch_size channel height width reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size) reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2) all_hidden_states += (hidden_states,) From 7d7a0000e53dc3a4f05d2301004723c199b62bec Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 10 Feb 2023 09:39:00 +0000 Subject: [PATCH 132/197] make fixup --- src/transformers/feature_extraction_sequence_utils.py | 7 ++++--- src/transformers/models/clap/modeling_clap.py | 6 ++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/transformers/feature_extraction_sequence_utils.py b/src/transformers/feature_extraction_sequence_utils.py index 3e52f56326ed..72f76af53b81 100644 --- a/src/transformers/feature_extraction_sequence_utils.py +++ b/src/transformers/feature_extraction_sequence_utils.py @@ -522,9 +522,10 @@ def get_mel_filter_banks( Scale to use: `htk` or `slaney`. (Default: `htk`) Returns: - `numpy.ndarray`: Triangular filter banks (fb matrix) of size (`n_freqs`, `n_mels`) meaning number of frequencies to - highlight/apply to x the number of filterbanks. 
Each column is a filterbank so that assuming there is a - matrix A of size (..., `n_freqs`), the applied result would be `A * melscale_fbanks(A.size(-1), ...)`. + `numpy.ndarray`: Triangular filter banks (fb matrix) of size (`n_freqs`, `n_mels`) meaning number of + frequencies to highlight/apply to x the number of filterbanks. Each column is a filterbank so that assuming + there is a matrix A of size (..., `n_freqs`), the applied result would be `A * melscale_fbanks(A.size(-1), + ...)`. """ diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 3261c5271328..2eaa244d4805 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -207,7 +207,8 @@ class CLAPAudioModelOutput(ModelOutput): embedding (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. @@ -243,7 +244,8 @@ class CLAPAudioModelOutputWithProjection(ModelOutput): embedding (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
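The comment rewritten in the `fix comment` hunk above spells out what the audio encoder does when it exposes hidden states: a sequence of `height * width` patch tokens is viewed as a 2D grid and then permuted into channels-first layout. The self-contained sketch below reproduces just that rearrangement; the dimensions are made-up assumptions, the real ones come from the audio encoder configuration.

```python
import torch

# Illustrative dimensions only; the real values depend on the audio config.
batch_size, height, width, hidden_size = 2, 8, 8, 96

# Hidden states as they flow through the encoder: (batch_size, height * width, hidden_size).
hidden_states = torch.randn(batch_size, height * width, hidden_size)

# "rearrange batch_size (height width) channels -> batch_size channel height width"
reshaped = hidden_states.view(batch_size, height, width, hidden_size)
reshaped = reshaped.permute(0, 3, 1, 2)  # (batch_size, hidden_size, height, width)

assert reshaped.shape == (batch_size, hidden_size, height, width)
```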
From 63d073a154be42afbd1fb6cd1e9a60721de66f9b Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 10 Feb 2023 09:39:07 +0000 Subject: [PATCH 133/197] make fixup --- src/transformers/models/clap/modeling_clap.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 2eaa244d4805..98544c49abd9 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -155,13 +155,6 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: return nn.functional.cross_entropy(logits, labels) -# Copied from transformers.models.clip.modeling_clip.clip_loss with clip->clap, image->audio -def clap_loss(similarity: torch.Tensor) -> torch.Tensor: - caption_loss = contrastive_loss(similarity) - audio_loss = contrastive_loss(similarity.t()) - return (caption_loss + audio_loss) / 2.0 - - @dataclass # Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->CLAP class CLAPTextModelOutput(ModelOutput): @@ -2318,7 +2311,9 @@ def forward( loss = None if return_loss: - loss = clap_loss(logits_per_text) + caption_loss = contrastive_loss(logits_per_text) + audio_loss = contrastive_loss(logits_per_text.t()) + loss = (caption_loss + audio_loss) / 2.0 if not return_dict: output = (logits_per_audio, logits_per_text, text_embeds, audio_embeds, text_outputs, audio_outputs) From 3bea4277240ff5cbc761859c37390b3278ccbf27 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 10 Feb 2023 10:57:18 +0000 Subject: [PATCH 134/197] rename to `Clap` --- README.md | 2 +- README_es.md | 2 +- README_hd.md | 2 +- README_ja.md | 2 +- README_ko.md | 2 +- README_zh-hans.md | 2 +- README_zh-hant.md | 2 +- docs/source/en/_toctree.yml | 2 +- docs/source/en/index.mdx | 4 +-- docs/source/en/model_doc/clap.mdx | 4 +-- .../models/auto/configuration_auto.py | 2 +- .../models/clap/configuration_clap.py | 12 ++++----- .../convert_clap_original_pytorch_to_hf.py | 4 +-- .../models/clap/feature_extraction_clap.py | 6 ++--- src/transformers/models/clap/modeling_clap.py | 26 +++++++++---------- .../models/clap/processing_clap.py | 4 +-- .../zero_shot_audio_classification.py | 2 +- .../clap/test_feature_extraction_clap.py | 4 +-- tests/models/clap/test_modeling_clap.py | 8 +++--- ...ipelines_zero_shot_audio_classification.py | 2 +- 20 files changed, 47 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index 7244b380f680..b55d5023b665 100644 --- a/README.md +++ b/README.md @@ -295,7 +295,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. 
**[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. -1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. +1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. diff --git a/README_es.md b/README_es.md index e046ffb4a073..9201a11110fb 100644 --- a/README_es.md +++ b/README_es.md @@ -288,7 +288,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. -1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. +1. 
**[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. diff --git a/README_hd.md b/README_hd.md index 8959088672b1..64bb763ca077 100644 --- a/README_hd.md +++ b/README_hd.md @@ -260,7 +260,7 @@ conda install -c huggingface transformers 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (इनरिया/फेसबुक/सोरबोन से) साथ में कागज [CamemBERT: एक टेस्टी फ्रेंच लैंग्वेज मॉडल](https:// arxiv.org/abs/1911.03894) लुई मार्टिन*, बेंजामिन मुलर*, पेड्रो जेवियर ऑर्टिज़ सुआरेज़*, योआन ड्यूपॉन्ट, लॉरेंट रोमरी, एरिक विलेमोन्टे डे ला क्लर्जरी, जैमे सेडाह और बेनोइट सगोट द्वारा। 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google रिसर्च से) साथ में दिया गया पेपर [कैनाइन: प्री-ट्रेनिंग ए एफिशिएंट टोकनाइजेशन-फ्री एनकोडर फॉर लैंग्वेज रिप्रेजेंटेशन]( https://arxiv.org/abs/2103.06874) जोनाथन एच क्लार्क, डैन गैरेट, यूलिया टर्क, जॉन विएटिंग द्वारा। 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. -1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI से) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. द्वाराअनुसंधान पत्र [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) के साथ जारी किया गया +1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI से) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. द्वाराअनुसंधान पत्र [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) के साथ जारी किया गया 1. 
**[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI से) साथ वाला पेपर [लर्निंग ट्रांसफरेबल विजुअल मॉडल फ्रॉम नेचुरल लैंग्वेज सुपरविजन](https://arxiv.org /abs/2103.00020) एलेक रैडफोर्ड, जोंग वूक किम, क्रिस हैलासी, आदित्य रमेश, गेब्रियल गोह, संध्या अग्रवाल, गिरीश शास्त्री, अमांडा एस्केल, पामेला मिश्किन, जैक क्लार्क, ग्रेचेन क्रुएगर, इल्या सुत्स्केवर द्वारा। 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (सेल्सफोर्स से) साथ में पेपर [प्रोग्राम सिंथेसिस के लिए एक संवादात्मक प्रतिमान](https://arxiv.org/abs/2203.13474) एरिक निजकैंप, बो पैंग, हिरोआकी हयाशी, लिफू तू, हुआन वांग, यिंगबो झोउ, सिल्वियो सावरेस, कैमिंग जिओंग रिलीज। diff --git a/README_ja.md b/README_ja.md index 0a56df93c138..ff745247d7b8 100644 --- a/README_ja.md +++ b/README_ja.md @@ -322,7 +322,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne から) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot から公開された研究論文: [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research から) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting から公開された研究論文: [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys から) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou から公開された研究論文: [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) -1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI から) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. から公開された研究論文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) +1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI から) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. から公開された研究論文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI から) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever から公開された研究論文: [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen から) Timo Lüddecke and Alexander Ecker から公開された研究論文: [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 1. 
**[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (Salesforce から) Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong から公開された研究論文: [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) diff --git a/README_ko.md b/README_ko.md index ee2f4852af89..83af0bc684d6 100644 --- a/README_ko.md +++ b/README_ko.md @@ -237,7 +237,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne 에서) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 의 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 논문과 함께 발표했습니다. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research 에서) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 의 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 논문과 함께 발표했습니다. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys 에서) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 의 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 논문과 함께 발표했습니다. -1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI 에서 제공)은 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.의 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687)논문과 함께 발표했습니다. +1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI 에서 제공)은 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.의 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687)논문과 함께 발표했습니다. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI 에서) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 의 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 논문과 함께 발표했습니다. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen 에서) Timo Lüddecke and Alexander Ecker 의 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 논문과 함께 발표했습니다. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (Salesforce 에서) Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 의 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index 23543027caba..a3ed275f5fe2 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -261,7 +261,7 @@ conda install -c huggingface transformers 1. 
**[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (来自 Inria/Facebook/Sorbonne) 伴随论文 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 由 Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 发布。 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (来自 OFA-Sys) 伴随论文 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 由 An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 发布。 -1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (来自 LAION-AI) 伴随论文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) 由 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov 发布。 +1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (来自 LAION-AI) 伴随论文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) 由 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov 发布。 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (来自 University of Göttingen) 伴随论文 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 由 Timo Lüddecke and Alexander Ecker 发布。 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 860e6e008bbf..84da51d06dad 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -273,7 +273,7 @@ conda install -c huggingface transformers 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. 
**[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. -1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. +1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 1c85e7a96af0..23a506853e23 100755 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -484,7 +484,7 @@ - local: model_doc/audio-spectrogram-transformer title: Audio Spectrogram Transformer - local: model_doc/clap - title: CLAP + title: Clap - local: model_doc/hubert title: Hubert - local: model_doc/mctct diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 7b18da09a1df..2087140f281f 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -74,7 +74,7 @@ The documentation is organized into five sections: 1. **[CamemBERT](model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. -1. 
**[CLAP](model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. +1. **[Clap](model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. @@ -261,7 +261,7 @@ Flax), PyTorch, and/or TensorFlow. | CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | | Chinese-CLIP | ❌ | ❌ | ✅ | ❌ | ❌ | -| CLAP | ❌ | ❌ | ✅ | ❌ | ❌ | +| Clap | ❌ | ❌ | ✅ | ❌ | ❌ | | CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | | CLIPSeg | ❌ | ❌ | ✅ | ❌ | ❌ | | CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/clap.mdx b/docs/source/en/model_doc/clap.mdx index 7a756e3375ee..fedaad39bcda 100644 --- a/docs/source/en/model_doc/clap.mdx +++ b/docs/source/en/model_doc/clap.mdx @@ -17,14 +17,14 @@ specific language governing permissions and limitations under the License. The CLAP model was proposed in [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/pdf/2211.06687.pdf) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. -The CLAP model uses a Swin Transformer on the input fused mel spectrogram as the audio encoder, and a RoBERTa model for the text encoder. +The Clap model uses a Swin Transformer on the input fused mel spectrogram as the audio encoder, and a RoBERTa model for the text encoder. The abstract from the paper is the following: *Contrastive learning has shown remarkable success in the field of multimodal representation learning. In this paper, we propose a pipeline of contrastive language-audio pretraining to develop an audio representation by combining audio data with natural language descriptions. To accomplish this target, we first release LAION-Audio-630K, a large collection of 633,526 audio-text pairs from different data sources. Second, we construct a contrastive language-audio pretraining model by considering different audio encoders and text encoders. We incorporate the feature fusion mechanism and keyword-to-caption augmentation into the model design to further enable the model to process audio inputs of variable lengths and enhance the performance.
Third, we perform comprehensive experiments to evaluate our model across three tasks: text-to-audio retrieval, zero-shot audio classification, and supervised audio classification. The results demonstrate that our model achieves superior performance in text-to-audio retrieval task. In audio classification tasks, the model achieves state-of-the-art performance in the zeroshot setting and is able to obtain performance comparable to models' results in the non-zero-shot setting. LAION-Audio-6* This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) and [Arthur Zucker](https://huggingface.co/ArtZucker) . -The original code can be found [here](https://github.com/LAION-AI/CLAP). +The original code can be found [here](https://github.com/LAION-AI/Clap). ## CLAPConfig diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 770621ee618c..9e2fb9ab83af 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -381,7 +381,7 @@ ("camembert", "CamemBERT"), ("canine", "CANINE"), ("chinese_clip", "Chinese-CLIP"), - ("clap", "CLAP"), + ("clap", "Clap"), ("clip", "CLIP"), ("clipseg", "CLIPSeg"), ("codegen", "CodeGen"), diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index bc8b6b4b96cd..f96fb3cb3142 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" CLAP model configuration""" +""" Clap model configuration""" import copy import os @@ -164,8 +164,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], class CLAPAudioConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`CLAPAudioModel`]. It is used to instantiate a - CLAP audio encoder according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the audio encoder of the CLAP + Clap audio encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the audio encoder of the Clap [laion-ai/base](https://huggingface.co/laion-ai/base) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the @@ -331,8 +331,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], class CLAPConfig(PretrainedConfig): r""" [`CLAPConfig`] is the configuration class to store the configuration of a [`CLAPModel`]. It is used to instantiate - a CLAP model according to the specified arguments, defining the text model and audio model configs. Instantiating a - configuration with the defaults will yield a similar configuration to that of the CLAP + a Clap model according to the specified arguments, defining the text model and audio model configs. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Clap [laion-ai/base](https://huggingface.co/laion-ai/base) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. 
Read the @@ -346,7 +346,7 @@ class CLAPConfig(PretrainedConfig): projection_dim (`int`, *optional*, defaults to 512): Dimentionality of text and audio projection layers. logit_scale_init_value (`float`, *optional*, defaults to 2.6592): - The inital value of the *logit_scale* paramter. Default is used as per the original CLAP implementation. + The inital value of the *logit_scale* paramter. Default is used as per the original Clap implementation. fusion_num_hidden_layers (`int`, *optional*, defaults to 2): [description] projection_dim (`int`, *optional*, defaults to 512): diff --git a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py b/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py index 258d3870eb73..62848f4615d0 100644 --- a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py +++ b/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py @@ -17,7 +17,7 @@ import re import torch -from CLAP import create_model +from Clap import create_model from transformers import AutoFeatureExtractor, CLAPConfig, CLAPModel @@ -71,7 +71,7 @@ def rename_state_dict(state_dict): elif re.match(text_projection_pattern, key): projecton_layer = int(re.match(text_projection_pattern, key).group(1)) - # Because in CLAP they use `nn.Sequential`... + # Because in Clap they use `nn.Sequential`... transformers_projection_layer = 1 if projecton_layer == 0 else 2 key = key.replace(f"_projection.{projecton_layer}.", f"_projection.linear{transformers_projection_layer}.") diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index d12207650761..2611eee58c0e 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Feature extractor class for CLAP.""" +"""Feature extractor class for Clap.""" import copy @@ -31,7 +31,7 @@ class CLAPFeatureExtractor(SequenceFeatureExtractor): r""" - Constructs a CLAP feature extractor. + Constructs a Clap feature extractor. This feature extractor inherits from [`CLAPFeatureExtractor`] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. @@ -151,7 +151,7 @@ def to_dict(self) -> Dict[str, Any]: def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[np.array] = None) -> np.ndarray: """ - Compute the log-Mel spectrogram of the provided `waveform` using the `hanning` window. In CLAP, two different + Compute the log-Mel spectrogram of the provided `waveform` using the `hanning` window. In Clap, two different banks of filters are used depending on the truncation pattern: - `self.mel_filters`: they correspond to the defaults parameters of `torchaduio` which can be obtained from calling `torchaudio.transforms.MelSpectrogram().mel_scale.fb`. These filters are used when `truncation` diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 98544c49abd9..a017f19fa880 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch CLAP model.""" +""" PyTorch Clap model.""" import collections import math from dataclasses import dataclass @@ -52,7 +52,7 @@ ] -# Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/utils.py#L176 +# Adapted from https://github.com/LAION-AI/Clap/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/utils.py#L176 def do_mixup(hidden_states, mixup_lambda): """ MIXUP is a data augmentation method, proposed by Hongyi Zhang et al on 25 Oct. 2017. @@ -73,7 +73,7 @@ def do_mixup(hidden_states, mixup_lambda): return out -# Adapted from: https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/utils.py#L191 +# Adapted from: https://github.com/LAION-AI/Clap/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/utils.py#L191 def interpolate(hidden_states, ratio): """ Interpolate data in time domain. This is used to compensate the resolution reduction in downsampling of a CNN. @@ -90,7 +90,7 @@ def interpolate(hidden_states, ratio): return upsampled -# Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/htsat.py#L249 +# Adapted from https://github.com/LAION-AI/Clap/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/htsat.py#L249 def window_partition(hidden_states, window_size): """ Returns the resized hidden states. The output shape should be `(batch_size * num_windows, window_size, window_size, @@ -111,7 +111,7 @@ def window_partition(hidden_states, window_size): return windows -# Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/htsat.py#L263 +# Adapted from https://github.com/LAION-AI/Clap/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/htsat.py#L263 def window_reverse(windows, window_size, height, width): """ Args: @@ -156,7 +156,7 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: @dataclass -# Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->CLAP +# Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->Clap class CLAPTextModelOutput(ModelOutput): """ Base class for text model's outputs that also contains a pooling of the last hidden states. @@ -259,7 +259,7 @@ class CLAPAudioModelOutputWithProjection(ModelOutput): @dataclass -# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->CLAP, vision->audio, Vision->Audio, image->audio +# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->Clap, vision->audio, Vision->Audio, image->audio class CLAPOutput(ModelOutput): """ Args: @@ -321,10 +321,10 @@ def forward(self, hidden_states): return output -# Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/feature_fusion.py#L133 +# Adapted from https://github.com/LAION-AI/Clap/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/feature_fusion.py#L133 class CLAPAudioAFFBlock(nn.Module): r""" - AFF Block from CLAP, since in CLAP we are always in 2D mode, it is not needed to implement the 1D version. + AFF Block from Clap, since in Clap we are always in 2D mode, it is not needed to implement the 1D version. 
""" def __init__(self, config: CLAPAudioConfig): @@ -2182,7 +2182,7 @@ def get_text_features( >>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt") >>> text_features = model.get_text_features(**inputs) ```""" - # Use CLAP model's config for some fields (if specified) instead of those of audio & text components. + # Use Clap model's config for some fields (if specified) instead of those of audio & text components. output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -2270,7 +2270,7 @@ def forward( >>> logits_per_audio = outputs.logits_per_audio # this is the audio-text similarity score >>> probs = logits_per_audio.softmax(dim=-1) # we can take the softmax to get the label probabilities ```""" - # Use CLAP model's config for some fields (if specified) instead of those of audio & text components. + # Use Clap model's config for some fields (if specified) instead of those of audio & text components. output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -2332,7 +2332,7 @@ def forward( @add_start_docstrings( """ - CLAP Text Model with a projection layer on top (a linear layer on top of the pooled output). + Clap Text Model with a projection layer on top (a linear layer on top of the pooled output). """, CLAP_START_DOCSTRING, ) @@ -2408,7 +2408,7 @@ def forward( @add_start_docstrings( """ - CLAP Audio Model with a projection layer on top (a linear layer on top of the pooled output). + Clap Audio Model with a projection layer on top (a linear layer on top of the pooled output). """, CLAP_START_DOCSTRING, ) diff --git a/src/transformers/models/clap/processing_clap.py b/src/transformers/models/clap/processing_clap.py index 1a38026896da..755147af2184 100644 --- a/src/transformers/models/clap/processing_clap.py +++ b/src/transformers/models/clap/processing_clap.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Audio/Text processor class for CLAP +Audio/Text processor class for Clap """ from ...processing_utils import ProcessorMixin @@ -22,7 +22,7 @@ class CLAPProcessor(ProcessorMixin): r""" - Constructs a CLAP processor which wraps a CLAP feature extractor and a CLAP tokenizer into a single processor. + Constructs a Clap processor which wraps a Clap feature extractor and a Clap tokenizer into a single processor. [`CLAPProcessor`] offers all the functionalities of [`CLAPFeatureExtractor`] and [`CLAPTokenizerFast`]. See the [`~CLAPProcessor.__call__`] and [`~CLAPProcessor.decode`] for more information. 
diff --git a/src/transformers/pipelines/zero_shot_audio_classification.py b/src/transformers/pipelines/zero_shot_audio_classification.py index 072606acf1fd..aa310dd1c4f3 100644 --- a/src/transformers/pipelines/zero_shot_audio_classification.py +++ b/src/transformers/pipelines/zero_shot_audio_classification.py @@ -134,7 +134,7 @@ def _forward(self, model_inputs): candidate_label = model_inputs.pop("candidate_label") outputs = self.model(**model_inputs) - # CLAP does crossproduct scoring by default, so we're only + # Clap does crossproduct scoring by default, so we're only # interested in the results where audio and text are in the same # batch position. diag = torch.diagonal diff --git a/tests/models/clap/test_feature_extraction_clap.py b/tests/models/clap/test_feature_extraction_clap.py index 50f86d046e0a..0b7f3896758b 100644 --- a/tests/models/clap/test_feature_extraction_clap.py +++ b/tests/models/clap/test_feature_extraction_clap.py @@ -50,7 +50,7 @@ def floats_list(shape, scale=1.0, rng=None, name=None): @require_torch @require_torchaudio -# Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTester with Whisper->CLAP +# Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTester with Whisper->Clap class CLAPFeatureExtractionTester(unittest.TestCase): def __init__( self, @@ -109,7 +109,7 @@ def _flatten(list_of_lists): @require_torch @require_torchaudio -# Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest with Whisper->CLAP +# Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest with Whisper->Clap class CLAPFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): feature_extraction_class = CLAPFeatureExtractor diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index c2a821f96264..5b5708f4a039 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch CLAP model. """ +""" Testing suite for the PyTorch Clap model. """ import inspect @@ -161,7 +161,7 @@ def prepare_config_and_inputs_for_common(self): @require_torch class CLAPAudioModelTest(ModelTesterMixin, unittest.TestCase): """ - Here we also overwrite some of the tests of test_modeling_common.py, as CLAP does not use input_ids, inputs_embeds, + Here we also overwrite some of the tests of test_modeling_common.py, as Clap does not use input_ids, inputs_embeds, attention_mask and seq_length.
""" @@ -516,7 +516,7 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_common_attributes(self): pass - # override as the `logit_scale` parameter initilization is different for CLAP + # override as the `logit_scale` parameter initilization is different for Clap def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -554,7 +554,7 @@ def _create_and_check_torchscript(self, config, inputs_dict): try: input_ids = inputs_dict["input_ids"] - input_features = inputs_dict["input_features"] # CLAP needs input_features + input_features = inputs_dict["input_features"] # Clap needs input_features traced_model = torch.jit.trace(model, (input_ids, input_features)) except RuntimeError: self.fail("Couldn't trace module.") diff --git a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py index 5acd07ffab4f..5b13deac16b4 100644 --- a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py +++ b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py @@ -25,7 +25,7 @@ @require_torch class ZeroShotAudioClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): # Deactivating auto tests since we don't have a good MODEL_FOR_XX mapping, - # and only CLAP would be there for now. + # and only Clap would be there for now. # model_mapping = {CLAPConfig: CLAPModel} @require_torch From c20981dae9cba7ef7c6c3f38424b09e54896357d Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 10 Feb 2023 11:06:54 +0000 Subject: [PATCH 135/197] replace to `Clap` --- docs/source/en/model_doc/clap.mdx | 40 +-- src/transformers/__init__.py | 52 +-- .../models/auto/configuration_auto.py | 4 +- .../models/auto/feature_extraction_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 2 +- .../models/auto/processing_auto.py | 2 +- src/transformers/models/clap/__init__.py | 56 +-- .../models/clap/configuration_clap.py | 82 ++--- .../convert_clap_original_pytorch_to_hf.py | 6 +- .../models/clap/feature_extraction_clap.py | 4 +- src/transformers/models/clap/modeling_clap.py | 340 +++++++++--------- .../models/clap/processing_clap.py | 20 +- .../zero_shot_audio_classification.py | 2 +- src/transformers/utils/dummy_pt_objects.py | 14 +- .../clap/test_feature_extraction_clap.py | 14 +- tests/models/clap/test_modeling_clap.py | 124 +++---- tests/models/clap/test_processor_clap.py | 26 +- ...ipelines_zero_shot_audio_classification.py | 2 +- utils/check_repo.py | 8 +- 19 files changed, 400 insertions(+), 400 deletions(-) diff --git a/docs/source/en/model_doc/clap.mdx b/docs/source/en/model_doc/clap.mdx index fedaad39bcda..1e75135e572a 100644 --- a/docs/source/en/model_doc/clap.mdx +++ b/docs/source/en/model_doc/clap.mdx @@ -27,51 +27,51 @@ This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) The original code can be found [here](https://github.com/LAION-AI/Clap). 
-## CLAPConfig +## ClapConfig -[[autodoc]] CLAPConfig +[[autodoc]] ClapConfig - from_text_audio_configs -## CLAPTextConfig +## ClapTextConfig -[[autodoc]] CLAPTextConfig +[[autodoc]] ClapTextConfig -## CLAPAudioConfig +## ClapAudioConfig -[[autodoc]] CLAPAudioConfig +[[autodoc]] ClapAudioConfig -## CLAPFeatureExtractor +## ClapFeatureExtractor -[[autodoc]] CLAPFeatureExtractor +[[autodoc]] ClapFeatureExtractor -## CLAPProcessor +## ClapProcessor -[[autodoc]] CLAPProcessor +[[autodoc]] ClapProcessor -## CLAPModel +## ClapModel -[[autodoc]] CLAPModel +[[autodoc]] ClapModel - forward - get_text_features - get_audio_features -## CLAPTextModel +## ClapTextModel -[[autodoc]] CLAPTextModel +[[autodoc]] ClapTextModel - forward -## CLAPTextModelWithProjection +## ClapTextModelWithProjection -[[autodoc]] CLAPTextModelWithProjection +[[autodoc]] ClapTextModelWithProjection - forward -## CLAPAudioModel +## ClapAudioModel -[[autodoc]] CLAPAudioModel +[[autodoc]] ClapAudioModel - forward -## CLAPAudioModelWithProjection +## ClapAudioModelWithProjection -[[autodoc]] CLAPAudioModelWithProjection +[[autodoc]] ClapAudioModelWithProjection - forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 043d649b2a49..00f8bc1d1c5a 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -207,12 +207,12 @@ "ChineseCLIPVisionConfig", ], "models.clap": [ - "CLAP_PRETRAINED_CONFIG_ARCHIVE_MAP", - "CLAPAudioConfig", - "CLAPConfig", - "CLAPFeatureExtractor", - "CLAPProcessor", - "CLAPTextConfig", + "Clap_PRETRAINED_CONFIG_ARCHIVE_MAP", + "ClapAudioConfig", + "ClapConfig", + "ClapFeatureExtractor", + "ClapProcessor", + "ClapTextConfig", ], "models.clip": [ "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -1232,13 +1232,13 @@ ) _import_structure["models.clap"].extend( [ - "CLAP_PRETRAINED_MODEL_ARCHIVE_LIST", - "CLAPAudioModel", - "CLAPAudioModelWithProjection", - "CLAPModel", - "CLAPPreTrainedModel", - "CLAPTextModel", - "CLAPTextModelWithProjection", + "Clap_PRETRAINED_MODEL_ARCHIVE_LIST", + "ClapAudioModel", + "ClapAudioModelWithProjection", + "ClapModel", + "ClapPreTrainedModel", + "ClapTextModel", + "ClapTextModelWithProjection", ] ) _import_structure["models.clip"].extend( @@ -3707,12 +3707,12 @@ ChineseCLIPVisionConfig, ) from .models.clap import ( - CLAP_PRETRAINED_CONFIG_ARCHIVE_MAP, - CLAPAudioConfig, - CLAPConfig, - CLAPFeatureExtractor, - CLAPProcessor, - CLAPTextConfig, + Clap_PRETRAINED_CONFIG_ARCHIVE_MAP, + ClapAudioConfig, + ClapConfig, + ClapFeatureExtractor, + ClapProcessor, + ClapTextConfig, ) from .models.clip import ( CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -4603,13 +4603,13 @@ ChineseCLIPVisionModel, ) from .models.clap import ( - CLAP_PRETRAINED_MODEL_ARCHIVE_LIST, - CLAPAudioModel, - CLAPAudioModelWithProjection, - CLAPModel, - CLAPPreTrainedModel, - CLAPTextModel, - CLAPTextModelWithProjection, + Clap_PRETRAINED_MODEL_ARCHIVE_LIST, + ClapAudioModel, + ClapAudioModelWithProjection, + ClapModel, + ClapPreTrainedModel, + ClapTextModel, + ClapTextModelWithProjection, ) from .models.clip import ( CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 9e2fb9ab83af..5cffd6216387 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -49,7 +49,7 @@ ("camembert", "CamembertConfig"), ("canine", "CanineConfig"), ("chinese_clip", "ChineseCLIPConfig"), - ("clap", "CLAPConfig"), + ("clap", 
"ClapConfig"), ("clip", "CLIPConfig"), ("clipseg", "CLIPSegConfig"), ("codegen", "CodeGenConfig"), @@ -220,7 +220,7 @@ ("camembert", "CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("canine", "CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("chinese_clip", "CHINESE_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("clap", "CLAP_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("clap", "Clap_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("clip", "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("clipseg", "CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("codegen", "CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 326c06c6207c..caf27f217660 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -40,7 +40,7 @@ ("audio-spectrogram-transformer", "ASTFeatureExtractor"), ("beit", "BeitFeatureExtractor"), ("chinese_clip", "ChineseCLIPFeatureExtractor"), - ("clap", "CLAPFeatureExtractor"), + ("clap", "ClapFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), ("clipseg", "ViTFeatureExtractor"), ("conditional_detr", "ConditionalDetrFeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 1c875c82aa1f..d3eb1eccee31 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -47,7 +47,7 @@ ("camembert", "CamembertModel"), ("canine", "CanineModel"), ("chinese_clip", "ChineseCLIPModel"), - ("clap", "CLAPModel"), + ("clap", "ClapModel"), ("clip", "CLIPModel"), ("clipseg", "CLIPSegModel"), ("codegen", "CodeGenModel"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 3753a51ecc69..bac6fe78c192 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -46,7 +46,7 @@ ("blip-2", "Blip2Processor"), ("bridgetower", "BridgeTowerProcessor"), ("chinese_clip", "ChineseCLIPProcessor"), - ("clap", "CLAPProcessor"), + ("clap", "ClapProcessor"), ("clip", "CLIPProcessor"), ("clipseg", "CLIPSegProcessor"), ("flava", "FlavaProcessor"), diff --git a/src/transformers/models/clap/__init__.py b/src/transformers/models/clap/__init__.py index 36a3ec94237f..e834cb11aadd 100644 --- a/src/transformers/models/clap/__init__.py +++ b/src/transformers/models/clap/__init__.py @@ -22,14 +22,14 @@ _import_structure = { "configuration_clap": [ - "CLAP_PRETRAINED_CONFIG_ARCHIVE_MAP", - "CLAPAudioConfig", - "CLAPConfig", - "CLAPTextConfig", + "Clap_PRETRAINED_CONFIG_ARCHIVE_MAP", + "ClapAudioConfig", + "ClapConfig", + "ClapTextConfig", ], - "feature_extraction_clap": ["CLAPFeatureExtractor"], - "processing_clap": ["CLAPProcessor"], - "tokenization_clap": ["CLAPTokenizer"], + "feature_extraction_clap": ["ClapFeatureExtractor"], + "processing_clap": ["ClapProcessor"], + "tokenization_clap": ["ClapTokenizer"], } try: @@ -39,25 +39,25 @@ pass else: _import_structure["modeling_clap"] = [ - "CLAP_PRETRAINED_MODEL_ARCHIVE_LIST", - "CLAPModel", - "CLAPPreTrainedModel", - "CLAPTextModel", - "CLAPTextModelWithProjection", - "CLAPAudioModel", - "CLAPAudioModelWithProjection", + "Clap_PRETRAINED_MODEL_ARCHIVE_LIST", + "ClapModel", + "ClapPreTrainedModel", + "ClapTextModel", + "ClapTextModelWithProjection", + "ClapAudioModel", + "ClapAudioModelWithProjection", ] if TYPE_CHECKING: from .configuration_clap import ( - CLAP_PRETRAINED_CONFIG_ARCHIVE_MAP, - 
CLAPAudioConfig, - CLAPConfig, - CLAPTextConfig, + Clap_PRETRAINED_CONFIG_ARCHIVE_MAP, + ClapAudioConfig, + ClapConfig, + ClapTextConfig, ) - from .feature_extraction_clap import CLAPFeatureExtractor - from .processing_clap import CLAPProcessor - from .tokenization_clap import CLAPTokenizer + from .feature_extraction_clap import ClapFeatureExtractor + from .processing_clap import ClapProcessor + from .tokenization_clap import ClapTokenizer try: if not is_torch_available(): @@ -66,13 +66,13 @@ pass else: from .modeling_clap import ( - CLAP_PRETRAINED_MODEL_ARCHIVE_LIST, - CLAPAudioModel, - CLAPAudioModelWithProjection, - CLAPModel, - CLAPPreTrainedModel, - CLAPTextModel, - CLAPTextModelWithProjection, + Clap_PRETRAINED_MODEL_ARCHIVE_LIST, + ClapAudioModel, + ClapAudioModelWithProjection, + ClapModel, + ClapPreTrainedModel, + ClapTextModel, + ClapTextModelWithProjection, ) else: diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index f96fb3cb3142..dc27330ced6e 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -24,14 +24,14 @@ logger = logging.get_logger(__name__) -CLAP_PRETRAINED_CONFIG_ARCHIVE_MAP = { +Clap_PRETRAINED_CONFIG_ARCHIVE_MAP = { "laion-ai/base": "https://huggingface.co/laion-ai/base/resolve/main/config.json", } -class CLAPTextConfig(PretrainedConfig): +class ClapTextConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`CLAPTextModel`] or a [`TFCLAPTextModel`]. It is + This is the configuration class to store the configuration of a [`ClapTextModel`] or a [`TFClapTextModel`]. It is used to instantiate a RoBERTa model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the RoBERTa [roberta-base](https://huggingface.co/roberta-base) architecture. @@ -43,7 +43,7 @@ class CLAPTextConfig(PretrainedConfig): Args: vocab_size (`int`, *optional*, defaults to 30522): Vocabulary size of the RoBERTa model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`CLAPTextModel`]. + `inputs_ids` passed when calling [`ClapTextModel`]. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (`int`, *optional*, defaults to 12): @@ -63,7 +63,7 @@ class CLAPTextConfig(PretrainedConfig): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). type_vocab_size (`int`, *optional*, defaults to 2): - The vocabulary size of the `token_type_ids` passed when calling [`CLAPTextModel`] or [`TFCLAPTextModel`]. + The vocabulary size of the `token_type_ids` passed when calling [`ClapTextModel`] or [`TFClapTextModel`]. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
layer_norm_eps (`float`, *optional*, defaults to 1e-12): @@ -85,13 +85,13 @@ class CLAPTextConfig(PretrainedConfig): Examples: ```python - >>> from transformers import CLAPTextConfig, CLAPTextModel + >>> from transformers import ClapTextConfig, ClapTextModel >>> # Initializing a RoBERTa configuration - >>> configuration = CLAPTextConfig() + >>> configuration = ClapTextConfig() >>> # Initializing a model (with random weights) from the configuration - >>> model = CLAPTextModel(configuration) + >>> model = ClapTextModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config @@ -148,7 +148,7 @@ def __init__( def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - # get the text config dict if we are loading from CLAPConfig + # get the text config dict if we are loading from ClapConfig if config_dict.get("model_type") == "clap": config_dict = config_dict["text_config"] @@ -161,9 +161,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], return cls.from_dict(config_dict, **kwargs) -class CLAPAudioConfig(PretrainedConfig): +class ClapAudioConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`CLAPAudioModel`]. It is used to instantiate a + This is the configuration class to store the configuration of a [`ClapAudioModel`]. It is used to instantiate a Clap audio encoder according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the audio encoder of the Clap [laion-ai/base](https://huggingface.co/laion-ai/base) architecture. @@ -175,10 +175,10 @@ class CLAPAudioConfig(PretrainedConfig): window_size (`int`, *optional*, defaults to 8): [description] num_mel_bins (`int`, *optional*, defaults to 64): - Number of mel features used per frames. Should correspond to the value used in the `CLAPProcessor` class. + Number of mel features used per frames. Should correspond to the value used in the `ClapProcessor` class. spec_size (`int`, *optional*, defaults to 256): Desired input size of the spectrogram that the model supports. It can be different from the output of the - `CLAPFeatureExtractor`, in which case the input features will be resized. Corresponds to the `image_size` + `ClapFeatureExtractor`, in which case the input features will be resized. Corresponds to the `image_size` of the audio models. hidden_act (`str`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. 
If string, `"gelu"`, @@ -236,13 +236,13 @@ class CLAPAudioConfig(PretrainedConfig): Example: ```python - >>> from transformers import CLAPAudioConfig, CLAPAudioModel + >>> from transformers import ClapAudioConfig, ClapAudioModel - >>> # Initializing a CLAPAudioConfig with laion-ai/base style configuration - >>> configuration = CLAPAudioConfig() + >>> # Initializing a ClapAudioConfig with laion-ai/base style configuration + >>> configuration = ClapAudioConfig() - >>> # Initializing a CLAPAudioModel (with random weights) from the laion-ai/base style configuration - >>> model = CLAPAudioModel(configuration) + >>> # Initializing a ClapAudioModel (with random weights) from the laion-ai/base style configuration + >>> model = ClapAudioModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config @@ -315,7 +315,7 @@ def __init__( def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - # get the audio config dict if we are loading from CLAPConfig + # get the audio config dict if we are loading from ClapConfig if config_dict.get("model_type") == "clap": config_dict = config_dict["audio_config"] @@ -328,9 +328,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], return cls.from_dict(config_dict, **kwargs) -class CLAPConfig(PretrainedConfig): +class ClapConfig(PretrainedConfig): r""" - [`CLAPConfig`] is the configuration class to store the configuration of a [`CLAPModel`]. It is used to instantiate + [`ClapConfig`] is the configuration class to store the configuration of a [`ClapModel`]. It is used to instantiate a Clap model according to the specified arguments, defining the text model and audio model configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the Clap [laion-ai/base](https://huggingface.co/laion-ai/base) architecture. @@ -340,9 +340,9 @@ class CLAPConfig(PretrainedConfig): Args: text_config (`dict`, *optional*): - Dictionary of configuration options used to initialize [`CLAPTextConfig`]. + Dictionary of configuration options used to initialize [`ClapTextConfig`]. audio_config (`dict`, *optional*): - Dictionary of configuration options used to initialize [`CLAPAudioConfig`]. + Dictionary of configuration options used to initialize [`ClapAudioConfig`]. projection_dim (`int`, *optional*, defaults to 512): Dimentionality of text and audio projection layers. 
logit_scale_init_value (`float`, *optional*, defaults to 2.6592): @@ -361,25 +361,25 @@ class CLAPConfig(PretrainedConfig): Example: ```python - >>> from transformers import CLAPConfig, CLAPModel + >>> from transformers import ClapConfig, ClapModel - >>> # Initializing a CLAPConfig with laion-ai/base style configuration - >>> configuration = CLAPConfig() + >>> # Initializing a ClapConfig with laion-ai/base style configuration + >>> configuration = ClapConfig() - >>> # Initializing a CLAPModel (with random weights) from the laion-ai/base style configuration - >>> model = CLAPModel(configuration) + >>> # Initializing a ClapModel (with random weights) from the laion-ai/base style configuration + >>> model = ClapModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config - >>> # We can also initialize a CLAPConfig from a CLAPTextConfig and a CLAPAudioConfig - >>> from transformers import CLAPTextConfig, CLAPAudioConfig + >>> # We can also initialize a ClapConfig from a ClapTextConfig and a ClapAudioConfig + >>> from transformers import ClapTextConfig, ClapAudioConfig - >>> # Initializing a CLAPText and CLAPAudioConfig configuration - >>> config_text = CLAPTextConfig() - >>> config_audio = CLAPAudioConfig() + >>> # Initializing a ClapText and ClapAudioConfig configuration + >>> config_text = ClapTextConfig() + >>> config_audio = ClapAudioConfig() - >>> config = CLAPConfig.from_text_audio_configs(config_text, config_audio) + >>> config = ClapConfig.from_text_audio_configs(config_text, config_audio) ```""" model_type = "clap" @@ -408,14 +408,14 @@ def __init__( if text_config is None: text_config = {} - logger.info("text_config is None. Initializing the CLAPTextConfig with default values.") + logger.info("text_config is None. Initializing the ClapTextConfig with default values.") if audio_config is None: audio_config = {} - logger.info("audio_config is None. initializing the CLAPAudioConfig with default values.") + logger.info("audio_config is None. initializing the ClapAudioConfig with default values.") - self.text_config = CLAPTextConfig(**text_config) - self.audio_config = CLAPAudioConfig(**audio_config) + self.text_config = ClapTextConfig(**text_config) + self.audio_config = ClapAudioConfig(**audio_config) self.text_config.fusion_num_hidden_layers = fusion_num_hidden_layers self.audio_config.fusion_num_hidden_layers = fusion_num_hidden_layers @@ -435,13 +435,13 @@ def __init__( self.num_hidden_layers = self.text_config.num_hidden_layers + len(self.audio_config.depths) @classmethod - def from_text_audio_configs(cls, text_config: CLAPTextConfig, audio_config: CLAPAudioConfig, **kwargs): + def from_text_audio_configs(cls, text_config: ClapTextConfig, audio_config: ClapAudioConfig, **kwargs): r""" - Instantiate a [`CLAPConfig`] (or a derived class) from clap text model configuration and clap audio model + Instantiate a [`ClapConfig`] (or a derived class) from clap text model configuration and clap audio model configuration. 
Returns: - [`CLAPConfig`]: An instance of a configuration object + [`ClapConfig`]: An instance of a configuration object """ return cls(text_config=text_config.to_dict(), audio_config=audio_config.to_dict(), **kwargs) diff --git a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py b/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py index 62848f4615d0..979d18c71a91 100644 --- a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py +++ b/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py @@ -19,7 +19,7 @@ import torch from Clap import create_model -from transformers import AutoFeatureExtractor, CLAPConfig, CLAPModel +from transformers import AutoFeatureExtractor, ClapConfig, ClapModel KEYS_TO_MODIFY_MAPPING = { @@ -101,9 +101,9 @@ def convert_clap_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_pa state_dict = clap_model.state_dict() state_dict = rename_state_dict(state_dict) - transformers_config = CLAPConfig() + transformers_config = ClapConfig() transformers_config.audio_config.enable_fusion = enable_fusion - model = CLAPModel(transformers_config) + model = ClapModel(transformers_config) model.load_state_dict(state_dict, strict=False) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 2611eee58c0e..7695f575d103 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -29,11 +29,11 @@ logger = logging.get_logger(__name__) -class CLAPFeatureExtractor(SequenceFeatureExtractor): +class ClapFeatureExtractor(SequenceFeatureExtractor): r""" Constructs a Clap feature extractor. - This feature extractor inherits from [`CLAPFeatureExtractor`] which contains most of the main methods. Users should + This feature extractor inherits from [`ClapFeatureExtractor`] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index a017f19fa880..42cdb39a4fc2 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -38,14 +38,14 @@ logging, replace_return_docstrings, ) -from .configuration_clap import CLAPAudioConfig, CLAPConfig, CLAPTextConfig +from .configuration_clap import ClapAudioConfig, ClapConfig, ClapTextConfig logger = logging.get_logger(__name__) _CHECKPOINT_FOR_DOC = "laion-ai/clap-htsat-fused" -CLAP_PRETRAINED_MODEL_ARCHIVE_LIST = [ +Clap_PRETRAINED_MODEL_ARCHIVE_LIST = [ "laion-ai/clap-htsat-fused", "laion-ai/clap-htsat-unfused", # See all clap models at https://huggingface.co/models?filter=clap @@ -157,7 +157,7 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: @dataclass # Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->Clap -class CLAPTextModelOutput(ModelOutput): +class ClapTextModelOutput(ModelOutput): """ Base class for text model's outputs that also contains a pooling of the last hidden states. @@ -186,9 +186,9 @@ class CLAPTextModelOutput(ModelOutput): @dataclass -class CLAPAudioModelOutput(ModelOutput): +class ClapAudioModelOutput(ModelOutput): """ - CLAPAudio model output to mimic the output of the original implementation. 
+ ClapAudio model output to mimic the output of the original implementation. Args: framewise_output (`torch.FloatTensor` of shape `(batch_size, num_frames, hidden_size)`): @@ -221,9 +221,9 @@ class CLAPAudioModelOutput(ModelOutput): @dataclass -class CLAPAudioModelOutputWithProjection(ModelOutput): +class ClapAudioModelOutputWithProjection(ModelOutput): """ - CLAPAudio model output to mimic the output of the original implementation. + ClapAudio model output to mimic the output of the original implementation. Args: audio_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`): @@ -260,7 +260,7 @@ class CLAPAudioModelOutputWithProjection(ModelOutput): @dataclass # Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->Clap, vision->audio, Vision->Audio, image->audio -class CLAPOutput(ModelOutput): +class ClapOutput(ModelOutput): """ Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): @@ -272,13 +272,13 @@ class CLAPOutput(ModelOutput): The scaled dot product scores between `text_embeds` and `audio_embeds`. This represents the text-audio similarity scores. text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): - The text embeddings obtained by applying the projection layer to the pooled output of [`CLAPTextModel`]. + The text embeddings obtained by applying the projection layer to the pooled output of [`ClapTextModel`]. audio_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): - The audio embeddings obtained by applying the projection layer to the pooled output of [`CLAPAudioModel`]. + The audio embeddings obtained by applying the projection layer to the pooled output of [`ClapAudioModel`]. text_model_output(`BaseModelOutputWithPooling`): - The output of the [`CLAPTextModel`]. + The output of the [`ClapTextModel`]. audio_model_output(`BaseModelOutputWithPooling`): - The output of the [`CLAPAudioModel`]. + The output of the [`ClapAudioModel`]. """ loss: Optional[torch.FloatTensor] = None @@ -297,7 +297,7 @@ def to_tuple(self) -> Tuple[Any]: # Adapted from transformers.models.swin.modeling_swin.SwinDropPath -class CLAPDropPath(nn.Module): +class ClapDropPath(nn.Module): """ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is a slightly refactored version of the `SwinDropPath` implementation. @@ -322,12 +322,12 @@ def forward(self, hidden_states): # Adapted from https://github.com/LAION-AI/Clap/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/feature_fusion.py#L133 -class CLAPAudioAFFBlock(nn.Module): +class ClapAudioAFFBlock(nn.Module): r""" AFF Block from Clap, since in Clap we are always in 2D mode, it is not needed to implement the 1D version. """ - def __init__(self, config: CLAPAudioConfig): + def __init__(self, config: ClapAudioConfig): super().__init__() channels = config.patch_embeds_hidden_size downsize_ratio = config.aff_block_r @@ -361,13 +361,13 @@ def forward(self, hidden_states, residual): return output -class CLAPAudioPatchEmbed(nn.Module): +class ClapAudioPatchEmbed(nn.Module): """ This module converts the hidden states reshaped as an image to patch embeddings ready to be passed to the Transformer block. 
""" - def __init__(self, config: CLAPAudioConfig): + def __init__(self, config: ClapAudioConfig): super().__init__() img_size = (config.spec_size, config.spec_size) if isinstance(config.spec_size, int) else config.spec_size patch_size = ( @@ -400,7 +400,7 @@ def __init__(self, config: CLAPAudioConfig): self.norm = nn.LayerNorm(config.patch_embeds_hidden_size) if config.enable_patch_layer_norm else nn.Identity() if self.enable_fusion: - self.fusion_model = CLAPAudioAFFBlock(config) + self.fusion_model = ClapAudioAFFBlock(config) self.mel_conv2d = nn.Conv2d( config.patch_embed_input_channels, config.patch_embeds_hidden_size, @@ -475,8 +475,8 @@ def forward(self, hidden_states, is_longer_idx=None): return hidden_states -# Copied from transformers.models.swin.modeling_swin.SwinSelfAttention with Swin->CLAPAudio -class CLAPAudioSelfAttention(nn.Module): +# Copied from transformers.models.swin.modeling_swin.SwinSelfAttention with Swin->ClapAudio +class ClapAudioSelfAttention(nn.Module): def __init__(self, config, dim, num_heads, window_size): super().__init__() if dim % num_heads != 0: @@ -547,7 +547,7 @@ def forward( attention_scores = attention_scores + relative_position_bias.unsqueeze(0) if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in CLAPAudioModel forward() function) + # Apply the attention mask is (precomputed for all layers in ClapAudioModel forward() function) mask_shape = attention_mask.shape[0] attention_scores = attention_scores.view( batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim @@ -576,8 +576,8 @@ def forward( return outputs -# Copied from transformers.models.swin.modeling_swin.SwinSelfOutput with Swin->CLAPAudio -class CLAPAudioSelfOutput(nn.Module): +# Copied from transformers.models.swin.modeling_swin.SwinSelfOutput with Swin->ClapAudio +class ClapAudioSelfOutput(nn.Module): def __init__(self, config, dim): super().__init__() self.dense = nn.Linear(dim, dim) @@ -590,12 +590,12 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.swin.modeling_swin.SwinAttention with Swin->CLAPAudio -class CLAPAudioAttention(nn.Module): +# Copied from transformers.models.swin.modeling_swin.SwinAttention with Swin->ClapAudio +class ClapAudioAttention(nn.Module): def __init__(self, config, dim, num_heads, window_size): super().__init__() - self.self = CLAPAudioSelfAttention(config, dim, num_heads, window_size) - self.output = CLAPAudioSelfOutput(config, dim) + self.self = ClapAudioSelfAttention(config, dim, num_heads, window_size) + self.output = ClapAudioSelfOutput(config, dim) self.pruned_heads = set() def prune_heads(self, heads): @@ -629,8 +629,8 @@ def forward( return outputs -# Copied from transformers.models.swin.modeling_swin.SwinIntermediate with Swin->CLAPAudio -class CLAPAudioIntermediate(nn.Module): +# Copied from transformers.models.swin.modeling_swin.SwinIntermediate with Swin->ClapAudio +class ClapAudioIntermediate(nn.Module): def __init__(self, config, dim): super().__init__() self.dense = nn.Linear(dim, int(config.mlp_ratio * dim)) @@ -645,8 +645,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states -# Copied from transformers.models.swin.modeling_swin.SwinOutput with Swin->CLAPAudio -class CLAPAudioOutput(nn.Module): +# Copied from transformers.models.swin.modeling_swin.SwinOutput with Swin->ClapAudio +class ClapAudioOutput(nn.Module): def __init__(self, config, dim): super().__init__() 
self.dense = nn.Linear(int(config.mlp_ratio * dim), dim) @@ -658,8 +658,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states -# Copied from transformers.models.swin.modeling_swin.SwinLayer with SwinDropPath->CLAPDropPath, Swin->CLAPAudio -class CLAPAudioLayer(nn.Module): +# Copied from transformers.models.swin.modeling_swin.SwinLayer with SwinDropPath->ClapDropPath, Swin->ClapAudio +class ClapAudioLayer(nn.Module): def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward @@ -667,11 +667,11 @@ def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): self.window_size = config.window_size self.input_resolution = input_resolution self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) - self.attention = CLAPAudioAttention(config, dim, num_heads, window_size=self.window_size) - self.drop_path = CLAPDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + self.attention = ClapAudioAttention(config, dim, num_heads, window_size=self.window_size) + self.drop_path = ClapDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) - self.intermediate = CLAPAudioIntermediate(config, dim) - self.output = CLAPAudioOutput(config, dim) + self.intermediate = ClapAudioIntermediate(config, dim) + self.output = ClapAudioOutput(config, dim) def set_shift_and_window_size(self, input_resolution): if min(input_resolution) <= self.window_size: @@ -782,15 +782,15 @@ def forward( return layer_outputs -# Copied from transformers.models.swin.modeling_swin.SwinStage with Swin->CLAPAudio -class CLAPAudioStage(nn.Module): +# Copied from transformers.models.swin.modeling_swin.SwinStage with Swin->ClapAudio +class ClapAudioStage(nn.Module): def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample): super().__init__() self.config = config self.dim = dim self.blocks = nn.ModuleList( [ - CLAPAudioLayer( + ClapAudioLayer( config=config, dim=dim, input_resolution=input_resolution, @@ -842,8 +842,8 @@ def forward( return stage_outputs -# Copied from transformers.models.swin.modeling_swin.SwinPatchMerging with Swin->CLAPAudio -class CLAPAudioPatchMerging(nn.Module): +# Copied from transformers.models.swin.modeling_swin.SwinPatchMerging with Swin->ClapAudio +class ClapAudioPatchMerging(nn.Module): """ Patch Merging Layer. 
@@ -897,13 +897,13 @@ def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int] return input_feature -class CLAPAudioEncoder(nn.Module): +class ClapAudioEncoder(nn.Module): def __init__(self, config): super().__init__() self.num_layers = len(config.depths) self.config = config - self.patch_embed = CLAPAudioPatchEmbed(config) + self.patch_embed = ClapAudioPatchEmbed(config) self.enable_fusion = config.enable_fusion grid_size = self.patch_embed.grid_size self.patch_stride = self.patch_embed.patch_stride @@ -919,14 +919,14 @@ def __init__(self, config): self.layers = nn.ModuleList( [ - CLAPAudioStage( + ClapAudioStage( config=config, dim=int(config.hidden_size * 2**i_layer), input_resolution=self.input_resolutions[i_layer], depth=config.depths[i_layer], num_heads=config.num_attention_heads[i_layer], drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], - downsample=CLAPAudioPatchMerging if (i_layer < self.num_layers - 1) else None, + downsample=ClapAudioPatchMerging if (i_layer < self.num_layers - 1) else None, ) for i_layer in range(self.num_layers) ] @@ -953,7 +953,7 @@ def __init__(self, config): def reshape_mel2img(self, normalixed_input_features): """ The input is 4 normalized log mel spectrograms. It is reshape to the common shape of images. Each channel - should represent 1 of the 4 crops of the spectrogram. For more details, refer to the `CLAPFeatureExtracor`. + should represent 1 of the 4 crops of the spectrogram. For more details, refer to the `ClapFeatureExtracor`. """ _, _, time_steps, freq_steps = normalixed_input_features.shape @@ -1008,7 +1008,7 @@ def forward( output_hidden_states_before_downsampling: Optional[bool] = False, always_partition: Optional[bool] = False, return_dict: Optional[bool] = True, - ) -> Union[Tuple, CLAPAudioModelOutput]: + ) -> Union[Tuple, ClapAudioModelOutput]: input_features = input_features.transpose(1, 3) normalixed_input_features = self.bn0(input_features) normalixed_input_features = normalixed_input_features.transpose(1, 3) @@ -1134,7 +1134,7 @@ def custom_forward(*inputs): all_reshaped_hidden_states, ) - return CLAPAudioModelOutput( + return ClapAudioModelOutput( framewise_output=framewise_output, clipwise_output=torch.sigmoid(hidden_states), fine_grained_embedding=fine_grained_latent_output, @@ -1144,7 +1144,7 @@ def custom_forward(*inputs): ) -CLAP_START_DOCSTRING = r""" +Clap_START_DOCSTRING = r""" This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) @@ -1154,12 +1154,12 @@ def custom_forward(*inputs): and behavior. Parameters: - config ([`CLAPConfig`]): Model configuration class with all the parameters of the model. + config ([`ClapConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ -CLAP_TEXT_INPUTS_DOCSTRING = r""" +Clap_TEXT_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide @@ -1191,11 +1191,11 @@ def custom_forward(*inputs): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" -CLAP_AUDIO_INPUTS_DOCSTRING = r""" +Clap_AUDIO_INPUTS_DOCSTRING = r""" Args: input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Input audio features. This should be returnes by the [`CLAPFeatureExtractor`] class that you can also - retrieve from [`AutoFeatureExtractor`]. See [`CLAPFeatureExtractor.__call__`] for details. + Input audio features. This should be returnes by the [`ClapFeatureExtractor`] class that you can also + retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. @@ -1206,7 +1206,7 @@ def custom_forward(*inputs): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ -CLAP_INPUTS_DOCSTRING = r""" +Clap_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide @@ -1229,8 +1229,8 @@ def custom_forward(*inputs): [What are position IDs?](../glossary#position-ids) input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Input audio features. This should be returnes by the [`CLAPFeatureExtractor`] class that you can also - retrieve from [`AutoFeatureExtractor`]. See [`CLAPFeatureExtractor.__call__`] for details. + Input audio features. This should be returnes by the [`ClapFeatureExtractor`] class that you can also + retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details. return_loss (`bool`, *optional*): Whether or not to return the contrastive loss. output_attentions (`bool`, *optional*): @@ -1244,8 +1244,8 @@ def custom_forward(*inputs): """ -class CLAPFusionBlock(nn.Module): - def __init__(self, config: CLAPTextConfig): +class ClapFusionBlock(nn.Module): + def __init__(self, config: ClapTextConfig): super().__init__() self.config = config hidden_size = config.projection_dim @@ -1261,8 +1261,8 @@ def forward(self, hidden_states): return hidden_states -class CLAPProjectionLayer(nn.Module): - def __init__(self, config: CLAPAudioConfig): +class ClapProjectionLayer(nn.Module): + def __init__(self, config: ClapAudioConfig): super().__init__() self.config = config hidden_size = config.projection_hidden_size @@ -1279,12 +1279,12 @@ def forward(self, hidden_states): return hidden_states -class CLAPFusionLayer(nn.Module): - def __init__(self, config: CLAPTextConfig): +class ClapFusionLayer(nn.Module): + def __init__(self, config: ClapTextConfig): super().__init__() self.config = config - self.layers = nn.ModuleList([CLAPFusionBlock(config) for _ in range(config.fusion_num_hidden_layers)]) + self.layers = nn.ModuleList([ClapFusionBlock(config) for _ in range(config.fusion_num_hidden_layers)]) def forward(self, hidden_states): for layer in self.layers: @@ -1292,8 +1292,8 @@ def forward(self, hidden_states): return hidden_states -# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->CLAPText -class CLAPTextEmbeddings(nn.Module): +# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->ClapText +class ClapTextEmbeddings(nn.Module): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. 
""" @@ -1380,8 +1380,8 @@ def create_position_ids_from_inputs_embeds(self, inputs_embeds): return position_ids.unsqueeze(0).expand(input_shape) -# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->CLAPText -class CLAPTextSelfAttention(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->ClapText +class ClapTextSelfAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): @@ -1488,7 +1488,7 @@ def forward( attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in CLAPTextModel forward() function) + # Apply the attention mask is (precomputed for all layers in ClapTextModel forward() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. @@ -1516,7 +1516,7 @@ def forward( # Copied from transformers.models.bert.modeling_bert.BertSelfOutput -class CLAPTextSelfOutput(nn.Module): +class ClapTextSelfOutput(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) @@ -1530,12 +1530,12 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->CLAPText -class CLAPTextAttention(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->ClapText +class ClapTextAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = CLAPTextSelfAttention(config, position_embedding_type=position_embedding_type) - self.output = CLAPTextSelfOutput(config) + self.self = ClapTextSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = ClapTextSelfOutput(config) self.pruned_heads = set() def prune_heads(self, heads): @@ -1581,7 +1581,7 @@ def forward( # Copied from transformers.models.bert.modeling_bert.BertIntermediate -class CLAPTextIntermediate(nn.Module): +class ClapTextIntermediate(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.intermediate_size) @@ -1597,7 +1597,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # Copied from transformers.models.bert.modeling_bert.BertOutput -class CLAPTextOutput(nn.Module): +class ClapTextOutput(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) @@ -1611,21 +1611,21 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->CLAPText -class CLAPTextLayer(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->ClapText +class ClapTextLayer(nn.Module): def __init__(self, config): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 - self.attention = CLAPTextAttention(config) + self.attention = ClapTextAttention(config) self.is_decoder = config.is_decoder self.add_cross_attention = config.add_cross_attention if self.add_cross_attention: if not self.is_decoder: raise ValueError(f"{self} should be used as a decoder model if cross attention is added") - 
self.crossattention = CLAPTextAttention(config, position_embedding_type="absolute") - self.intermediate = CLAPTextIntermediate(config) - self.output = CLAPTextOutput(config) + self.crossattention = ClapTextAttention(config, position_embedding_type="absolute") + self.intermediate = ClapTextIntermediate(config) + self.output = ClapTextOutput(config) def forward( self, @@ -1698,12 +1698,12 @@ def feed_forward_chunk(self, attention_output): return layer_output -# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->CLAPText -class CLAPTextEncoder(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->ClapText +class ClapTextEncoder(nn.Module): def __init__(self, config): super().__init__() self.config = config - self.layer = nn.ModuleList([CLAPTextLayer(config) for _ in range(config.num_hidden_layers)]) + self.layer = nn.ModuleList([ClapTextLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False def forward( @@ -1796,7 +1796,7 @@ def custom_forward(*inputs): # Copied from transformers.models.bert.modeling_bert.BertPooler -class CLAPTextPooler(nn.Module): +class ClapTextPooler(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) @@ -1811,13 +1811,13 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return pooled_output -class CLAPPreTrainedModel(PreTrainedModel): +class ClapPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ - config_class = CLAPTextConfig + config_class = ClapTextConfig base_model_prefix = "clap" supports_gradient_checkpointing = False _keys_to_ignore_on_load_missing = [r"position_ids", r"logit_scale_a", r"logit_scale_t"] @@ -1826,11 +1826,11 @@ def _init_weights(self, module): """Initialize the weights""" factor = self.config.initializer_factor - if isinstance(module, CLAPTextEmbeddings): + if isinstance(module, ClapTextEmbeddings): module.word_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02) module.position_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02) module.token_type_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02) - elif isinstance(module, CLAPTextSelfAttention): + elif isinstance(module, ClapTextSelfAttention): in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor nn.init.normal_(module.query.weight, std=in_proj_std) nn.init.normal_(module.key.weight, std=in_proj_std) @@ -1838,36 +1838,36 @@ def _init_weights(self, module): elif isinstance( module, ( - CLAPTextSelfOutput, - CLAPTextOutput, - CLAPTextIntermediate, - CLAPTextPooler, - CLAPAudioSelfOutput, - CLAPAudioIntermediate, - CLAPAudioOutput, + ClapTextSelfOutput, + ClapTextOutput, + ClapTextIntermediate, + ClapTextPooler, + ClapAudioSelfOutput, + ClapAudioIntermediate, + ClapAudioOutput, ), ): in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor nn.init.normal_(module.dense.weight, std=in_proj_std) - elif isinstance(module, CLAPProjectionLayer): + elif isinstance(module, ClapProjectionLayer): in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor nn.init.normal_(module.linear1.weight, std=in_proj_std) nn.init.normal_(module.linear2.weight, std=in_proj_std) - elif isinstance(module, CLAPAudioPatchEmbed): + elif isinstance(module, ClapAudioPatchEmbed): 
in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor nn.init.normal_(module.proj.weight, std=in_proj_std) - elif isinstance(module, CLAPAudioSelfAttention): + elif isinstance(module, ClapAudioSelfAttention): in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor nn.init.normal_(module.query.weight, std=in_proj_std) nn.init.normal_(module.key.weight, std=in_proj_std) nn.init.normal_(module.value.weight, std=in_proj_std) - elif isinstance(module, CLAPAudioPatchMerging): + elif isinstance(module, ClapAudioPatchMerging): in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor nn.init.normal_(module.reduction.weight, std=in_proj_std) - elif isinstance(module, CLAPAudioEncoder): + elif isinstance(module, ClapAudioEncoder): in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor nn.init.normal_(module.head.weight, std=in_proj_std) - elif isinstance(module, CLAPFusionBlock): + elif isinstance(module, ClapFusionBlock): nn.init.normal_(module.linear.weight, std=factor * 0.02) if isinstance(module, nn.LayerNorm): @@ -1880,30 +1880,30 @@ def _init_weights(self, module): nn.init.normal_(module.weight, std=in_proj_std) if module.bias is not None: module.bias.data.zero_() - if isinstance(module, CLAPModel): + if isinstance(module, ClapModel): nn.init.normal_(module.logit_scale_a, std=factor * 0.02) nn.init.normal_(module.logit_scale_t, std=factor * 0.02) def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, CLAPTextEncoder): + if isinstance(module, ClapTextEncoder): module.gradient_checkpointing = value -class CLAPAudioModel(CLAPPreTrainedModel): - config_class = CLAPAudioConfig +class ClapAudioModel(ClapPreTrainedModel): + config_class = ClapAudioConfig main_input_name = "input_features" - def __init__(self, config: CLAPAudioConfig): + def __init__(self, config: ClapAudioConfig): super().__init__(config) - self.audio_encoder = CLAPAudioEncoder(config) + self.audio_encoder = ClapAudioEncoder(config) # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self) -> nn.Module: return self.audio_encoder.patch_embed.proj - @add_start_docstrings_to_model_forward(CLAP_AUDIO_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLAPAudioConfig) + @add_start_docstrings_to_model_forward(Clap_AUDIO_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=ClapAudioConfig) def forward( self, input_features: Optional[torch.FloatTensor] = None, @@ -1919,12 +1919,12 @@ def forward( ```python >>> from datasets import load_dataset - >>> from transformers import AutoProcessor, CLAPAudioModel + >>> from transformers import AutoProcessor, ClapAudioModel >>> dataset = load_dataset("ashraq/esc50") >>> audio_sample = dataset["train"]["audio"][0]["array"] - >>> model = CLAPAudioModel.from_pretrained("laionai/clap-hsat-fused") + >>> model = ClapAudioModel.from_pretrained("laionai/clap-hsat-fused") >>> processor = AutoProcessor.from_pretrained("laionai/clap-hsat-fused") >>> inputs = processor(audios=audio_sample, return_tensors="pt") @@ -1947,7 +1947,7 @@ def forward( ) -class CLAPTextModel(CLAPPreTrainedModel): +class ClapTextModel(ClapPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of @@ -1963,18 +1963,18 @@ 
class CLAPTextModel(CLAPPreTrainedModel): """ - config_class = CLAPTextConfig + config_class = ClapTextConfig _keys_to_ignore_on_load_missing = [r"position_ids"] - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->CLAPText + # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->ClapText def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config - self.embeddings = CLAPTextEmbeddings(config) - self.encoder = CLAPTextEncoder(config) + self.embeddings = ClapTextEmbeddings(config) + self.encoder = ClapTextEncoder(config) - self.pooler = CLAPTextPooler(config) if add_pooling_layer else None + self.pooler = ClapTextPooler(config) if add_pooling_layer else None # Initialize weights and apply final processing self.post_init() @@ -2116,22 +2116,22 @@ def forward( ) -@add_start_docstrings(CLAP_START_DOCSTRING) -class CLAPModel(CLAPPreTrainedModel): - config_class = CLAPConfig +@add_start_docstrings(Clap_START_DOCSTRING) +class ClapModel(ClapPreTrainedModel): + config_class = ClapConfig - def __init__(self, config: CLAPConfig): + def __init__(self, config: ClapConfig): super().__init__(config) - if not isinstance(config.text_config, CLAPTextConfig): + if not isinstance(config.text_config, ClapTextConfig): raise ValueError( - "config.text_config is expected to be of type CLAPTextConfig but is of type" + "config.text_config is expected to be of type ClapTextConfig but is of type" f" {type(config.text_config)}." ) - if not isinstance(config.audio_config, CLAPAudioConfig): + if not isinstance(config.audio_config, ClapAudioConfig): raise ValueError( - "config.audio_config is expected to be of type CLAPAudioConfig but is of type" + "config.audio_config is expected to be of type ClapAudioConfig but is of type" f" {type(config.audio_config)}." ) @@ -2145,18 +2145,18 @@ def __init__(self, config: CLAPConfig): self.text_hidden_size = text_config.hidden_size self.audio_hidden_size = audio_config.hidden_size - self.text_model = CLAPTextModel(text_config) - self.text_transform = CLAPFusionLayer(text_config) - self.text_projection = CLAPProjectionLayer(text_config) + self.text_model = ClapTextModel(text_config) + self.text_transform = ClapFusionLayer(text_config) + self.text_projection = ClapProjectionLayer(text_config) - self.audio_model = CLAPAudioModel(config=audio_config) - self.audio_transform = CLAPFusionLayer(audio_config) - self.audio_projection = CLAPProjectionLayer(audio_config) + self.audio_model = ClapAudioModel(config=audio_config) + self.audio_transform = ClapFusionLayer(audio_config) + self.audio_projection = ClapProjectionLayer(audio_config) # Initialize weights and apply final processing self.post_init() - @add_start_docstrings_to_model_forward(CLAP_TEXT_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(Clap_TEXT_INPUTS_DOCSTRING) def get_text_features( self, input_ids: Optional[torch.Tensor] = None, @@ -2169,14 +2169,14 @@ def get_text_features( r""" Returns: text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by - applying the projection layer to the pooled output of [`CLAPTextModel`]. + applying the projection layer to the pooled output of [`ClapTextModel`]. 
Examples: ```python - >>> from transformers import AutoTokenizer, CLAPModel + >>> from transformers import AutoTokenizer, ClapModel - >>> model = CLAPModel.from_pretrained("laion-ai/clap-htsat-unfused") + >>> model = ClapModel.from_pretrained("laion-ai/clap-htsat-unfused") >>> tokenizer = AutoTokenizer.from_pretrained("laion-ai/clap-htsat-unfused") >>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt") @@ -2204,7 +2204,7 @@ def get_text_features( return text_features - @add_start_docstrings_to_model_forward(CLAP_AUDIO_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(Clap_AUDIO_INPUTS_DOCSTRING) def get_audio_features( self, input_features: Optional[torch.Tensor] = None, @@ -2234,8 +2234,8 @@ def get_audio_features( return audio_features - @add_start_docstrings_to_model_forward(CLAP_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CLAPOutput, config_class=CLAPConfig) + @add_start_docstrings_to_model_forward(Clap_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=ClapOutput, config_class=ClapConfig) def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -2246,7 +2246,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple, CLAPOutput]: + ) -> Union[Tuple, ClapOutput]: r""" Returns: @@ -2254,12 +2254,12 @@ def forward( ```python >>> from dataset import load_dataset - >>> from transformers import AutoProcessor, CLAPModel + >>> from transformers import AutoProcessor, ClapModel >>> dataset = load_dataset("ashraq/esc50") >>> audio_sample = dataset["train"]["audio"][0]["array"] - >>> model = CLAPModel.from_pretrained("laion-ai/clap-htst-unfused-base") + >>> model = ClapModel.from_pretrained("laion-ai/clap-htst-unfused-base") >>> processor = AutoProcessor.from_pretrained("laion-ai/clap-htst-unfused-base") >>> input_text = ["Sound of a dog", "Sound of vaccum cleaner"] @@ -2319,7 +2319,7 @@ def forward( output = (logits_per_audio, logits_per_text, text_embeds, audio_embeds, text_outputs, audio_outputs) return ((loss,) + output) if loss is not None else output - return CLAPOutput( + return ClapOutput( loss=loss, logits_per_audio=logits_per_audio, logits_per_text=logits_per_text, @@ -2334,15 +2334,15 @@ def forward( """ Clap Text Model with a projection layer on top (a linear layer on top of the pooled output). 
""", - CLAP_START_DOCSTRING, + Clap_START_DOCSTRING, ) -class CLAPTextModelWithProjection(CLAPPreTrainedModel): - config_class = CLAPTextConfig +class ClapTextModelWithProjection(ClapPreTrainedModel): + config_class = ClapTextConfig - def __init__(self, config: CLAPTextConfig): + def __init__(self, config: ClapTextConfig): super().__init__(config) - self.text_model = CLAPTextModel(config) - self.text_projection = CLAPProjectionLayer(config) + self.text_model = ClapTextModel(config) + self.text_projection = ClapProjectionLayer(config) # Initialize weights and apply final processing self.post_init() @@ -2352,8 +2352,8 @@ def get_input_embeddings(self) -> nn.Module: def set_input_embeddings(self, value): self.text_model.embeddings.word_embeddings = value - @add_start_docstrings_to_model_forward(CLAP_TEXT_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CLAPTextModelOutput, config_class=CLAPTextConfig) + @add_start_docstrings_to_model_forward(Clap_TEXT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=ClapTextModelOutput, config_class=ClapTextConfig) def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -2362,16 +2362,16 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple, CLAPTextModelOutput]: + ) -> Union[Tuple, ClapTextModelOutput]: r""" Returns: Examples: ```python - >>> from transformers import AutoTokenizer, CLAPTextModelWithProjection + >>> from transformers import AutoTokenizer, ClapTextModelWithProjection - >>> model = CLAPTextModelWithProjection.from_pretrained("laion-ai/clap-htsat-unfused") + >>> model = ClapTextModelWithProjection.from_pretrained("laion-ai/clap-htsat-unfused") >>> tokenizer = AutoTokenizer.from_pretrained("laion-ai/clap-htsat-unfused") >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") @@ -2398,7 +2398,7 @@ def forward( outputs = (text_embeds, text_outputs[0]) + text_outputs[2:] return tuple(output for output in outputs if output is not None) - return CLAPTextModelOutput( + return ClapTextModelOutput( text_embeds=text_embeds, last_hidden_state=text_outputs.last_hidden_state, hidden_states=text_outputs.hidden_states, @@ -2410,33 +2410,33 @@ def forward( """ Clap Audio Model with a projection layer on top (a linear layer on top of the pooled output). 
""", - CLAP_START_DOCSTRING, + Clap_START_DOCSTRING, ) -class CLAPAudioModelWithProjection(CLAPPreTrainedModel): - config_class = CLAPAudioConfig +class ClapAudioModelWithProjection(ClapPreTrainedModel): + config_class = ClapAudioConfig main_input_name = "input_features" - def __init__(self, config: CLAPAudioConfig): + def __init__(self, config: ClapAudioConfig): super().__init__(config) - self.audio_model = CLAPAudioModel(config) + self.audio_model = ClapAudioModel(config) - self.audio_projection = CLAPProjectionLayer(config) + self.audio_projection = ClapProjectionLayer(config) # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self) -> nn.Module: return self.audio_model.audio_encoder.patch_embed.proj - @add_start_docstrings_to_model_forward(CLAP_AUDIO_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CLAPAudioModelOutput, config_class=CLAPAudioConfig) + @add_start_docstrings_to_model_forward(Clap_AUDIO_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=ClapAudioModelOutput, config_class=ClapAudioConfig) def forward( self, input_features: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple, CLAPAudioModelOutput]: + ) -> Union[Tuple, ClapAudioModelOutput]: r""" Returns: @@ -2444,10 +2444,10 @@ def forward( ```python >>> from datasets import load_dataset - >>> from transformers import CLAPAudioModelWithProjection, CLAPProcessor + >>> from transformers import ClapAudioModelWithProjection, ClapProcessor - >>> model = CLAPAudioModelWithProjection.from_pretrained("laion-ai/clap-htsat-unfused") - >>> processor = CLAPProcessor.from_pretrained("laion-ai/clap-htsat-unfused") + >>> model = ClapAudioModelWithProjection.from_pretrained("laion-ai/clap-htsat-unfused") + >>> processor = ClapProcessor.from_pretrained("laion-ai/clap-htsat-unfused") >>> dataset = load_dataset("ashraq/esc50") >>> audio_sample = dataset["train"]["audio"][0]["array"] @@ -2477,7 +2477,7 @@ def forward( outputs = (audio_embeds, *audio_outputs) return outputs - return CLAPAudioModelOutputWithProjection( + return ClapAudioModelOutputWithProjection( audio_embeds=audio_embeds, framewise_output=audio_outputs.framewise_output, clipwise_output=audio_outputs.clipwise_output, diff --git a/src/transformers/models/clap/processing_clap.py b/src/transformers/models/clap/processing_clap.py index 755147af2184..36d9ff24ea77 100644 --- a/src/transformers/models/clap/processing_clap.py +++ b/src/transformers/models/clap/processing_clap.py @@ -20,20 +20,20 @@ from ...tokenization_utils_base import BatchEncoding -class CLAPProcessor(ProcessorMixin): +class ClapProcessor(ProcessorMixin): r""" Constructs a Clap processor which wraps a Clap feature extractor and a Clap tokenizer into a single processor. - [`CLAPProcessor`] offers all the functionalities of [`CLAPFeatureExtractor`] and [`CLAPTokenizerFast`]. See the - [`~CLAPProcessor.__call__`] and [`~CLAPProcessor.decode`] for more information. + [`ClapProcessor`] offers all the functionalities of [`ClapFeatureExtractor`] and [`ClapTokenizerFast`]. See the + [`~ClapProcessor.__call__`] and [`~ClapProcessor.decode`] for more information. Args: - feature_extractor ([`CLAPFeatureExtractor`]): + feature_extractor ([`ClapFeatureExtractor`]): The audio processor is a required input. - tokenizer ([`CLAPTokenizerFast`]): + tokenizer ([`ClapTokenizerFast`]): The tokenizer is a required input. 
""" - feature_extractor_class = "CLAPFeatureExtractor" + feature_extractor_class = "ClapFeatureExtractor" tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast") def __init__(self, feature_extractor, tokenizer): @@ -42,9 +42,9 @@ def __init__(self, feature_extractor, tokenizer): def __call__(self, text=None, audios=None, return_tensors=None, **kwargs): """ Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `text` - and `kwargs` arguments to CLAPTokenizerFast's [`~CLAPTokenizerFast.__call__`] if `text` is not `None` to encode + and `kwargs` arguments to ClapTokenizerFast's [`~ClapTokenizerFast.__call__`] if `text` is not `None` to encode the text. To prepare the audio(s), this method forwards the `audios` and `kwrags` arguments to - CLAPFeatureExtractor's [`~CLAPFeatureExtractor.__call__`] if `audios` is not `None`. Please refer to the + ClapFeatureExtractor's [`~ClapFeatureExtractor.__call__`] if `audios` is not `None`. Please refer to the doctsring of the above two methods for more information. Args: @@ -97,14 +97,14 @@ def __call__(self, text=None, audios=None, return_tensors=None, **kwargs): def batch_decode(self, *args, **kwargs): """ - This method forwards all its arguments to CLAPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + This method forwards all its arguments to ClapTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of this method for more information. """ return self.tokenizer.batch_decode(*args, **kwargs) def decode(self, *args, **kwargs): """ - This method forwards all its arguments to CLAPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + This method forwards all its arguments to ClapTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to the docstring of this method for more information. """ return self.tokenizer.decode(*args, **kwargs) diff --git a/src/transformers/pipelines/zero_shot_audio_classification.py b/src/transformers/pipelines/zero_shot_audio_classification.py index aa310dd1c4f3..7ac9eeb6b8c8 100644 --- a/src/transformers/pipelines/zero_shot_audio_classification.py +++ b/src/transformers/pipelines/zero_shot_audio_classification.py @@ -22,7 +22,7 @@ @add_end_docstrings(PIPELINE_INIT_ARGS) class ZeroShotAudioClassificationPipeline(ChunkPipeline): """ - Zero shot audio classification pipeline using `CLAPModel`. This pipeline predicts the class of an audio when you + Zero shot audio classification pipeline using `ClapModel`. This pipeline predicts the class of an audio when you provide an audio and a set of `candidate_labels`. 
Example: diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 6c4401e42710..7470ec76ffc1 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1464,45 +1464,45 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -CLAP_PRETRAINED_MODEL_ARCHIVE_LIST = None +Clap_PRETRAINED_MODEL_ARCHIVE_LIST = None -class CLAPAudioModel(metaclass=DummyObject): +class ClapAudioModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class CLAPAudioModelWithProjection(metaclass=DummyObject): +class ClapAudioModelWithProjection(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class CLAPModel(metaclass=DummyObject): +class ClapModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class CLAPPreTrainedModel(metaclass=DummyObject): +class ClapPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class CLAPTextModel(metaclass=DummyObject): +class ClapTextModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class CLAPTextModelWithProjection(metaclass=DummyObject): +class ClapTextModelWithProjection(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): diff --git a/tests/models/clap/test_feature_extraction_clap.py b/tests/models/clap/test_feature_extraction_clap.py index 0b7f3896758b..307d5b2a28b6 100644 --- a/tests/models/clap/test_feature_extraction_clap.py +++ b/tests/models/clap/test_feature_extraction_clap.py @@ -20,7 +20,7 @@ import numpy as np -from transformers import CLAPFeatureExtractor +from transformers import ClapFeatureExtractor from transformers.testing_utils import require_torch, require_torchaudio from transformers.utils.import_utils import is_torch_available @@ -51,7 +51,7 @@ def floats_list(shape, scale=1.0, rng=None, name=None): @require_torch @require_torchaudio # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTester with Whisper->Clap -class CLAPFeatureExtractionTester(unittest.TestCase): +class ClapFeatureExtractionTester(unittest.TestCase): def __init__( self, parent, @@ -110,11 +110,11 @@ def _flatten(list_of_lists): @require_torch @require_torchaudio # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest with Whisper->Clap -class CLAPFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = CLAPFeatureExtractor +class ClapFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): + feature_extraction_class = ClapFeatureExtractor def setUp(self): - self.feat_extract_tester = CLAPFeatureExtractionTester(self) + self.feat_extract_tester = ClapFeatureExtractionTester(self) def test_call(self): # Tests that all call wrap to encode_plus and batch_encode_plus @@ -206,7 +206,7 @@ def integration_test_fusion(self): # fmt: on MEL_BIN = [963, 963, 161] input_speech = self._load_datasamples(1) - feaure_extractor = CLAPFeatureExtractor() + feaure_extractor = ClapFeatureExtractor() for padding, EXPECTED_VALUES, idx_in_mel in zip( ["repeat", "repeatpad", None], EXPECTED_INPUT_FEATURES, MEL_BIN ): @@ -259,7 +259,7 @@ def 
integration_test_rand_trunc(self): # fmt: on input_speech = self._load_datasamples(1) - feaure_extractor = CLAPFeatureExtractor() + feaure_extractor = ClapFeatureExtractor() for padding, EXPECTED_VALUES in zip(["repeat", "repeatpad", None], EXPECTED_INPUT_FEATURES): input_features = feaure_extractor( input_speech, return_tensors="pt", truncation="rand_trunc", padding=padding diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 5b5708f4a039..58d69a604bd6 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -23,7 +23,7 @@ import numpy as np from datasets import load_dataset -from transformers import CLAPAudioConfig, CLAPConfig, CLAPProcessor, CLAPTextConfig +from transformers import ClapAudioConfig, ClapConfig, ClapProcessor, ClapTextConfig from transformers.testing_utils import require_torch, slow, torch_device from transformers.utils import is_torch_available @@ -42,16 +42,16 @@ from torch import nn from transformers import ( - CLAPAudioModel, - CLAPAudioModelWithProjection, - CLAPModel, - CLAPTextModel, - CLAPTextModelWithProjection, + ClapAudioModel, + ClapAudioModelWithProjection, + ClapModel, + ClapTextModel, + ClapTextModelWithProjection, ) - from transformers.models.clap.modeling_clap import CLAP_PRETRAINED_MODEL_ARCHIVE_LIST + from transformers.models.clap.modeling_clap import Clap_PRETRAINED_MODEL_ARCHIVE_LIST -class CLAPAudioModelTester: +class ClapAudioModelTester: def __init__( self, parent, @@ -110,7 +110,7 @@ def prepare_config_and_inputs(self): return config, input_features def get_config(self): - return CLAPAudioConfig( + return ClapAudioConfig( image_size=self.image_size, patch_size=self.patch_size, num_mel_bins=self.num_mel_bins, @@ -132,7 +132,7 @@ def get_config(self): ) def create_and_check_model(self, config, input_features): - model = CLAPAudioModel(config=config) + model = ClapAudioModel(config=config) model.to(torch_device) model.eval() with torch.no_grad(): @@ -144,7 +144,7 @@ def create_and_check_model(self, config, input_features): ) def create_and_check_model_with_projection(self, config, input_features): - model = CLAPAudioModelWithProjection(config=config) + model = ClapAudioModelWithProjection(config=config) model.to(torch_device) model.eval() with torch.no_grad(): @@ -159,26 +159,26 @@ def prepare_config_and_inputs_for_common(self): @require_torch -class CLAPAudioModelTest(ModelTesterMixin, unittest.TestCase): +class ClapAudioModelTest(ModelTesterMixin, unittest.TestCase): """ Here we also overwrite some of the tests of test_modeling_common.py, as Clap does not use input_ids, inputs_embeds, attention_mask and seq_length. 
""" - all_model_classes = (CLAPAudioModel, CLAPAudioModelWithProjection) if is_torch_available() else () + all_model_classes = (ClapAudioModel, ClapAudioModelWithProjection) if is_torch_available() else () fx_compatible = False test_pruning = False test_resize_embeddings = False test_head_masking = False def setUp(self): - self.model_tester = CLAPAudioModelTester(self) - self.config_tester = ConfigTester(self, config_class=CLAPAudioConfig, has_text_modality=False, hidden_size=37) + self.model_tester = ClapAudioModelTester(self) + self.config_tester = ConfigTester(self, config_class=ClapAudioConfig, has_text_modality=False, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() - @unittest.skip(reason="CLAPAudioModel does not use inputs_embeds") + @unittest.skip(reason="ClapAudioModel does not use inputs_embeds") def test_inputs_embeds(self): pass @@ -224,7 +224,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) - @unittest.skip(reason="CLAPAudioModel does not output any loss term in the forward pass") + @unittest.skip(reason="ClapAudioModel does not output any loss term in the forward pass") def test_retain_grad_hidden_states_attentions(self): pass @@ -248,37 +248,37 @@ def test_model_with_projection(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_with_projection(*config_and_inputs) - @unittest.skip(reason="CLAPAudioModel does not output any loss term in the forward pass") + @unittest.skip(reason="ClapAudioModel does not output any loss term in the forward pass") def test_training(self): pass - @unittest.skip(reason="CLAPAudioModel does not output any loss term in the forward pass") + @unittest.skip(reason="ClapAudioModel does not output any loss term in the forward pass") def test_training_gradient_checkpointing(self): pass - @unittest.skip(reason="CLAPAudioModel has no base class and is not available in MODEL_MAPPING") + @unittest.skip(reason="ClapAudioModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_from_base(self): pass - @unittest.skip(reason="CLAPAudioModel has no base class and is not available in MODEL_MAPPING") + @unittest.skip(reason="ClapAudioModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_to_base(self): pass @slow def test_model_from_pretrained(self): - for model_name in CLAP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = CLAPAudioModel.from_pretrained(model_name) + for model_name in Clap_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = ClapAudioModel.from_pretrained(model_name) self.assertIsNotNone(model) @slow def test_model_with_projection_from_pretrained(self): - for model_name in CLAP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = CLAPAudioModelWithProjection.from_pretrained(model_name) + for model_name in Clap_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = ClapAudioModelWithProjection.from_pretrained(model_name) self.assertIsNotNone(model) self.assertTrue(hasattr(model, "visual_projection")) -class CLAPTextModelTester: +class ClapTextModelTester: def __init__( self, parent, @@ -338,7 +338,7 @@ def prepare_config_and_inputs(self): return config, input_ids, input_mask def get_config(self): - return CLAPTextConfig( + return ClapTextConfig( vocab_size=self.vocab_size, hidden_size=self.hidden_size, projection_hidden_size=self.hidden_size, @@ -354,7 +354,7 @@ def get_config(self): ) def create_and_check_model(self, config, input_ids, 
input_mask): - model = CLAPTextModel(config=config) + model = ClapTextModel(config=config) model.to(torch_device) model.eval() with torch.no_grad(): @@ -364,7 +364,7 @@ def create_and_check_model(self, config, input_ids, input_mask): self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) def create_and_check_model_with_projection(self, config, input_ids, input_mask): - model = CLAPTextModelWithProjection(config=config) + model = ClapTextModelWithProjection(config=config) model.to(torch_device) model.eval() with torch.no_grad(): @@ -381,15 +381,15 @@ def prepare_config_and_inputs_for_common(self): @require_torch -class CLAPTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (CLAPTextModel, CLAPTextModelWithProjection) if is_torch_available() else () +class ClapTextModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (ClapTextModel, ClapTextModelWithProjection) if is_torch_available() else () fx_compatible = False test_pruning = False test_head_masking = False def setUp(self): - self.model_tester = CLAPTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=CLAPTextConfig, hidden_size=37) + self.model_tester = ClapTextModelTester(self) + self.config_tester = ConfigTester(self, config_class=ClapTextConfig, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() @@ -402,41 +402,41 @@ def test_model_with_projection(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_with_projection(*config_and_inputs) - @unittest.skip(reason="CLAPTextModel does not output any loss term in the forward pass") + @unittest.skip(reason="ClapTextModel does not output any loss term in the forward pass") def test_training(self): pass - @unittest.skip(reason="CLAPTextModel does not output any loss term in the forward pass") + @unittest.skip(reason="ClapTextModel does not output any loss term in the forward pass") def test_training_gradient_checkpointing(self): pass - @unittest.skip(reason="CLAPTextModel does not use inputs_embeds") + @unittest.skip(reason="ClapTextModel does not use inputs_embeds") def test_inputs_embeds(self): pass - @unittest.skip(reason="CLAPTextModel has no base class and is not available in MODEL_MAPPING") + @unittest.skip(reason="ClapTextModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_from_base(self): pass - @unittest.skip(reason="CLAPTextModel has no base class and is not available in MODEL_MAPPING") + @unittest.skip(reason="ClapTextModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_to_base(self): pass @slow def test_model_from_pretrained(self): - for model_name in CLAP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = CLAPTextModel.from_pretrained(model_name) + for model_name in Clap_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = ClapTextModel.from_pretrained(model_name) self.assertIsNotNone(model) @slow def test_model_with_projection_from_pretrained(self): - for model_name in CLAP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = CLAPTextModelWithProjection.from_pretrained(model_name) + for model_name in Clap_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = ClapTextModelWithProjection.from_pretrained(model_name) self.assertIsNotNone(model) self.assertTrue(hasattr(model, "text_projection")) -class CLAPModelTester: +class ClapModelTester: def __init__(self, parent, text_kwargs=None, audio_kwargs=None, is_training=True): if text_kwargs is None: 
text_kwargs = {} @@ -444,8 +444,8 @@ def __init__(self, parent, text_kwargs=None, audio_kwargs=None, is_training=True audio_kwargs = {} self.parent = parent - self.text_model_tester = CLAPTextModelTester(parent, **text_kwargs) - self.audio_model_tester = CLAPAudioModelTester(parent, **audio_kwargs) + self.text_model_tester = ClapTextModelTester(parent, **text_kwargs) + self.audio_model_tester = ClapAudioModelTester(parent, **audio_kwargs) self.is_training = is_training def prepare_config_and_inputs(self): @@ -457,12 +457,12 @@ def prepare_config_and_inputs(self): return config, input_ids, attention_mask, input_features def get_config(self): - return CLAPConfig.from_text_audio_configs( + return ClapConfig.from_text_audio_configs( self.text_model_tester.get_config(), self.audio_model_tester.get_config(), projection_dim=64 ) def create_and_check_model(self, config, input_ids, attention_mask, input_features): - model = CLAPModel(config).to(torch_device).eval() + model = ClapModel(config).to(torch_device).eval() with torch.no_grad(): result = model(input_ids, input_features, attention_mask) self.parent.assertEqual( @@ -485,8 +485,8 @@ def prepare_config_and_inputs_for_common(self): @require_torch -class CLAPModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (CLAPModel,) if is_torch_available() else () +class ClapModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (ClapModel,) if is_torch_available() else () fx_compatible = False test_head_masking = False test_pruning = False @@ -494,7 +494,7 @@ class CLAPModelTest(ModelTesterMixin, unittest.TestCase): test_attention_outputs = False def setUp(self): - self.model_tester = CLAPModelTester(self) + self.model_tester = ClapModelTester(self) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() @@ -512,7 +512,7 @@ def test_inputs_embeds(self): def test_retain_grad_hidden_states_attentions(self): pass - @unittest.skip(reason="CLAPModel does not have input/output embeddings") + @unittest.skip(reason="ClapModel does not have input/output embeddings") def test_model_common_attributes(self): pass @@ -594,28 +594,28 @@ def _create_and_check_torchscript(self, config, inputs_dict): def test_load_audio_text_config(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - # Save CLAPConfig and check if we can load CLAPAudioConfig from it + # Save ClapConfig and check if we can load ClapAudioConfig from it with tempfile.TemporaryDirectory() as tmp_dir_name: config.save_pretrained(tmp_dir_name) - audio_config = CLAPAudioConfig.from_pretrained(tmp_dir_name) + audio_config = ClapAudioConfig.from_pretrained(tmp_dir_name) self.assertDictEqual(config.audio_config.to_dict(), audio_config.to_dict()) - # Save CLAPConfig and check if we can load CLAPTextConfig from it + # Save ClapConfig and check if we can load ClapTextConfig from it with tempfile.TemporaryDirectory() as tmp_dir_name: config.save_pretrained(tmp_dir_name) - text_config = CLAPTextConfig.from_pretrained(tmp_dir_name) + text_config = ClapTextConfig.from_pretrained(tmp_dir_name) self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) @slow def test_model_from_pretrained(self): - for model_name in CLAP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = CLAPModel.from_pretrained(model_name) + for model_name in Clap_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = ClapModel.from_pretrained(model_name) self.assertIsNotNone(model) @slow @require_torch -class CLAPModelIntegrationTest(unittest.TestCase): 
+class ClapModelIntegrationTest(unittest.TestCase): paddings = ["repeatpad", "repeat", "pad"] def test_integration_unfused(self): @@ -630,8 +630,8 @@ def test_integration_unfused(self): model_id = "ybelkada/clap-htsat-unfused" - model = CLAPModel.from_pretrained(model_id).to(torch_device) - processor = CLAPProcessor.from_pretrained(model_id) + model = ClapModel.from_pretrained(model_id).to(torch_device) + processor = ClapProcessor.from_pretrained(model_id) for padding in self.paddings: inputs = processor(audios=audio_sample["audio"]["array"], return_tensors="pt", padding=padding).to( @@ -657,8 +657,8 @@ def test_integration_fused(self): model_id = "ybelkada/clap-htsat-fused" - model = CLAPModel.from_pretrained(model_id).to(torch_device) - processor = CLAPProcessor.from_pretrained(model_id) + model = ClapModel.from_pretrained(model_id).to(torch_device) + processor = ClapProcessor.from_pretrained(model_id) for padding in self.paddings: inputs = processor( diff --git a/tests/models/clap/test_processor_clap.py b/tests/models/clap/test_processor_clap.py index 5e035f846e1e..026a5f9189e3 100644 --- a/tests/models/clap/test_processor_clap.py +++ b/tests/models/clap/test_processor_clap.py @@ -16,7 +16,7 @@ import tempfile import unittest -from transformers import CLAPFeatureExtractor, CLAPProcessor, RobertaTokenizer, RobertaTokenizerFast +from transformers import ClapFeatureExtractor, ClapProcessor, RobertaTokenizer, RobertaTokenizerFast from transformers.testing_utils import require_sentencepiece, require_torchaudio from .test_feature_extraction_clap import floats_list @@ -24,7 +24,7 @@ @require_torchaudio @require_sentencepiece -class CLAPProcessorTest(unittest.TestCase): +class ClapProcessorTest(unittest.TestCase): def setUp(self): self.checkpoint = "ybelkada/clap-htsat-unfused" self.tmpdirname = tempfile.mkdtemp() @@ -33,7 +33,7 @@ def get_tokenizer(self, **kwargs): return RobertaTokenizer.from_pretrained(self.checkpoint, **kwargs) def get_feature_extractor(self, **kwargs): - return CLAPFeatureExtractor.from_pretrained(self.checkpoint, **kwargs) + return ClapFeatureExtractor.from_pretrained(self.checkpoint, **kwargs) def tearDown(self): shutil.rmtree(self.tmpdirname) @@ -42,25 +42,25 @@ def test_save_load_pretrained_default(self): tokenizer = self.get_tokenizer() feature_extractor = self.get_feature_extractor() - processor = CLAPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + processor = ClapProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) processor.save_pretrained(self.tmpdirname) - processor = CLAPProcessor.from_pretrained(self.tmpdirname) + processor = ClapProcessor.from_pretrained(self.tmpdirname) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) self.assertIsInstance(processor.tokenizer, RobertaTokenizerFast) self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) - self.assertIsInstance(processor.feature_extractor, CLAPFeatureExtractor) + self.assertIsInstance(processor.feature_extractor, ClapFeatureExtractor) def test_save_load_pretrained_additional_features(self): - processor = CLAPProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()) + processor = ClapProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()) processor.save_pretrained(self.tmpdirname) tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, 
padding_value=1.0) - processor = CLAPProcessor.from_pretrained( + processor = ClapProcessor.from_pretrained( self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 ) @@ -68,13 +68,13 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, RobertaTokenizerFast) self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.feature_extractor, CLAPFeatureExtractor) + self.assertIsInstance(processor.feature_extractor, ClapFeatureExtractor) def test_feature_extractor(self): feature_extractor = self.get_feature_extractor() tokenizer = self.get_tokenizer() - processor = CLAPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + processor = ClapProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) raw_speech = floats_list((3, 1000)) @@ -88,7 +88,7 @@ def test_tokenizer(self): feature_extractor = self.get_feature_extractor() tokenizer = self.get_tokenizer() - processor = CLAPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + processor = ClapProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) input_str = "This is a test string" @@ -103,7 +103,7 @@ def test_tokenizer_decode(self): feature_extractor = self.get_feature_extractor() tokenizer = self.get_tokenizer() - processor = CLAPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + processor = ClapProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] @@ -116,7 +116,7 @@ def test_model_input_names(self): feature_extractor = self.get_feature_extractor() tokenizer = self.get_tokenizer() - processor = CLAPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + processor = ClapProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) self.assertListEqual( processor.model_input_names[2:], diff --git a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py index 5b13deac16b4..80e42124a886 100644 --- a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py +++ b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py @@ -26,7 +26,7 @@ class ZeroShotAudioClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): # Deactivating auto tests since we don't have a good MODEL_FOR_XX mapping, # and only Clap would be there for now. - # model_mapping = {CLAPConfig: CLAPModel} + # model_mapping = {ClapConfig: ClapModel} @require_torch def test_small_model_pt(self): diff --git a/utils/check_repo.py b/utils/check_repo.py index b0dcf76f4881..d922ceca3a69 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -172,10 +172,10 @@ # should **not** be the rule. 
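The processor and integration tests above exercise the public classes end to end. As a rough usage sketch only, the pieces fit together as follows; the checkpoint name and the `repeatpad` padding pattern are taken from the tests, while the random waveform, the keyword names and the `logits_per_audio` output attribute are illustrative assumptions that may not match this exact point in the PR history.

```python
# Hedged sketch of how the tested classes are meant to be used together.
import numpy as np
import torch

from transformers import ClapModel, ClapProcessor

model_id = "ybelkada/clap-htsat-unfused"  # checkpoint referenced by the tests above
model = ClapModel.from_pretrained(model_id).eval()
processor = ClapProcessor.from_pretrained(model_id)

# A one-second random mono waveform stands in for a real audio sample.
sampling_rate = processor.feature_extractor.sampling_rate
waveform = np.random.uniform(-1.0, 1.0, size=(sampling_rate,)).astype(np.float32)

text_inputs = processor(text=["a dog barking", "a vacuum cleaner"], return_tensors="pt", padding=True)
audio_inputs = processor(audios=waveform, return_tensors="pt", padding="repeatpad")

with torch.no_grad():
    outputs = model(
        input_ids=text_inputs.input_ids,
        attention_mask=text_inputs.attention_mask,
        input_features=audio_inputs.input_features,
    )

# Audio-to-text similarity; a softmax turns the logits into zero-shot label scores.
probs = outputs.logits_per_audio.softmax(dim=-1)
```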
IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ # models to ignore for model xxx mapping - "CLAPTextModel", - "CLAPTextModelWithProjection", - "CLAPAudioModel", - "CLAPAudioModelWithProjection", + "ClapTextModel", + "ClapTextModelWithProjection", + "ClapAudioModel", + "ClapAudioModelWithProjection", "Blip2ForConditionalGeneration", "Blip2QFormerModel", "Blip2VisionModel", From 6c31509200dd8dbb78b538119faeb275256e28ec Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 10 Feb 2023 11:09:06 +0000 Subject: [PATCH 136/197] replace to `Clap` --- src/transformers/__init__.py | 4 ++-- src/transformers/models/clap/__init__.py | 2 +- src/transformers/utils/dummy_pt_objects.py | 6 +++++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 00f8bc1d1c5a..54773bf36c66 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -207,10 +207,10 @@ "ChineseCLIPVisionConfig", ], "models.clap": [ - "Clap_PRETRAINED_CONFIG_ARCHIVE_MAP", "ClapAudioConfig", "ClapConfig", "ClapFeatureExtractor", + "Clap_PRETRAINED_CONFIG_ARCHIVE_MAP", "ClapProcessor", "ClapTextConfig", ], @@ -1232,11 +1232,11 @@ ) _import_structure["models.clap"].extend( [ - "Clap_PRETRAINED_MODEL_ARCHIVE_LIST", "ClapAudioModel", "ClapAudioModelWithProjection", "ClapModel", "ClapPreTrainedModel", + "Clap_PRETRAINED_MODEL_ARCHIVE_LIST", "ClapTextModel", "ClapTextModelWithProjection", ] diff --git a/src/transformers/models/clap/__init__.py b/src/transformers/models/clap/__init__.py index e834cb11aadd..4aa476ae9382 100644 --- a/src/transformers/models/clap/__init__.py +++ b/src/transformers/models/clap/__init__.py @@ -22,9 +22,9 @@ _import_structure = { "configuration_clap": [ - "Clap_PRETRAINED_CONFIG_ARCHIVE_MAP", "ClapAudioConfig", "ClapConfig", + "Clap_PRETRAINED_CONFIG_ARCHIVE_MAP", "ClapTextConfig", ], "feature_extraction_clap": ["ClapFeatureExtractor"], diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 7470ec76ffc1..5060f6be4e57 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1464,7 +1464,11 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -Clap_PRETRAINED_MODEL_ARCHIVE_LIST = None +class Clap_PRETRAINED_MODEL_ARCHIVE_LIST(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) class ClapAudioModel(metaclass=DummyObject): From fb24c4e25de3440bd52a9429e9d69c9336bfdcd6 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 10 Feb 2023 11:16:20 +0000 Subject: [PATCH 137/197] repo consistency --- src/transformers/__init__.py | 4 ++-- src/transformers/models/clap/__init__.py | 4 ++-- src/transformers/models/clap/modeling_clap.py | 2 +- src/transformers/utils/dummy_pt_objects.py | 6 +----- tests/models/clap/test_modeling_clap.py | 12 ++++++------ 5 files changed, 12 insertions(+), 16 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 54773bf36c66..59ace4e73b51 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1232,11 +1232,11 @@ ) _import_structure["models.clap"].extend( [ + "CLAP_PRETRAINED_MODEL_ARCHIVE_LIST", "ClapAudioModel", "ClapAudioModelWithProjection", "ClapModel", "ClapPreTrainedModel", - "Clap_PRETRAINED_MODEL_ARCHIVE_LIST", "ClapTextModel", "ClapTextModelWithProjection", ] @@ -4603,7 +4603,7 @@ ChineseCLIPVisionModel, ) from .models.clap import ( - 
Clap_PRETRAINED_MODEL_ARCHIVE_LIST, + CLAP_PRETRAINED_MODEL_ARCHIVE_LIST, ClapAudioModel, ClapAudioModelWithProjection, ClapModel, diff --git a/src/transformers/models/clap/__init__.py b/src/transformers/models/clap/__init__.py index 4aa476ae9382..4ca46a7a3ca9 100644 --- a/src/transformers/models/clap/__init__.py +++ b/src/transformers/models/clap/__init__.py @@ -39,7 +39,7 @@ pass else: _import_structure["modeling_clap"] = [ - "Clap_PRETRAINED_MODEL_ARCHIVE_LIST", + "CLAP_PRETRAINED_MODEL_ARCHIVE_LIST", "ClapModel", "ClapPreTrainedModel", "ClapTextModel", @@ -66,7 +66,7 @@ pass else: from .modeling_clap import ( - Clap_PRETRAINED_MODEL_ARCHIVE_LIST, + CLAP_PRETRAINED_MODEL_ARCHIVE_LIST, ClapAudioModel, ClapAudioModelWithProjection, ClapModel, diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 42cdb39a4fc2..65234595db84 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -45,7 +45,7 @@ _CHECKPOINT_FOR_DOC = "laion-ai/clap-htsat-fused" -Clap_PRETRAINED_MODEL_ARCHIVE_LIST = [ +CLAP_PRETRAINED_MODEL_ARCHIVE_LIST = [ "laion-ai/clap-htsat-fused", "laion-ai/clap-htsat-unfused", # See all clap models at https://huggingface.co/models?filter=clap diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 5060f6be4e57..6236b57a5170 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1464,11 +1464,7 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class Clap_PRETRAINED_MODEL_ARCHIVE_LIST(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) +CLAP_PRETRAINED_MODEL_ARCHIVE_LIST = None class ClapAudioModel(metaclass=DummyObject): diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 58d69a604bd6..20acefaf6fde 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -48,7 +48,7 @@ ClapTextModel, ClapTextModelWithProjection, ) - from transformers.models.clap.modeling_clap import Clap_PRETRAINED_MODEL_ARCHIVE_LIST + from transformers.models.clap.modeling_clap import CLAP_PRETRAINED_MODEL_ARCHIVE_LIST class ClapAudioModelTester: @@ -266,13 +266,13 @@ def test_save_load_fast_init_to_base(self): @slow def test_model_from_pretrained(self): - for model_name in Clap_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + for model_name in CLAP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = ClapAudioModel.from_pretrained(model_name) self.assertIsNotNone(model) @slow def test_model_with_projection_from_pretrained(self): - for model_name in Clap_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + for model_name in CLAP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = ClapAudioModelWithProjection.from_pretrained(model_name) self.assertIsNotNone(model) self.assertTrue(hasattr(model, "visual_projection")) @@ -424,13 +424,13 @@ def test_save_load_fast_init_to_base(self): @slow def test_model_from_pretrained(self): - for model_name in Clap_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + for model_name in CLAP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = ClapTextModel.from_pretrained(model_name) self.assertIsNotNone(model) @slow def test_model_with_projection_from_pretrained(self): - for model_name in Clap_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + for model_name in CLAP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = ClapTextModelWithProjection.from_pretrained(model_name) 
self.assertIsNotNone(model) self.assertTrue(hasattr(model, "text_projection")) @@ -608,7 +608,7 @@ def test_load_audio_text_config(self): @slow def test_model_from_pretrained(self): - for model_name in Clap_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + for model_name in CLAP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = ClapModel.from_pretrained(model_name) self.assertIsNotNone(model) From bd29fec27ca0680271d40ea6dd7df683c0fdd14b Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 10 Feb 2023 11:17:36 +0000 Subject: [PATCH 138/197] again repo-consistency --- src/transformers/__init__.py | 4 ++-- src/transformers/models/auto/configuration_auto.py | 2 +- src/transformers/models/clap/__init__.py | 4 ++-- src/transformers/models/clap/configuration_clap.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 59ace4e73b51..a2cd805521a3 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -210,7 +210,7 @@ "ClapAudioConfig", "ClapConfig", "ClapFeatureExtractor", - "Clap_PRETRAINED_CONFIG_ARCHIVE_MAP", + "CLAP_PRETRAINED_MODEL_ARCHIVE_LIST", "ClapProcessor", "ClapTextConfig", ], @@ -3707,7 +3707,7 @@ ChineseCLIPVisionConfig, ) from .models.clap import ( - Clap_PRETRAINED_CONFIG_ARCHIVE_MAP, + CLAP_PRETRAINED_MODEL_ARCHIVE_LIST, ClapAudioConfig, ClapConfig, ClapFeatureExtractor, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 5cffd6216387..3fa1fec59470 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -220,7 +220,7 @@ ("camembert", "CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("canine", "CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("chinese_clip", "CHINESE_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("clap", "Clap_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("clap", "CLAP_PRETRAINED_MODEL_ARCHIVE_LIST"), ("clip", "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("clipseg", "CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("codegen", "CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP"), diff --git a/src/transformers/models/clap/__init__.py b/src/transformers/models/clap/__init__.py index 4ca46a7a3ca9..02ba6a991ca9 100644 --- a/src/transformers/models/clap/__init__.py +++ b/src/transformers/models/clap/__init__.py @@ -24,7 +24,7 @@ "configuration_clap": [ "ClapAudioConfig", "ClapConfig", - "Clap_PRETRAINED_CONFIG_ARCHIVE_MAP", + "CLAP_PRETRAINED_MODEL_ARCHIVE_LIST", "ClapTextConfig", ], "feature_extraction_clap": ["ClapFeatureExtractor"], @@ -50,7 +50,7 @@ if TYPE_CHECKING: from .configuration_clap import ( - Clap_PRETRAINED_CONFIG_ARCHIVE_MAP, + CLAP_PRETRAINED_MODEL_ARCHIVE_LIST, ClapAudioConfig, ClapConfig, ClapTextConfig, diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index dc27330ced6e..a707fcf1429b 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -24,7 +24,7 @@ logger = logging.get_logger(__name__) -Clap_PRETRAINED_CONFIG_ARCHIVE_MAP = { +CLAP_PRETRAINED_MODEL_ARCHIVE_LIST = { "laion-ai/base": "https://huggingface.co/laion-ai/base/resolve/main/config.json", } From 5cf2fa80db0b53eb0c9a51d48ad5558a1b552267 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 10 Feb 2023 11:19:25 +0000 Subject: [PATCH 139/197] make fixup --- src/transformers/__init__.py | 2 +- src/transformers/models/clap/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 
deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index a2cd805521a3..5d6a6359ae76 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -207,10 +207,10 @@ "ChineseCLIPVisionConfig", ], "models.clap": [ + "CLAP_PRETRAINED_MODEL_ARCHIVE_LIST", "ClapAudioConfig", "ClapConfig", "ClapFeatureExtractor", - "CLAP_PRETRAINED_MODEL_ARCHIVE_LIST", "ClapProcessor", "ClapTextConfig", ], diff --git a/src/transformers/models/clap/__init__.py b/src/transformers/models/clap/__init__.py index 02ba6a991ca9..62ed3c0056d6 100644 --- a/src/transformers/models/clap/__init__.py +++ b/src/transformers/models/clap/__init__.py @@ -22,9 +22,9 @@ _import_structure = { "configuration_clap": [ + "CLAP_PRETRAINED_MODEL_ARCHIVE_LIST", "ClapAudioConfig", "ClapConfig", - "CLAP_PRETRAINED_MODEL_ARCHIVE_LIST", "ClapTextConfig", ], "feature_extraction_clap": ["ClapFeatureExtractor"], From 90e4c03b8ecb04d481a87b876bb995474c38fd98 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Fri, 10 Feb 2023 15:52:38 +0100 Subject: [PATCH 140/197] Apply suggestions from code review Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> --- docs/source/en/model_doc/clap.mdx | 4 ++-- src/transformers/models/clap/feature_extraction_clap.py | 8 ++++---- src/transformers/models/clap/modeling_clap.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/source/en/model_doc/clap.mdx b/docs/source/en/model_doc/clap.mdx index 1e75135e572a..29211f3495d6 100644 --- a/docs/source/en/model_doc/clap.mdx +++ b/docs/source/en/model_doc/clap.mdx @@ -10,11 +10,11 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# clap +# CLAP ## Overview -The clap model was proposed in [Large Scale Constrastive Laungaue-Audio pretraining with +The CLAP model was proposed in [Large Scale Constrastive Laungaue-Audio pretraining with feature fusion and keyword-to-caption augmentation](https://arxiv.org/pdf/2211.06687.pdf) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. The Clap model uses a SWINTransformer on the input fused mel spectrogram as the audio encoder, and a ROBerta model for the text emcoder. diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 7695f575d103..d3778520b551 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -42,7 +42,7 @@ class ClapFeatureExtractor(SequenceFeatureExtractor): Args: feature_size (`int`, defaults to 80): The feature dimension of the extracted MEL spectrograms. This corresponds to the number of frequency bins - (intervals) that are computer, for each fourrier step. + (intervals) that are computed, for each Fourier step. sampling_rate (`int`, defaults to 16000): The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). This only serves to warn users if the audio fed to the feature extractor does not have the same sampling rate. @@ -73,7 +73,7 @@ class ClapFeatureExtractor(SequenceFeatureExtractor): of the original mel obtained from the padded audio. - `rand_trunc` will select a random crop of the mel spectrogram. padding (`str`, *optional*): - Padding pattern for shorter audio inputs. 
Three patterns were originaly implemented: + Padding pattern for shorter audio inputs. Three patterns were originally implemented: - `repeatpad`: the audio is repeated, and then padded to fit the `max_length`. - `repeat`: the audio is repeated and then cut to fit the `max_length` - `pad`: the audio is padded. @@ -152,7 +152,7 @@ def to_dict(self) -> Dict[str, Any]: def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[np.array] = None) -> np.ndarray: """ Compute the log-Mel spectrogram of the provided `waveform` using the `hanning` window. In Clap, two different - banks of filters are used depending on the truncation pattern: + filter banks are used depending on the truncation pattern: - `self.mel_filters`: they correspond to the defaults parameters of `torchaduio` which can be obtained from calling `torchaudio.transforms.MelSpectrogram().mel_scale.fb`. These filters are used when `truncation` is set to `fuison`. @@ -274,7 +274,7 @@ def __call__( copy of the original mel obtained from the padded audio. - `rand_trunc` will select a random crop of the mel spectrogram. padding (`str`, *optional*): - Padding pattern for shorter audio inputs. Three patterns were originaly implemented: + Padding pattern for shorter audio inputs. Three patterns were originally implemented: - `repeatpad`: the audio is repeated, and then padded to fit the `max_length`. - `repeat`: the audio is repeated and then cut to fit the `max_length` - `pad`: the audio is padded. diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 65234595db84..f9181756673b 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -913,7 +913,7 @@ def __init__(self, config): self.num_features = int(config.hidden_size * 2 ** (self.num_layers - 1)) self.freq_ratio = config.spec_size // config.num_mel_bins - dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))] + drop_path_rate = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))] self.input_resolutions = [(grid_size[0] // (2**i), grid_size[1] // (2**i)) for i in range(self.num_layers)] @@ -928,7 +928,7 @@ def __init__(self, config): drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], downsample=ClapAudioPatchMerging if (i_layer < self.num_layers - 1) else None, ) - for i_layer in range(self.num_layers) + for layer in range(self.num_layers) ] ) From 729fa517f3c4589a99818f1972cfc91b1bc1e2d1 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 13 Feb 2023 10:10:02 +0000 Subject: [PATCH 141/197] add config --- .../models/clap/configuration_clap.py | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index a707fcf1429b..333fd94a7e52 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -173,7 +173,7 @@ class ClapAudioConfig(PretrainedConfig): Args: window_size (`int`, *optional*, defaults to 8): - [description] + Image size of the spectrogram num_mel_bins (`int`, *optional*, defaults to 64): Number of mel features used per frames. Should correspond to the value used in the `ClapProcessor` class. 
spec_size (`int`, *optional*, defaults to 256): @@ -184,46 +184,46 @@ class ClapAudioConfig(PretrainedConfig): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported. patch_size (`int`, *optional*, defaults to 4): - [description] + Patch size for the audio spectrogram patch_stride (`list`, *optional*, defaults to `[4, 4]`): - [description] + Patch stride for the audio spectrogram num_classes (`int`, *optional*, defaults to 527): - [description] + Number of classes used for the head training hidden_size (`int`, *optional*, defaults to 96): - [description] + Hidden size of the audio model projection_hidden_size (`int`, *optional*, defaults to 768): - [description] + Hidden size of the projection layer depths (`list`, *optional*, defaults to `[2, 2, 6, 2]`): - [description] + Depths used for the Swin Layers of the audio model num_attention_heads (`list`, *optional*, defaults to `[4, 8, 16, 32]`): - [description] + Number of attention heads used for the Swin Layers of the audio model enable_fusion (`bool`, *optional*, defaults to `False`): Whether or not to enable patch fusion. This is the main contribution of the authors, and should give the best results. Patch fusion will #TODO describe what it does hidden_dropout_prob (`float`, *optional*, defaults to 0.1): - [description] + The dropout probabilitiy for all fully connected layers in the encoder. fusion_type (`[type]`, *optional*): - [description] + Fusion type used for the patch fusion. patch_embed_input_channels (`int`, *optional*, defaults to 1): - [description] + Number of channels used for the input spectrogram flatten_patch_embeds (`bool`, *optional*, defaults to `True`): - [description] + Whether or not to flatten the patch embeddings patch_embeds_hidden_size (`int`, *optional*, defaults to 96): - [description] + Hidden size of the patch embeddings enable_patch_layer_norm (`bool`, *optional*, defaults to `True`): - [description] + Whether or not to enable layer normalization for the patch embeddings drop_path_rate (`float`, *optional*, defaults to 0.0): - [description] + Drop path rate for the patch fusion attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): - [description] + The dropout ratio for the attention probabilities. qkv_bias (`bool`, *optional*, defaults to `True`): - [description] + Whether or not to add a bias to the query, key, value projections. mlp_ratio (`float`, *optional*, defaults to 4.0): - [description] + Ratio of the mlp hidden dim to embedding dim. aff_block_r (`int`, *optional*, defaults to 4): - [description] + downsize_ratio used in the AudioFF block num_hidden_layers (`int`, *optional*, defaults to 4): - [description] + Number of hidden layers in the Transformer encoder. projection_hidden_act (`str`, *optional*, defaults to `"relu"`): The non-linear activation function (function or string) in the projection layer. If string, `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported. @@ -348,13 +348,13 @@ class ClapConfig(PretrainedConfig): logit_scale_init_value (`float`, *optional*, defaults to 2.6592): The inital value of the *logit_scale* paramter. Default is used as per the original Clap implementation. fusion_num_hidden_layers (`int`, *optional*, defaults to 2): - [description] + Number of hidden layers in the fusion layer. projection_dim (`int`, *optional*, defaults to 512): - [description] + Dimentionality of text and audio projection layers. 
projection_hidden_act (`str`, *optional*, defaults to `"relu"`): - [description] + Activation function for the projection layers. initializer_factor (`float`, *optional*, defaults to 1.0): - [description] + Factor to scale the initialization of the model weights. kwargs (*optional*): Dictionary of keyword arguments. From df70e2c42374a8b77f745a67676f57e5e595712f Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 13 Feb 2023 10:14:50 +0000 Subject: [PATCH 142/197] changes --- src/transformers/models/clap/configuration_clap.py | 10 +--------- src/transformers/models/clap/modeling_clap.py | 4 ++-- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 333fd94a7e52..393a32841642 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -203,7 +203,7 @@ class ClapAudioConfig(PretrainedConfig): hidden_dropout_prob (`float`, *optional*, defaults to 0.1): The dropout probabilitiy for all fully connected layers in the encoder. fusion_type (`[type]`, *optional*): - Fusion type used for the patch fusion. + Fusion type used for the patch fusion. patch_embed_input_channels (`int`, *optional*, defaults to 1): Number of channels used for the input spectrogram flatten_patch_embeds (`bool`, *optional*, defaults to `True`): @@ -398,14 +398,6 @@ def __init__( ): super().__init__(**kwargs) - # If `_config_dict` exist, we use them for the backward compatibility. - text_config_dict = kwargs.pop("text_config_dict", None) - audio_config_dict = kwargs.pop("audio_config_dict", None) - if text_config_dict is not None: - text_config = text_config_dict - if audio_config_dict is not None: - audio_config = audio_config_dict - if text_config is None: text_config = {} logger.info("text_config is None. 
Initializing the ClapTextConfig with default values.") diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index f9181756673b..470e5fbb6f96 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -925,10 +925,10 @@ def __init__(self, config): input_resolution=self.input_resolutions[i_layer], depth=config.depths[i_layer], num_heads=config.num_attention_heads[i_layer], - drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], + drop_path=drop_path_rate[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], downsample=ClapAudioPatchMerging if (i_layer < self.num_layers - 1) else None, ) - for layer in range(self.num_layers) + for i_layer in range(self.num_layers) ] ) From 7ebeacf6929e397996a987b3f9250325cdd630e0 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 13 Feb 2023 10:20:49 +0000 Subject: [PATCH 143/197] update conversion --- .../models/clap/convert_clap_original_pytorch_to_hf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py b/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py index 979d18c71a91..21f79face787 100644 --- a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py +++ b/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py @@ -17,7 +17,7 @@ import re import torch -from Clap import create_model +from CLAP import create_model from transformers import AutoFeatureExtractor, ClapConfig, ClapModel @@ -32,10 +32,9 @@ "mlp.fc2": "output.dense", "norm1": "layernorm_before", "norm2": "layernorm_after", - # "bn0": "batch_norm", } -processor = AutoFeatureExtractor.from_pretrained("ArthurZ/clap", truncation="rand_trunc") +processor = AutoFeatureExtractor.from_pretrained("ybelkada/clap-htsat-unfused", truncation="rand_trunc") def init_clap(checkpoint_path, enable_fusion=False): @@ -105,6 +104,7 @@ def convert_clap_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_pa transformers_config.audio_config.enable_fusion = enable_fusion model = ClapModel(transformers_config) + # ignore the spectrogram embedding layer model.load_state_dict(state_dict, strict=False) model.save_pretrained(pytorch_dump_folder_path) From 3be208e2c0b9ea5f2481cee23439666e8d79f3c0 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Mon, 13 Feb 2023 11:21:54 +0100 Subject: [PATCH 144/197] Apply suggestions from code review Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> --- src/transformers/models/clap/modeling_clap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 470e5fbb6f96..565d40ba767d 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -61,7 +61,7 @@ def do_mixup(hidden_states, mixup_lambda): because the decision boundary becomes smooth. 
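The configuration docstrings filled in over the last few commits spell out how `ClapConfig` is assembled from a text and an audio sub-configuration. A minimal composition sketch, mirroring the `ClapConfig.from_text_audio_configs` call used by `ClapModelTester` earlier, with every other value left at its assumed default:

```python
from transformers import ClapAudioConfig, ClapConfig, ClapTextConfig

# Build the two sub-configurations, then tie them together with a shared projection dim.
text_config = ClapTextConfig()
audio_config = ClapAudioConfig(enable_fusion=False)

config = ClapConfig.from_text_audio_configs(text_config, audio_config, projection_dim=512)
print(config.projection_dim, config.audio_config.enable_fusion)
```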
Args: - hidden_states (`torch.FloatTensor` of shape (batch_size, seq_length, hidden_size)) : + hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_length, hidden_size)`): Input hidden states mixup_lambda (`torch.FloatTensor`): Mixing ratio sampled from the Beta distribution From 505c623ce7168575f4c918e5595e2f809291bb03 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 13 Feb 2023 10:22:56 +0000 Subject: [PATCH 145/197] remove unused function --- src/transformers/models/clap/modeling_clap.py | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 565d40ba767d..63741aa07b00 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -52,27 +52,6 @@ ] -# Adapted from https://github.com/LAION-AI/Clap/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/utils.py#L176 -def do_mixup(hidden_states, mixup_lambda): - """ - MIXUP is a data augmentation method, proposed by Hongyi Zhang et al on 25 Oct. 2017. - https://arxiv.org/abs/1710.09412 Based on the mixing ratio sampled from the Beta distribution, it is a method of - expanding data by mixing both input and output. By using this, it is said that generalization performance improves - because the decision boundary becomes smooth. - - Args: - hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_length, hidden_size)`): - Input hidden states - mixup_lambda (`torch.FloatTensor`): - Mixing ratio sampled from the Beta distribution - """ - intermediate_hidden_states = hidden_states.transpose(0, -1) * mixup_lambda - flipped_hidden_states = torch.flip(hidden_states, dims=[0]).transpose(0, -1) * (1 - mixup_lambda) - out = intermediate_hidden_states + flipped_hidden_states - out = out.transpose(0, -1) - return out - - # Adapted from: https://github.com/LAION-AI/Clap/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/utils.py#L191 def interpolate(hidden_states, ratio): """ From efb526ed960ec43334e91bf5f5df929060041315 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 10:38:48 +0000 Subject: [PATCH 146/197] update based on code reviews --- src/transformers/audio_utils.py | 334 ++++++++++++++++++ .../feature_extraction_sequence_utils.py | 320 +---------------- src/transformers/models/clap/__init__.py | 4 - .../models/clap/feature_extraction_clap.py | 20 +- src/transformers/models/clap/modeling_clap.py | 14 +- 5 files changed, 364 insertions(+), 328 deletions(-) create mode 100644 src/transformers/audio_utils.py diff --git a/src/transformers/audio_utils.py b/src/transformers/audio_utils.py new file mode 100644 index 000000000000..8edd5eabb3d9 --- /dev/null +++ b/src/transformers/audio_utils.py @@ -0,0 +1,334 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + Audio processing functions to extract feature from a raw audio. 
Should all be in numpy to support + all frameworks, and remmove unecessary dependencies. +""" +import math +import warnings +from typing import Optional + +import numpy as np +from numpy.fft import fft + + +def hertz_to_mel(freq: float, mel_scale: str = "htk") -> float: + """Convert Hertz to Mels. + + Args: + freqs (`float`): + Frequencies in Hertz + mel_scale (`str`, *optional*, defaults to `"htk"`): + Scale to use, `htk` or `slaney`. + + Returns: + mels (`float`): + Frequency in Mels + """ + + if mel_scale not in ["slaney", "htk"]: + raise ValueError('mel_scale should be one of "htk" or "slaney".') + + if mel_scale == "htk": + return 2595.0 * math.log10(1.0 + (freq / 700.0)) + + # Fill in the linear part + frequency_min = 0.0 + f_sp = 200.0 / 3 + + mels = (freq - frequency_min) / f_sp + + # Fill in the log-scale part + min_log_hertz = 1000.0 + min_log_mel = (min_log_hertz - frequency_min) / f_sp + logstep = math.log(6.4) / 27.0 + + if freq >= min_log_hertz: + mels = min_log_mel + math.log(freq / min_log_hertz) / logstep + + return mels + + +@staticmethod +def mel_to_hertz(mels: np.array, mel_scale: str = "htk") -> np.array: + """Convert mel bin numbers to frequencies. + + Args: + mels (`np.array`): + Mel frequencies + mel_scale (`str`, *optional*, `"htk"`): + Scale to use: `htk` or `slaney`. + + Returns: + freqs (`np.array`): + Mels converted in Hertz + """ + + if mel_scale not in ["slaney", "htk"]: + raise ValueError('mel_scale should be one of "htk" or "slaney".') + + if mel_scale == "htk": + return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) + + # Fill in the linear scale + frequency_min = 0.0 + f_sp = 200.0 / 3 + freqs = frequency_min + f_sp * mels + + # And now the nonlinear scale + min_log_hertz = 1000.0 + min_log_mel = (min_log_hertz - frequency_min) / f_sp + logstep = math.log(6.4) / 27.0 + + log_t = mels >= min_log_mel + freqs[log_t] = min_log_hertz * np.exp(logstep * (mels[log_t] - min_log_mel)) + + return freqs + + +@staticmethod +def create_triangular_filterbank( + all_freqs: np.array, + f_pts: np.array, +) -> np.array: + """Create a triangular filter bank. + + + Args: + all_freqs (`np.array`): + STFT freq points of size (`n_freqs`). + f_pts (`np.array`): + Filter mid points of size (`n_filter`). + + Returns: + fb (np.array): + The filter bank of size (`n_freqs`, `n_filter`). + """ + # Adapted from Librosa + # calculate the difference between each filter mid point and each stft freq point in hertz + f_diff = f_pts[1:] - f_pts[:-1] # (n_filter + 1) + slopes = np.expand_dims(f_pts, 0) - np.expand_dims(all_freqs, 1) # (n_freqs, n_filter + 2) + # create overlapping triangles + zero = np.zeros(1) + down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1] # (n_freqs, n_filter) + up_slopes = slopes[:, 2:] / f_diff[1:] # (n_freqs, n_filter) + fb = np.maximum(zero, np.minimum(down_slopes, up_slopes)) + + return fb + + +def get_mel_filter_banks( + n_freqs: int, + frequency_min: float, + frequency_max: float, + n_mels: int, + sample_rate: int, + norm: Optional[str] = None, + mel_scale: str = "htk", +) -> np.array: + """ + Create a frequency bin conversion matrix used to obtain the Mel Spectrogram. This is called a *mel filter + bank*, and various implementation exist, which differ in the number of filters, the shape of the filters, the + way the filters are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The + goal of these features is to approximate the non-linear human perception of the variation in pitch with respect + to the frequency. 
This code is heavily inspired from the *torchaudio* implementation, see + [here](https://pytorch.org/audio/stable/transforms.html) for more details. + + + Note: + Different banks of Mel filters were introduced in the litterature. The following variation are supported: + - MFCC FB-20: introduced in 1980 by Davis and Mermelstein, it assumes a sampling frequency of 10 kHertz + and a speech bandwidth of `[0, 4600]` Hertz + - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) (1995) uses a filter bank of 24 filters for a + speech bandwidth `[0, 8000]` Hertz (sampling rate ≥ 16 kHertz). + - MFCC FB-40: from the Auditory Toolbox for MATLAB written by Slaney in 1998, assumes a sampling rate + of 16 kHertz, and speech bandwidth [133, 6854] Hertz. This version also includes an area normalization. + - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris (2004), assumes sampling + rate of 12.5 kHertz and speech bandwidth [0, 6250] Hertz + The default parameters of `torchaudio`'s mel filterbanks implement the `"htk"` filers while `torchlibrosa` + uses the `"slaney"` implementation. + + Args: + n_freqs (`int`): + Number of frequencies to highlight/apply. + frequency_min (`float`): + Minimum frequency of interest(Hertz). + frequency_max (`float`): + Maximum frequency of interest(Hertz). + n_mels (`int`): + Number of mel filterbanks. + sample_rate (`int`): + Sample rate of the audio waveform + norm (`str`, *optional*): + If "slaney", divide the triangular mel weights by the width of the mel band (area normalization). + mel_scale (`str`, *optional*, `"htk"`): + Scale to use: `htk` or `slaney`. (Default: `htk`) + + Returns: + `numpy.ndarray`: Triangular filter banks (fb matrix) of size (`n_freqs`, `n_mels`) meaning number of + frequencies to highlight/apply to x the number of filterbanks. Each column is a filterbank so that assuming + there is a matrix A of size (..., `n_freqs`), the applied result would be `A * melscale_fbanks(A.size(-1), + ...)`. + + """ + + if norm is not None and norm != "slaney": + raise ValueError('norm must be one of None or "slaney"') + + # freq bins + all_freqs = np.linspace(0, sample_rate // 2, n_freqs) + + # calculate mel freq bins + m_min = hertz_to_mel(frequency_min, mel_scale=mel_scale) + m_max = hertz_to_mel(frequency_max, mel_scale=mel_scale) + + m_pts = np.linspace(m_min, m_max, n_mels + 2) + f_pts = mel_to_hertz(m_pts, mel_scale=mel_scale) + + # create filterbank + filterbank = create_triangular_filterbank(all_freqs, f_pts) + + if norm is not None and norm == "slaney": + # Slaney-style mel is scaled to be approx constant energy per channel + enorm = 2.0 / (f_pts[2 : n_mels + 2] - f_pts[:n_mels]) + filterbank *= np.expand_dims(enorm, 0) + + if (filterbank.max(axis=0) == 0.0).any(): + warnings.warn( + "At least one mel filterbank has all zero values. " + f"The value for `n_mels` ({n_mels}) may be set too high. " + f"Or, the value for `n_freqs` ({n_freqs}) may be set too low." + ) + + return filterbank + + +def _stft(frames: np.array, window: np.array, fft_size: int = None): + """ + Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same + results as `torch.stft`. #TODO @Arthur batching this could allow more usage, good first issue. + + Args: + frames (`np.array` of dimension `(num_frames, self.n_fft)`): + A framed audio signal obtained using `self._fram_wav`. 
+ window (`np.array` of dimension `(self.n_freqs, self.n_mels)`: + A array reprensenting the function that will be used to reduces the amplitude of the discontinuities at + the boundaries of each frame when computing the FFT. Each frame will be multiplied by the window. For + more information on this phenomena, called *Spectral leakage*, refer to [this + tutorial]https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf + fft_size (`int`, *optional*): + Defines the frequency resolution of the Fourier Transform. The number of frequency bins used for dividing the window into equal strips + A bin is a spectrum sample, and defines the frequency resolution of the window. An increase of the FFT size slows the calculus time proportionnally. + """ + frame_size = frames.shape[1] + + if fft_size is None: + fft_size = frame_size + + if fft_size < frame_size: + raise ValueError("FFT size must greater or equal the frame size") + # number of FFT bins to store + num_fft_bins = (fft_size >> 1) + 1 + + data = np.empty((len(frames), num_fft_bins), dtype=np.complex64) + fft_signal = np.zeros(fft_size) + + for f, frame in enumerate(frames): + if window is not None: + np.multiply(frame, window, out=fft_signal[:frame_size]) + else: + fft_signal[:frame_size] = frame + data[f] = fft(fft_signal, axis=0)[:num_fft_bins] + return data.T + + +def _power_to_db(mel_spectrogram, top_db=None, a_min=1e-10, ref=1.0): + """ + Convert a mel spectrogram from power to db scale, this function is the numpy implementation of + librosa.power_to_lb. + + Note: + The motivation behind applying the log function on the mel spectrogram is that humans do not hear loudness + on a linear scale. Generally to double the percieved volume of a sound we need to put 8 times as much + energy into it. This means that large variations in energy may not sound all that different if the sound is + loud to begin with. This compression operation makes the mel features match more closely what humans + actually hear. + + Args: + mel_spectrogram (`np.array`): + Input mel spectrogram. + top_db (`int`, *optional*): + The maximum decibel value. + a_min (`int`, *optional*, default to 1e-10): + TODO + ref (`float`, *optional*, default to 1.0): + TODO + + """ + log_spec = 10 * np.log10(np.clip(mel_spectrogram, a_min=a_min, a_max=None)) + log_spec -= 10.0 * np.log10(np.maximum(a_min, ref)) + if top_db is not None: + if top_db < 0: + raise ValueError("top_db must be non-negative") + log_spec = np.clip(log_spec, min=np.maximum(log_spec) - top_db, max=np.inf) + return log_spec + + +def _fram_wave(waveform: np.array, hop_length: int = 160, n_fft: int = 400, center: bool = True): + """ + In order to compute the short time fourier transform, the waveform needs to be split in overlapping windowed + segments called `frames`. + + The window length (window_length) defines how much of the signal is contained in each frame, while the hop + length defines the step between the beginning of each new frame. + + #TODO @Arthur **This method does not support batching yet as we are mainly focus on inference. If you want this + to be added feel free to open an issue and ping @arthurzucker on Github** + + Args: + waveform (`np.array`) of shape (sample_length,): + The raw waveform which will be split into smaller chunks. + center (`bool`, defaults to `True`): + Whether or not to center each frame around the middle of the frame. Centering is done by reflecting the + waveform on the left and on the right. 
+ + Return: + framed_waveform (`np.array` of shape (`waveform.shape // hop_length , n_fft)`): + The framed waveforms that can be fed to `np.fft`. + """ + frames = [] + for i in range(0, waveform.shape[0] + 1, hop_length): + half_window = (n_fft - 1) // 2 + 1 + if center: + start = i - half_window if i > half_window else 0 + end = i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0] + frame = waveform[start:end] + if start == 0: + padd_width = (-i + half_window, 0) + frame = np.pad(frame, pad_width=padd_width, mode="reflect") + + elif end == waveform.shape[0]: + padd_width = (0, (i - waveform.shape[0] + half_window)) + frame = np.pad(frame, pad_width=padd_width, mode="reflect") + + else: + frame = waveform[i : i + n_fft] + frame_width = frame.shape[0] + if frame_width < waveform.shape[0]: + frame = np.lib.pad(frame, pad_width=(0, n_fft - frame_width), mode="constant", constant_values=0) + frames.append(frame) + + frames = np.stack(frames, 0) + return frames diff --git a/src/transformers/feature_extraction_sequence_utils.py b/src/transformers/feature_extraction_sequence_utils.py index 72f76af53b81..1a241c0d7c80 100644 --- a/src/transformers/feature_extraction_sequence_utils.py +++ b/src/transformers/feature_extraction_sequence_utils.py @@ -15,12 +15,9 @@ """ Sequence feature extraction class for common feature extractors to preprocess sequences. """ -import math -import warnings from typing import Dict, List, Optional, Union import numpy as np -from numpy.fft import fft from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin from .utils import PaddingStrategy, TensorType, is_tf_tensor, is_torch_tensor, logging, to_numpy @@ -238,11 +235,13 @@ def _pad( Pad inputs (on left/right and up to predefined length or max length in the batch) Args: - processed_features: + processed_features (`Union[Dict[str, np.ndarray], BatchFeature]`): Dictionary of input values (`np.ndarray[float]`) / input vectors (`List[np.ndarray[float]]`) or batch of inputs values (`List[np.ndarray[int]]`) / input vectors (`List[np.ndarray[int]]`) - max_length: maximum length of the returned list and optionally padding length (see below) - padding_strategy: PaddingStrategy to use for padding. + max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see below) + padding_strategy (`PaddingStrategy`, *optional*, default to `PaddingStrategy.DO_NOT_PAD`): + PaddingStrategy to use for padding. - PaddingStrategy.LONGEST Pad to the longest sequence in the batch - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) @@ -251,11 +250,12 @@ def _pad( - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences - pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + pad_to_multiple_of (`int`, *optional*): + Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. 
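The helpers moved into the new `audio_utils.py` module above chain together into a log-mel spectrogram roughly as sketched below. The function names and signatures are taken from the hunks above; the Hann window, the parameter values and the import path are illustrative choices, and the stray `@staticmethod` decorators on the module-level helpers are assumed to be dropped (as written they would make the internal calls inside `get_mel_filter_banks` fail on Python older than 3.10).

```python
# Hedged sketch: waveform -> frames -> STFT -> mel projection -> decibel scale.
import numpy as np

from transformers.audio_utils import _fram_wave, _power_to_db, _stft, get_mel_filter_banks

sampling_rate, n_fft, hop_length, n_mels = 16_000, 1024, 320, 64
waveform = np.random.uniform(-1.0, 1.0, size=(sampling_rate,)).astype(np.float32)

# 1. Split the waveform into overlapping, centered frames of `n_fft` samples each.
frames = _fram_wave(waveform, hop_length=hop_length, n_fft=n_fft, center=True)

# 2. Windowed FFT of every frame -> complex spectrogram of shape (n_fft // 2 + 1, num_frames).
spectrogram = _stft(frames, window=np.hanning(n_fft))
power_spectrogram = np.abs(spectrogram) ** 2

# 3. Project the linear frequency bins onto `n_mels` triangular mel filters.
mel_filters = get_mel_filter_banks(
    n_freqs=n_fft // 2 + 1,
    frequency_min=0.0,
    frequency_max=sampling_rate // 2,
    n_mels=n_mels,
    sample_rate=sampling_rate,
    norm=None,
    mel_scale="htk",
)  # shape: (n_freqs, n_mels)
mel_spectrogram = mel_filters.T @ power_spectrogram  # (n_mels, num_frames)

# 4. Compress to a decibel scale, which better matches human loudness perception.
log_mel_spectrogram = _power_to_db(mel_spectrogram)
```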
- return_attention_mask: - (optional) Set to False to avoid returning attention mask (default: set to model specifics) + return_attention_mask (`bool`, *optional*): + Set to False to avoid returning attention mask (default: set to model specifics) """ required_input = processed_features[self.model_input_names[0]] @@ -306,16 +306,16 @@ def _truncate( Truncate inputs to predefined length or max length in the batch Args: - processed_features: + processed_features(`Union[Dict[str, np.ndarray], BatchFeature]`): Dictionary of input values (`np.ndarray[float]`) / input vectors (`List[np.ndarray[float]]`) or batch of inputs values (`List[np.ndarray[int]]`) / input vectors (`List[np.ndarray[int]]`) - max_length: + max_length (`int`, *optional*): maximum length of the returned list and optionally padding length (see below) - pad_to_multiple_of (optional) : + pad_to_multiple_of (`int`, *optional*) : Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. - truncation (optional): + truncation (`bool`, *optional*): Activates truncation to cut input sequences longer than `max_length` to `max_length`. """ if not truncation: @@ -369,297 +369,3 @@ def _get_padding_strategies(self, padding=False, max_length=None): ) return padding_strategy - - @staticmethod - def hz_to_mel(freq: float, mel_scale: str = "htk") -> float: - """Convert Hz to Mels. - - Args: - freqs (`float`): - Frequencies in Hz - mel_scale (`str`, *optional*, defaults to `"htk"`): - Scale to use, `htk` or `slaney`. - - Returns: - mels (`float`): - Frequency in Mels - """ - - if mel_scale not in ["slaney", "htk"]: - raise ValueError('mel_scale should be one of "htk" or "slaney".') - - if mel_scale == "htk": - return 2595.0 * math.log10(1.0 + (freq / 700.0)) - - # Fill in the linear part - frequency_min = 0.0 - f_sp = 200.0 / 3 - - mels = (freq - frequency_min) / f_sp - - # Fill in the log-scale part - min_log_hz = 1000.0 - min_log_mel = (min_log_hz - frequency_min) / f_sp - logstep = math.log(6.4) / 27.0 - - if freq >= min_log_hz: - mels = min_log_mel + math.log(freq / min_log_hz) / logstep - - return mels - - @staticmethod - def mel_to_hz(mels: np.array, mel_scale: str = "htk") -> np.array: - """Convert mel bin numbers to frequencies. - - Args: - mels (`np.array`): - Mel frequencies - mel_scale (`str`, *optional*, `"htk"`): - Scale to use: `htk` or `slaney`. - - Returns: - freqs (`np.array`): - Mels converted in Hz - """ - - if mel_scale not in ["slaney", "htk"]: - raise ValueError('mel_scale should be one of "htk" or "slaney".') - - if mel_scale == "htk": - return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) - - # Fill in the linear scale - frequency_min = 0.0 - f_sp = 200.0 / 3 - freqs = frequency_min + f_sp * mels - - # And now the nonlinear scale - min_log_hz = 1000.0 - min_log_mel = (min_log_hz - frequency_min) / f_sp - logstep = math.log(6.4) / 27.0 - - log_t = mels >= min_log_mel - freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel)) - - return freqs - - @staticmethod - def create_triangular_filterbank( - all_freqs: np.array, - f_pts: np.array, - ) -> np.array: - """Create a triangular filter bank. - - - Args: - all_freqs (`np.array`): - STFT freq points of size (`n_freqs`). - f_pts (`np.array`): - Filter mid points of size (`n_filter`). 
- - Returns: - fb (np.array): - The filter bank of size (`n_freqs`, `n_filter`). - """ - # Adapted from Librosa - # calculate the difference between each filter mid point and each stft freq point in hertz - f_diff = f_pts[1:] - f_pts[:-1] # (n_filter + 1) - slopes = np.expand_dims(f_pts, 0) - np.expand_dims(all_freqs, 1) # (n_freqs, n_filter + 2) - # create overlapping triangles - zero = np.zeros(1) - down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1] # (n_freqs, n_filter) - up_slopes = slopes[:, 2:] / f_diff[1:] # (n_freqs, n_filter) - fb = np.maximum(zero, np.minimum(down_slopes, up_slopes)) - - return fb - - def get_mel_filter_banks( - self, - n_freqs: int, - frequency_min: float, - frequency_max: float, - n_mels: int, - sample_rate: int, - norm: Optional[str] = None, - mel_scale: str = "htk", - ) -> np.array: - """ - Create a frequency bin conversion matrix used to obtain the Mel Spectrogram. This is called a *mel filter - bank*, and various implementation exist, which differ in the number of filters, the shape of the filters, the - way the filters are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The - goal of these features is to approximate the non-linear human perception of the variation in pitch with respect - to the frequency. This code is heavily inspired from the *torchaudio* implementation, see - [here](https://pytorch.org/audio/stable/transforms.html) for more details. - - - Note: - Different banks of MEL filters were introduced in the litterature. The following variation are supported: - - MFCC FB-20: introduced in 1980 by Davis and Mermelstein, it assumes a sampling frequency of 10 kHz - and a speech bandwidth of `[0, 4600]` Hz - - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) (1995) uses a filter bank of 24 filters for a - speech bandwidth `[0, 8000]` Hz (sampling rate ≥ 16 kHz). - - MFCC FB-40: from the Auditory Toolbox for MATLAB written by Slaney in 1998, assumes a sampling rate - of 16 kHz, and speech bandwidth [133, 6854] Hz. This version also includes an area normalization. - - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris (2004), assumes sampling - rate of 12.5 kHz and speech bandwidth [0, 6250] Hz - The default parameters of `torchaudio`'s mel filterbanks implement the `"htk"` filers while `torchlibrosa` - uses the `"slaney"` implementation. - - Args: - n_freqs (`int`): - Number of frequencies to highlight/apply. - frequency_min (`float`): - Minimum frequency of interest(Hz). - frequency_max (`float`): - Maximum frequency of interest(Hz). - n_mels (`int`): - Number of mel filterbanks. - sample_rate (`int`): - Sample rate of the audio waveform - norm (`str`, *optional*): - If "slaney", divide the triangular mel weights by the width of the mel band (area normalization). - mel_scale (`str`, *optional*, `"htk"`): - Scale to use: `htk` or `slaney`. (Default: `htk`) - - Returns: - `numpy.ndarray`: Triangular filter banks (fb matrix) of size (`n_freqs`, `n_mels`) meaning number of - frequencies to highlight/apply to x the number of filterbanks. Each column is a filterbank so that assuming - there is a matrix A of size (..., `n_freqs`), the applied result would be `A * melscale_fbanks(A.size(-1), - ...)`. 
- - """ - - if norm is not None and norm != "slaney": - raise ValueError('norm must be one of None or "slaney"') - - # freq bins - all_freqs = np.linspace(0, sample_rate // 2, n_freqs) - - # calculate mel freq bins - m_min = self.hz_to_mel(frequency_min, mel_scale=mel_scale) - m_max = self.hz_to_mel(frequency_max, mel_scale=mel_scale) - - m_pts = np.linspace(m_min, m_max, n_mels + 2) - f_pts = self.mel_to_hz(m_pts, mel_scale=mel_scale) - - # create filterbank - filterbank = self.create_triangular_filterbank(all_freqs, f_pts) - - if norm is not None and norm == "slaney": - # Slaney-style mel is scaled to be approx constant energy per channel - enorm = 2.0 / (f_pts[2 : n_mels + 2] - f_pts[:n_mels]) - filterbank *= np.expand_dims(enorm, 0) - - if (filterbank.max(axis=0) == 0.0).any(): - warnings.warn( - "At least one mel filterbank has all zero values. " - f"The value for `n_mels` ({n_mels}) may be set too high. " - f"Or, the value for `n_freqs` ({n_freqs}) may be set too low." - ) - - return filterbank - - def _stft(self, frames, window): - """ - Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same - results as `torch.stft`. #TODO @Arthur batching this could allow more usage, good first issue. - - Args: - frames (`np.array` of dimension `(num_frames, self.n_fft)`): - A framed audio signal obtained using `self._fram_wav`. - window (`np.array` of dimension `(self.n_freqs, self.n_mels)`: - A array reprensenting the function that will be used to reduces the amplitude of the discontinuities at - the boundaries of each frame when computing the FFT. Each frame will be multiplied by the window. For - more information on this phenomena, called *Spectral leakage*, refer to [this - tutorial]https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf - """ - frame_size = frames.shape[1] - fft_size = self.n_fft - - if fft_size is None: - fft_size = frame_size - - if fft_size < frame_size: - raise ValueError("FFT size must greater or equal the frame size") - # number of FFT bins to store - num_fft_bins = (fft_size >> 1) + 1 - - data = np.empty((len(frames), num_fft_bins), dtype=np.complex64) - fft_signal = np.zeros(fft_size) - - for f, frame in enumerate(frames): - if window is not None: - np.multiply(frame, window, out=fft_signal[:frame_size]) - else: - fft_signal[:frame_size] = frame - data[f] = fft(fft_signal, axis=0)[:num_fft_bins] - return data.T - - def _power_to_db(self, mel_spectrogram, a_min=1e-10, ref=1.0): - """ - Convert a mel spectrogram from power to db scale, this function is the numpy implementation of - librosa.power_to_lb. - - Note: - The motivation behind applying the log function on the mel spectrogram is that humans do not hear loudness - on a linear scale. Generally to double the percieved volume of a sound we need to put 8 times as much - energy into it. This means that large variations in energy may not sound all that different if the sound is - loud to begin with. This compression operation makes the mel features match more closely what humans - actually hear. 
- """ - log_spec = 10 * np.log10(np.clip(mel_spectrogram, a_min=a_min, a_max=None)) - log_spec -= 10.0 * np.log10(np.maximum(a_min, ref)) - if self.top_db is not None: - if self.top_db < 0: - raise ValueError("top_db must be non-negative") - log_spec = np.clip(log_spec, min=np.maximum(log_spec) - self.top_db, max=np.inf) - return log_spec - - def _fram_wave(self, waveform: np.array, center: bool = True): - """ - In order to compute the short time fourier transform, the waveform needs to be split in overlapping windowed - segments called `frames`. - - The window length (self.window_length) defines how much of the signal is contained in each frame, while the hop - length defines the step between the beginning of each new frame. - - #TODO @Arthur **This method does not support batching yet as we are mainly focus on inference. If you want this - to be added feel free to open an issue and ping @arthurzucker on Github** - - Args: - waveform (`np.array`) of shape (sample_length,): - The raw waveform which will be split into smaller chunks. - center (`bool`, defaults to `True`): - Whether or not to center each frame around the middle of the frame. Centering is done by reflecting the - waveform on the left and on the right. - - Return: - framed_waveform (`np.array` of shape (`waveform.shape // self.hop_length , self.n_fft)`): - The framed waveforms that can be fed to `np.fft`. - """ - frames = [] - for i in range(0, waveform.shape[0] + 1, self.hop_length): - half_window = (self.n_fft - 1) // 2 + 1 - if center: - start = i - half_window if i > half_window else 0 - end = i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0] - frame = waveform[start:end] - if start == 0: - padd_width = (-i + half_window, 0) - frame = np.pad(frame, pad_width=padd_width, mode="reflect") - - elif end == waveform.shape[0]: - padd_width = (0, (i - waveform.shape[0] + half_window)) - frame = np.pad(frame, pad_width=padd_width, mode="reflect") - - else: - frame = waveform[i : i + self.n_fft] - frame_width = frame.shape[0] - if frame_width < waveform.shape[0]: - frame = np.lib.pad( - frame, pad_width=(0, self.n_fft - frame_width), mode="constant", constant_values=0 - ) - frames.append(frame) - - frames = np.stack(frames, 0) - return frames diff --git a/src/transformers/models/clap/__init__.py b/src/transformers/models/clap/__init__.py index 62ed3c0056d6..e7bd202888da 100644 --- a/src/transformers/models/clap/__init__.py +++ b/src/transformers/models/clap/__init__.py @@ -1,7 +1,3 @@ -# flake8: noqa -# There's no way to ignore "F401 '...' imported but unused" warnings in this -# module, but to preserve other warnings. So, don't check this module at all. - # Copyright 2023 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index d3778520b551..b5b3e0a71ba8 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -37,11 +37,11 @@ class ClapFeatureExtractor(SequenceFeatureExtractor): refer to this superclass for more information regarding those methods. This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time - Fourier Transform` which should match pytorch's `torch.stft` equivalent. + Fourier Transform` (STFT) which should match pytorch's `torch.stft` equivalent. 
Args: feature_size (`int`, defaults to 80): - The feature dimension of the extracted MEL spectrograms. This corresponds to the number of frequency bins + The feature dimension of the extracted Mel spectrograms. This corresponds to the number of frequency bins (intervals) that are computed, for each Fourier step. sampling_rate (`int`, defaults to 16000): The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). This only serves @@ -49,23 +49,23 @@ class ClapFeatureExtractor(SequenceFeatureExtractor): hop_length (`int`, defaults to 160): Length of the overlaping windows for the STFT used to obtain the Mel Spectrogram. The audio will be split in smaller `frames` with a step of `hop_length` between each frame. - chunk_length_s (`int`, defaults to 10): + max_length_s (`int`, defaults to 10): The maximum input lenght of the model in seconds. This is used to pad the audio. n_fft (`int`, defaults to 400): Size of the Fourier transform. This should be the length of a single frame in samples. 400 means that the fourrier transform is computed on 400 samples. padding_value (`float`, *optional*, defaults to 0.0): Padding value used to pad the audio. Should correspond to silences. - return_attention_mask (`bool`, *optional*, False): + return_attention_mask (`bool`, *optional*, defaults to `False`): Whether or not the model should return the attention masks coresponding to the input. - frequency_min (`float`, *optional*, 0): + frequency_min (`float`, *optional*, default to 0): The lowest frequency of interest. The STFT will not be computed for values below this. - frequency_max (`float`, *optional*, 14_000): + frequency_max (`float`, *optional*, default to 14_000): The highest frequency of interest. The STFT will not be computed for values above this. top_db (`float`, *optional*): The highest decibel value used to convert the mel spectrogram to the log scale. For more details see the `SequenceFeatureExtractor._power_to_db` function - truncation (`str`, *optional*, `"fusions"`): + truncation (`str`, *optional*, default to `"fusions"`): Truncation pattern for long audio inputs. Two patterns are available: - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and a downsampled version of the entire mel spectrogram. 
@@ -86,7 +86,7 @@ def __init__( feature_size=80, sampling_rate=48_000, hop_length=480, - chunk_length_s=10, + max_length_s=10, n_fft=400, padding_value=0.0, return_attention_mask=False, # pad inputs to max length with silence token (zero) and no attention mask @@ -109,8 +109,8 @@ def __init__( self.padding = padding self.n_fft = n_fft self.hop_length = hop_length - self.chunk_length_s = chunk_length_s - self.nb_max_samples = chunk_length_s * sampling_rate + self.max_length_s = max_length_s + self.nb_max_samples = max_length_s * sampling_rate self.sampling_rate = sampling_rate self.frequency_min = frequency_min self.frequency_max = frequency_max diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index f9181756673b..9079e097edfb 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -913,7 +913,7 @@ def __init__(self, config): self.num_features = int(config.hidden_size * 2 ** (self.num_layers - 1)) self.freq_ratio = config.spec_size // config.num_mel_bins - drop_path_rate = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))] + drop_path = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))] self.input_resolutions = [(grid_size[0] // (2**i), grid_size[1] // (2**i)) for i in range(self.num_layers)] @@ -921,12 +921,12 @@ def __init__(self, config): [ ClapAudioStage( config=config, - dim=int(config.hidden_size * 2**i_layer), - input_resolution=self.input_resolutions[i_layer], - depth=config.depths[i_layer], - num_heads=config.num_attention_heads[i_layer], - drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], - downsample=ClapAudioPatchMerging if (i_layer < self.num_layers - 1) else None, + dim=int(config.hidden_size * 2**layer), + input_resolution=self.input_resolutions[layer], + depth=config.depths[layer], + num_heads=config.num_attention_heads[layer], + drop_path=drop_path, + downsample=ClapAudioPatchMerging if (layer < self.num_layers - 1) else None, ) for layer in range(self.num_layers) ] From 2d57cfc657b86a5d78d1069f5b138efbb6ccb83b Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 10:40:15 +0000 Subject: [PATCH 147/197] style --- src/transformers/feature_extraction_sequence_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/feature_extraction_sequence_utils.py b/src/transformers/feature_extraction_sequence_utils.py index 1a241c0d7c80..831d30e39026 100644 --- a/src/transformers/feature_extraction_sequence_utils.py +++ b/src/transformers/feature_extraction_sequence_utils.py @@ -251,9 +251,9 @@ def _pad( - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences pad_to_multiple_of (`int`, *optional*): - Integer if set will pad the sequence to a multiple of the provided value. - This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability - `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. + Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to + enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta), or on TPUs + which benefit from having sequence lengths be a multiple of 128. 
return_attention_mask (`bool`, *optional*): Set to False to avoid returning attention mask (default: set to model specifics) """ From 441e24760647a6ecac919ec1e6e0a65219384434 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 13 Feb 2023 10:40:47 +0000 Subject: [PATCH 148/197] more comments --- src/transformers/models/clap/modeling_clap.py | 31 ++++++++++++------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 63741aa07b00..a39ff5f89834 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -170,12 +170,14 @@ class ClapAudioModelOutput(ModelOutput): ClapAudio model output to mimic the output of the original implementation. Args: - framewise_output (`torch.FloatTensor` of shape `(batch_size, num_frames, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - clipwise_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - fine_grained_embedding (`torch.FloatTensor` of shape `(batch_size, num_frames, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. + framewise_output (`torch.FloatTensor` of shape `(batch_size, reshaped_hidden_size, num_classes)`): + Output hidden_states that are interpolated after applying sigmoid. These logits are used to compute the + the classification label in the original implementation. + clipwise_output (`torch.FloatTensor` of shape `(batch_size, hidden_size, num_classes)`): + Output hidden_states after applying sigmoid. These logits are used to compute the + the classification label in the original implementation. + fine_grained_embedding (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): + Pooled interpolated hidden_states. embedding (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): @@ -207,12 +209,14 @@ class ClapAudioModelOutputWithProjection(ModelOutput): Args: audio_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The audio embeddings obtained by applying the projection layer to the pooler_output. - framewise_output (`torch.FloatTensor` of shape `(batch_size, num_frames, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - clipwise_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - fine_grained_embedding (`torch.FloatTensor` of shape `(batch_size, num_frames, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. + framewise_output (`torch.FloatTensor` of shape `(batch_size, reshaped_hidden_size, num_classes)`): + Output hidden_states that are interpolated after applying sigmoid. These logits are used to compute the + the classification label in the original implementation. + clipwise_output (`torch.FloatTensor` of shape `(batch_size, hidden_size, num_classes)`): + Output hidden_states after applying sigmoid. These logits are used to compute the + the classification label in the original implementation. 
+ fine_grained_embedding (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): + Pooled interpolated hidden_states. embedding (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): @@ -1175,6 +1179,9 @@ def custom_forward(*inputs): input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Input audio features. This should be returnes by the [`ClapFeatureExtractor`] class that you can also retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details. + is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*): + Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance + the features. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. From 51b3ddedc610d0f8abcb2a7fc33b3a59c9405774 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 13 Feb 2023 10:42:50 +0000 Subject: [PATCH 149/197] cleanup --- src/transformers/models/clap/modeling_clap.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index a39ff5f89834..a3d914e7cf4f 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1174,7 +1174,7 @@ def custom_forward(*inputs): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ -Clap_AUDIO_INPUTS_DOCSTRING = r""" +CLAP_AUDIO_INPUTS_DOCSTRING = r""" Args: input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Input audio features. This should be returnes by the [`ClapFeatureExtractor`] class that you can also @@ -1192,7 +1192,7 @@ def custom_forward(*inputs): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ -Clap_INPUTS_DOCSTRING = r""" +CLAP_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. 
Padding will be ignored by default should you provide @@ -1888,7 +1888,7 @@ def __init__(self, config: ClapAudioConfig): def get_input_embeddings(self) -> nn.Module: return self.audio_encoder.patch_embed.proj - @add_start_docstrings_to_model_forward(Clap_AUDIO_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(CLAP_AUDIO_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=ClapAudioConfig) def forward( self, @@ -2190,7 +2190,7 @@ def get_text_features( return text_features - @add_start_docstrings_to_model_forward(Clap_AUDIO_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(CLAP_AUDIO_INPUTS_DOCSTRING) def get_audio_features( self, input_features: Optional[torch.Tensor] = None, @@ -2220,7 +2220,7 @@ def get_audio_features( return audio_features - @add_start_docstrings_to_model_forward(Clap_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(CLAP_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=ClapOutput, config_class=ClapConfig) def forward( self, @@ -2414,7 +2414,7 @@ def __init__(self, config: ClapAudioConfig): def get_input_embeddings(self) -> nn.Module: return self.audio_model.audio_encoder.patch_embed.proj - @add_start_docstrings_to_model_forward(Clap_AUDIO_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(CLAP_AUDIO_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=ClapAudioModelOutput, config_class=ClapAudioConfig) def forward( self, From 8d5e5ad34bec36d2492cd95d04f4a5be1d16c4dd Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 13 Feb 2023 10:45:57 +0000 Subject: [PATCH 150/197] clean up --- src/transformers/models/clap/configuration_clap.py | 4 ++-- src/transformers/models/clap/modeling_clap.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 393a32841642..d1eb6151def8 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -31,7 +31,7 @@ class ClapTextConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`ClapTextModel`] or a [`TFClapTextModel`]. It is + This is the configuration class to store the configuration of a [`ClapTextModel`]. It is used to instantiate a RoBERTa model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the RoBERTa [roberta-base](https://huggingface.co/roberta-base) architecture. @@ -63,7 +63,7 @@ class ClapTextConfig(PretrainedConfig): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). type_vocab_size (`int`, *optional*, defaults to 2): - The vocabulary size of the `token_type_ids` passed when calling [`ClapTextModel`] or [`TFClapTextModel`]. + The vocabulary size of the `token_type_ids` passed when calling [`ClapTextModel`]. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
layer_norm_eps (`float`, *optional*, defaults to 1e-12): diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index a3d914e7cf4f..39115ba01164 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -936,7 +936,7 @@ def __init__(self, config): def reshape_mel2img(self, normalixed_input_features): """ The input is 4 normalized log mel spectrograms. It is reshape to the common shape of images. Each channel - should represent 1 of the 4 crops of the spectrogram. For more details, refer to the `ClapFeatureExtracor`. + should represent 1 of the 4 crops of the spectrogram. For more details, refer to the [`ClapFeatureExtractor`]. """ _, _, time_steps, freq_steps = normalixed_input_features.shape From 38ce551c63df048f7a993ffdd57e098f2414f23e Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 10:46:04 +0000 Subject: [PATCH 151/197] style --- docs/source/en/model_doc/clap.mdx | 2 +- src/transformers/audio_utils.py | 53 +++++++++---------- .../models/clap/feature_extraction_clap.py | 14 ++--- 3 files changed, 34 insertions(+), 35 deletions(-) diff --git a/docs/source/en/model_doc/clap.mdx b/docs/source/en/model_doc/clap.mdx index 29211f3495d6..fa9abacbafe6 100644 --- a/docs/source/en/model_doc/clap.mdx +++ b/docs/source/en/model_doc/clap.mdx @@ -17,7 +17,7 @@ specific language governing permissions and limitations under the License. The CLAP model was proposed in [Large Scale Constrastive Laungaue-Audio pretraining with feature fusion and keyword-to-caption augmentation](https://arxiv.org/pdf/2211.06687.pdf) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. -The Clap model uses a SWINTransformer on the input fused mel spectrogram as the audio encoder, and a ROBerta model for the text emcoder. +CLAP (Constrastive Laungaue-Audio Pretraining) is a neural network trained on a variety of (audio, text) pairs. It can be instructed in to predict the most relevant text snippet, given an audio, without directly optimizing for the task. The CLAP model uses a SWINTransformer to get audio features from a log-Mel spectrogram input, and a RoBERTa model to get text features. Both the text and audio features are then projected to a latent space with identical dimension. The dot product between the projected audio and text features is then used as a similar score. The abstract from the paper is the following: diff --git a/src/transformers/audio_utils.py b/src/transformers/audio_utils.py index 8edd5eabb3d9..58f73761babc 100644 --- a/src/transformers/audio_utils.py +++ b/src/transformers/audio_utils.py @@ -13,8 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """ - Audio processing functions to extract feature from a raw audio. Should all be in numpy to support - all frameworks, and remmove unecessary dependencies. + Audio processing functions to extract feature from a raw audio. Should all be in numpy to support all frameworks, and + remmove unecessary dependencies. """ import math import warnings @@ -139,11 +139,11 @@ def get_mel_filter_banks( mel_scale: str = "htk", ) -> np.array: """ - Create a frequency bin conversion matrix used to obtain the Mel Spectrogram. 
This is called a *mel filter - bank*, and various implementation exist, which differ in the number of filters, the shape of the filters, the - way the filters are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The - goal of these features is to approximate the non-linear human perception of the variation in pitch with respect - to the frequency. This code is heavily inspired from the *torchaudio* implementation, see + Create a frequency bin conversion matrix used to obtain the Mel Spectrogram. This is called a *mel filter bank*, + and various implementation exist, which differ in the number of filters, the shape of the filters, the way the + filters are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these + features is to approximate the non-linear human perception of the variation in pitch with respect to the frequency. + This code is heavily inspired from the *torchaudio* implementation, see [here](https://pytorch.org/audio/stable/transforms.html) for more details. @@ -157,8 +157,8 @@ def get_mel_filter_banks( of 16 kHertz, and speech bandwidth [133, 6854] Hertz. This version also includes an area normalization. - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris (2004), assumes sampling rate of 12.5 kHertz and speech bandwidth [0, 6250] Hertz - The default parameters of `torchaudio`'s mel filterbanks implement the `"htk"` filers while `torchlibrosa` - uses the `"slaney"` implementation. + The default parameters of `torchaudio`'s mel filterbanks implement the `"htk"` filers while `torchlibrosa` uses + the `"slaney"` implementation. Args: n_freqs (`int`): @@ -217,20 +217,21 @@ def get_mel_filter_banks( def _stft(frames: np.array, window: np.array, fft_size: int = None): """ - Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same - results as `torch.stft`. #TODO @Arthur batching this could allow more usage, good first issue. + Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same results + as `torch.stft`. #TODO @Arthur batching this could allow more usage, good first issue. Args: frames (`np.array` of dimension `(num_frames, self.n_fft)`): A framed audio signal obtained using `self._fram_wav`. window (`np.array` of dimension `(self.n_freqs, self.n_mels)`: - A array reprensenting the function that will be used to reduces the amplitude of the discontinuities at - the boundaries of each frame when computing the FFT. Each frame will be multiplied by the window. For - more information on this phenomena, called *Spectral leakage*, refer to [this + A array reprensenting the function that will be used to reduces the amplitude of the discontinuities at the + boundaries of each frame when computing the FFT. Each frame will be multiplied by the window. For more + information on this phenomena, called *Spectral leakage*, refer to [this tutorial]https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf fft_size (`int`, *optional*): - Defines the frequency resolution of the Fourier Transform. The number of frequency bins used for dividing the window into equal strips - A bin is a spectrum sample, and defines the frequency resolution of the window. An increase of the FFT size slows the calculus time proportionnally. + Defines the frequency resolution of the Fourier Transform. 
The number of frequency bins used for dividing + the window into equal strips A bin is a spectrum sample, and defines the frequency resolution of the + window. An increase of the FFT size slows the calculus time proportionnally. """ frame_size = frames.shape[1] @@ -256,15 +257,13 @@ def _stft(frames: np.array, window: np.array, fft_size: int = None): def _power_to_db(mel_spectrogram, top_db=None, a_min=1e-10, ref=1.0): """ - Convert a mel spectrogram from power to db scale, this function is the numpy implementation of - librosa.power_to_lb. + Convert a mel spectrogram from power to db scale, this function is the numpy implementation of librosa.power_to_lb. Note: - The motivation behind applying the log function on the mel spectrogram is that humans do not hear loudness - on a linear scale. Generally to double the percieved volume of a sound we need to put 8 times as much - energy into it. This means that large variations in energy may not sound all that different if the sound is - loud to begin with. This compression operation makes the mel features match more closely what humans - actually hear. + The motivation behind applying the log function on the mel spectrogram is that humans do not hear loudness on a + linear scale. Generally to double the percieved volume of a sound we need to put 8 times as much energy into + it. This means that large variations in energy may not sound all that different if the sound is loud to begin + with. This compression operation makes the mel features match more closely what humans actually hear. Args: mel_spectrogram (`np.array`): @@ -291,11 +290,11 @@ def _fram_wave(waveform: np.array, hop_length: int = 160, n_fft: int = 400, cent In order to compute the short time fourier transform, the waveform needs to be split in overlapping windowed segments called `frames`. - The window length (window_length) defines how much of the signal is contained in each frame, while the hop - length defines the step between the beginning of each new frame. + The window length (window_length) defines how much of the signal is contained in each frame, while the hop length + defines the step between the beginning of each new frame. - #TODO @Arthur **This method does not support batching yet as we are mainly focus on inference. If you want this - to be added feel free to open an issue and ping @arthurzucker on Github** + #TODO @Arthur **This method does not support batching yet as we are mainly focus on inference. 
If you want this to + be added feel free to open an issue and ping @arthurzucker on Github** Args: waveform (`np.array`) of shape (sample_length,): diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index b5b3e0a71ba8..51c04710ec1e 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -22,9 +22,9 @@ from ...feature_extraction_sequence_utils import SequenceFeatureExtractor from ...feature_extraction_utils import BatchFeature -from ...image_transforms import np_bilinear_resize +from ...image_transforms import np_bilinear_resize #TODO this has to be removed from ...utils import TensorType, logging - +from ...audio_utils import get_mel_filter_banks, _fram_wave, _power_to_db, _stft logger = logging.get_logger(__name__) @@ -114,7 +114,7 @@ def __init__( self.sampling_rate = sampling_rate self.frequency_min = frequency_min self.frequency_max = frequency_max - self.mel_filters = self.get_mel_filter_banks( + self.mel_filters = get_mel_filter_banks( n_freqs=int(1 + n_fft // 2), n_mels=feature_size, frequency_min=frequency_min, @@ -123,7 +123,7 @@ def __init__( norm=None, mel_scale="htk", ) - self.mel_filters_slaney = self.get_mel_filter_banks( + self.mel_filters_slaney = get_mel_filter_banks( n_freqs=int(1 + n_fft // 2), n_mels=feature_size, frequency_min=frequency_min, @@ -161,12 +161,12 @@ def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[n implementation when the truncation mode is not `"fusion"`. """ window = np.hanning(self.n_fft + 1)[:-1] - frames = self._fram_wave(waveform) - stft = self._stft(frames, window=window) + frames = _fram_wave(waveform) + stft = _stft(frames, window=window) magnitudes = np.abs(stft) ** 2 mel_spec = np.matmul(mel_filters.T, magnitudes) - log_mel_spec = self._power_to_db(mel_spec).T + log_mel_spec = _power_to_db(mel_spec).T log_mel_spec = np.asarray(log_mel_spec, np.float32) return log_mel_spec From aa8594143542876c6aae413f39742a7ca3ce2de4 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 13 Feb 2023 10:49:50 +0000 Subject: [PATCH 152/197] apply suggestions --- src/transformers/models/clap/feature_extraction_clap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 51c04710ec1e..33fe164e84a3 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -33,7 +33,7 @@ class ClapFeatureExtractor(SequenceFeatureExtractor): r""" Constructs a Clap feature extractor. - This feature extractor inherits from [`ClapFeatureExtractor`] which contains most of the main methods. Users should + This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. 
This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time From a45b2218e281f138f8f355d5816a286b0b35d7a9 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 10:56:33 +0000 Subject: [PATCH 153/197] Empty commit From b9ee298cd8f404df77129eb3a18f5ec15c6b556e Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 11:05:36 +0000 Subject: [PATCH 154/197] pipeline will be added in a different PR --- docs/source/en/main_classes/pipelines.mdx | 5 - .../zero_shot_audio_classification.py | 163 ------------------ ...ipelines_zero_shot_audio_classification.py | 73 -------- 3 files changed, 241 deletions(-) delete mode 100644 src/transformers/pipelines/zero_shot_audio_classification.py delete mode 100644 tests/pipelines/test_pipelines_zero_shot_audio_classification.py diff --git a/docs/source/en/main_classes/pipelines.mdx b/docs/source/en/main_classes/pipelines.mdx index 96bae3530e2c..e5ee3902028e 100644 --- a/docs/source/en/main_classes/pipelines.mdx +++ b/docs/source/en/main_classes/pipelines.mdx @@ -314,11 +314,6 @@ Pipelines available for audio tasks include the following. - __call__ - all -### ZeroShotAudioClassificationPipeline - -[[autodoc]] ZeroShotAudioClassificationPipeline - - __call__ - - all ## Computer vision Pipelines available for computer vision tasks include the following. diff --git a/src/transformers/pipelines/zero_shot_audio_classification.py b/src/transformers/pipelines/zero_shot_audio_classification.py deleted file mode 100644 index 7ac9eeb6b8c8..000000000000 --- a/src/transformers/pipelines/zero_shot_audio_classification.py +++ /dev/null @@ -1,163 +0,0 @@ -from typing import Union - -import numpy as np -import requests - -from ..utils import ( - add_end_docstrings, - is_torch_available, - logging, -) -from .audio_classification import ffmpeg_read -from .base import PIPELINE_INIT_ARGS, ChunkPipeline - - -if is_torch_available(): - import torch - - -logger = logging.get_logger(__name__) - - -@add_end_docstrings(PIPELINE_INIT_ARGS) -class ZeroShotAudioClassificationPipeline(ChunkPipeline): - """ - Zero shot audio classification pipeline using `ClapModel`. This pipeline predicts the class of an audio when you - provide an audio and a set of `candidate_labels`. - - Example: - - ```python - >>> from transformers import pipeline - >>> from datasets import load_dataset - - >>> dataset = load_dataset("ashraq/esc50") - >>> audio = next(iter(dataset["train"]["audio"]))["array"] - - >>> classifier = pipeline(task="zero-shot-audio-classification", model="laion-ai/clap-hsat-tiny") - >>> classifier( - ... audio, - ... candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"], - ... ) - [{'score': 0.999727189540863, 'label': 'Sound of a dog'}, {'score': 0.0002727957325987518, 'label': 'Sound of vaccum cleaner'}] - ``` - - Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) - - This audio classification pipeline can currently be loaded from [`pipeline`] using the following task identifier: - `"zero-shot-audio-classification"`. - - See the list of available models on - [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-audio-classification). 
- """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - if self.framework != "pt": - raise ValueError(f"The {self.__class__} is only available in PyTorch.") - # No specific FOR_XXX available yet - # self.check_model_type(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING) - - def __call__( - self, - audios: Union[np.ndarray, bytes, str], - **kwargs, - ): - """ - Assign labels to the audio(s) passed as inputs. - - Args: - audios (`str`, `List[str]`, `np.array` or `List[np.array]`): - The pipeline handles three types of inputs: - - - A string containing a http link pointing to an audio - - A string containing a local path to an audio - - An audio loaded in numpy - - candidate_labels (`List[str]`): - The candidate labels for this audio - - hypothesis_template (`str`, *optional*, defaults to `"This is a sound of {}"`): - The sentence used in cunjunction with *candidate_labels* to attempt the audio classification by - replacing the placeholder with the candidate_labels. Then likelihood is estimated by using - logits_per_audio - - Return: - A list of dictionaries containing result, one dictionary per proposed label. The dictionaries contain the - following keys: - - - **label** (`str`) -- The label identified by the model. It is one of the suggested `candidate_label`. - - **score** (`float`) -- The score attributed by the model for that label (between 0 and 1). - """ - return super().__call__(audios, **kwargs) - - def _sanitize_parameters(self, **kwargs): - preprocess_params = {} - if "candidate_labels" in kwargs: - preprocess_params["candidate_labels"] = kwargs["candidate_labels"] - if "hypothesis_template" in kwargs: - preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"] - - return preprocess_params, {}, {} - - def preprocess(self, audio, candidate_labels=None, hypothesis_template="This is a sound of {}."): - if isinstance(audio, str): - if audio.startswith("http://") or audio.startswith("https://"): - # We need to actually check for a real protocol, otherwise it's impossible to use a local file - # like http_huggingface_co.png - audio = requests.get(audio).content - else: - with open(audio, "rb") as f: - audio = f.read() - - if isinstance(audio, bytes): - audio = ffmpeg_read(audio, self.feature_extractor.sampling_rate) - - if not isinstance(audio, np.ndarray): - raise ValueError("We expect a numpy ndarray as input") - if len(audio.shape) != 1: - raise ValueError("We expect a single channel audio input for ZeroShotAudioClassificationPipeline") - - n = len(candidate_labels) - for i, candidate_label in enumerate(candidate_labels): - audios = self.feature_extractor( - audio, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt" - ) - sequence = hypothesis_template.format(candidate_label) - inputs = self.tokenizer(sequence, return_tensors=self.framework) - inputs["input_features"] = audios.input_features - yield {"is_last": i == n - 1, "candidate_label": candidate_label, **inputs} - - def _forward(self, model_inputs): - is_last = model_inputs.pop("is_last") - candidate_label = model_inputs.pop("candidate_label") - outputs = self.model(**model_inputs) - - # Clap does crossproduct scoring by default, so we're only - # interested in the results where audio and text and in the same - # batch position. 
- diag = torch.diagonal - logits_per_audio = diag(outputs.logits_per_audio) - - model_outputs = { - "is_last": is_last, - "candidate_label": candidate_label, - "logits_per_audio": logits_per_audio, - } - return model_outputs - - def postprocess(self, model_outputs): - candidate_labels = [outputs["candidate_label"] for outputs in model_outputs] - if self.framework == "pt": - logits = torch.cat([output["logits_per_audio"] for output in model_outputs]) - probs = logits.softmax(dim=0) - scores = probs.tolist() - else: - raise ValueError("`tf` framework not supported.") - - result = [ - {"score": score, "label": candidate_label} - for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0]) - ] - return result diff --git a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py deleted file mode 100644 index 80e42124a886..000000000000 --- a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -from datasets import load_dataset - -from transformers.pipelines import pipeline -from transformers.testing_utils import nested_simplify, require_torch, slow - -from .test_pipelines_common import PipelineTestCaseMeta - - -@require_torch -class ZeroShotAudioClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): - # Deactivating auto tests since we don't have a good MODEL_FOR_XX mapping, - # and only Clap would be there for now. 
- # model_mapping = {ClapConfig: ClapModel} - - @require_torch - def test_small_model_pt(self): - pass - - def test_small_model_tf(self): - pass - - @slow - @require_torch - def test_large_model_pt(self): - audio_classifier = pipeline( - task="zero-shot-audio-classification", - model="ybelkada/clap-htsat-unfused", - ) - # This is an audio of a dog - dataset = load_dataset("ashraq/esc50") - audio = dataset["train"]["audio"][-1]["array"] - output = audio_classifier(audio, candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"]) - - self.assertEqual( - nested_simplify(output), - [ - {"score": 0.999, "label": "Sound of a dog"}, - {"score": 0.001, "label": "Sound of vaccum cleaner"}, - ], - ) - - output = audio_classifier([audio] * 5, candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"]) - self.assertEqual( - nested_simplify(output), - [ - [ - {"score": 0.999, "label": "Sound of a dog"}, - {"score": 0.001, "label": "Sound of vaccum cleaner"}, - ], - ] - * 5, - ) - # TODO batching will be supported in next PR, the base pipeline needs to be modified - # output = audio_classifier([audio] * 5, candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"], batch_size=5) - - def test_large_model_tf(self): - pass From a47f063a2c7c4278e0f2c0eba03585178726ae8a Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 11:10:25 +0000 Subject: [PATCH 155/197] update calls to audio utils functions --- src/transformers/audio_utils.py | 8 +++----- src/transformers/models/clap/feature_extraction_clap.py | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/transformers/audio_utils.py b/src/transformers/audio_utils.py index 58f73761babc..eb91a3cbd8d5 100644 --- a/src/transformers/audio_utils.py +++ b/src/transformers/audio_utils.py @@ -61,7 +61,6 @@ def hertz_to_mel(freq: float, mel_scale: str = "htk") -> float: return mels -@staticmethod def mel_to_hertz(mels: np.array, mel_scale: str = "htk") -> np.array: """Convert mel bin numbers to frequencies. @@ -98,8 +97,7 @@ def mel_to_hertz(mels: np.array, mel_scale: str = "htk") -> np.array: return freqs -@staticmethod -def create_triangular_filterbank( +def _create_triangular_filterbank( all_freqs: np.array, f_pts: np.array, ) -> np.array: @@ -168,7 +166,7 @@ def get_mel_filter_banks( frequency_max (`float`): Maximum frequency of interest(Hertz). n_mels (`int`): - Number of mel filterbanks. + Number of mel filterbanks. TODO 80 seems a bit high? sample_rate (`int`): Sample rate of the audio waveform norm (`str`, *optional*): @@ -198,7 +196,7 @@ def get_mel_filter_banks( f_pts = mel_to_hertz(m_pts, mel_scale=mel_scale) # create filterbank - filterbank = create_triangular_filterbank(all_freqs, f_pts) + filterbank = _create_triangular_filterbank(all_freqs, f_pts) if norm is not None and norm == "slaney": # Slaney-style mel is scaled to be approx constant energy per channel diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 33fe164e84a3..27f89caa6e62 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -161,8 +161,8 @@ def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[n implementation when the truncation mode is not `"fusion"`. 
""" window = np.hanning(self.n_fft + 1)[:-1] - frames = _fram_wave(waveform) - stft = _stft(frames, window=window) + frames = _fram_wave(waveform, self.hop_length, self.n_fft) + stft = _stft(frames, window=window, fft_size = self.n_fft) magnitudes = np.abs(stft) ** 2 mel_spec = np.matmul(mel_filters.T, magnitudes) From 9eb1ab44173cf5a927d855c4fe423efe7f671400 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 11:13:48 +0000 Subject: [PATCH 156/197] update pipeline init --- src/transformers/pipelines/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index b91ad6c2a3b7..68774ad50e4a 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -78,7 +78,6 @@ ) from .video_classification import VideoClassificationPipeline from .visual_question_answering import VisualQuestionAnsweringPipeline -from .zero_shot_audio_classification import ZeroShotAudioClassificationPipeline from .zero_shot_classification import ZeroShotClassificationArgumentHandler, ZeroShotClassificationPipeline from .zero_shot_image_classification import ZeroShotImageClassificationPipeline from .zero_shot_object_detection import ZeroShotObjectDetectionPipeline From fd1957bd9f374bf24b4e1c9afdafa2b6f0665317 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 11:18:06 +0000 Subject: [PATCH 157/197] style --- src/transformers/audio_utils.py | 2 +- .../models/clap/feature_extraction_clap.py | 7 ++++--- src/transformers/pipelines/__init__.py | 11 ----------- 3 files changed, 5 insertions(+), 15 deletions(-) diff --git a/src/transformers/audio_utils.py b/src/transformers/audio_utils.py index eb91a3cbd8d5..8b0a95824e87 100644 --- a/src/transformers/audio_utils.py +++ b/src/transformers/audio_utils.py @@ -166,7 +166,7 @@ def get_mel_filter_banks( frequency_max (`float`): Maximum frequency of interest(Hertz). n_mels (`int`): - Number of mel filterbanks. TODO 80 seems a bit high? + Number of mel filterbanks. TODO 80 seems a bit high? 
sample_rate (`int`): Sample rate of the audio waveform norm (`str`, *optional*): diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 27f89caa6e62..2d12ba169b12 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -20,11 +20,12 @@ import numpy as np +from ...audio_utils import _fram_wave, _power_to_db, _stft, get_mel_filter_banks from ...feature_extraction_sequence_utils import SequenceFeatureExtractor from ...feature_extraction_utils import BatchFeature -from ...image_transforms import np_bilinear_resize #TODO this has to be removed +from ...image_transforms import np_bilinear_resize # TODO this has to be removed from ...utils import TensorType, logging -from ...audio_utils import get_mel_filter_banks, _fram_wave, _power_to_db, _stft + logger = logging.get_logger(__name__) @@ -162,7 +163,7 @@ def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[n """ window = np.hanning(self.n_fft + 1)[:-1] frames = _fram_wave(waveform, self.hop_length, self.n_fft) - stft = _stft(frames, window=window, fft_size = self.n_fft) + stft = _stft(frames, window=window, fft_size=self.n_fft) magnitudes = np.abs(stft) ** 2 mel_spec = np.matmul(mel_filters.T, magnitudes) diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index 68774ad50e4a..0bb26c80015e 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -299,17 +299,6 @@ }, "type": "multimodal", }, - "zero-shot-audio-classification": { - "impl": ZeroShotAudioClassificationPipeline, - "tf": (TFAutoModel,) if is_tf_available() else (), - "pt": (AutoModel,) if is_torch_available() else (), - "default": { - "model": { - "pt": ("laion-ai/clap-hsat-tiny", "f4881ba"), - } - }, - "type": "multimodal", - }, "conversational": { "impl": ConversationalPipeline, "tf": (TFAutoModelForSeq2SeqLM, TFAutoModelForCausalLM) if is_tf_available() else (), From 773b225076a943ed18be3831e4f83f4d53b92532 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 11:19:05 +0000 Subject: [PATCH 158/197] style --- src/transformers/models/clap/configuration_clap.py | 6 +++--- src/transformers/models/clap/feature_extraction_clap.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index d1eb6151def8..ef2a0a6d525f 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -31,9 +31,9 @@ class ClapTextConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`ClapTextModel`]. It is - used to instantiate a RoBERTa model according to the specified arguments, defining the model architecture. - Instantiating a configuration with the defaults will yield a similar configuration to that of the RoBERTa + This is the configuration class to store the configuration of a [`ClapTextModel`]. It is used to instantiate a + RoBERTa model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the RoBERTa [roberta-base](https://huggingface.co/roberta-base) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. 
Read the diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 2d12ba169b12..4530229700bf 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -34,8 +34,8 @@ class ClapFeatureExtractor(SequenceFeatureExtractor): r""" Constructs a Clap feature extractor. - This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains most of the main methods. Users should - refer to this superclass for more information regarding those methods. + This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains + most of the main methods. Users should refer to this superclass for more information regarding those methods. This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time Fourier Transform` (STFT) which should match pytorch's `torch.stft` equivalent. From c7a7cf6385449715b0879a236a6f6bbc5eeb33e3 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 11:19:54 +0000 Subject: [PATCH 159/197] styling again --- src/transformers/models/clap/modeling_clap.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 39115ba01164..0be8037efc22 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -171,11 +171,11 @@ class ClapAudioModelOutput(ModelOutput): Args: framewise_output (`torch.FloatTensor` of shape `(batch_size, reshaped_hidden_size, num_classes)`): - Output hidden_states that are interpolated after applying sigmoid. These logits are used to compute the - the classification label in the original implementation. + Output hidden_states that are interpolated after applying sigmoid. These logits are used to compute the the + classification label in the original implementation. clipwise_output (`torch.FloatTensor` of shape `(batch_size, hidden_size, num_classes)`): - Output hidden_states after applying sigmoid. These logits are used to compute the - the classification label in the original implementation. + Output hidden_states after applying sigmoid. These logits are used to compute the the classification label + in the original implementation. fine_grained_embedding (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): Pooled interpolated hidden_states. embedding (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): @@ -210,11 +210,11 @@ class ClapAudioModelOutputWithProjection(ModelOutput): audio_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The audio embeddings obtained by applying the projection layer to the pooler_output. framewise_output (`torch.FloatTensor` of shape `(batch_size, reshaped_hidden_size, num_classes)`): - Output hidden_states that are interpolated after applying sigmoid. These logits are used to compute the - the classification label in the original implementation. + Output hidden_states that are interpolated after applying sigmoid. These logits are used to compute the the + classification label in the original implementation. clipwise_output (`torch.FloatTensor` of shape `(batch_size, hidden_size, num_classes)`): - Output hidden_states after applying sigmoid. 
These logits are used to compute the - the classification label in the original implementation. + Output hidden_states after applying sigmoid. These logits are used to compute the the classification label + in the original implementation. fine_grained_embedding (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): Pooled interpolated hidden_states. embedding (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): From c970399e6427d2bb3a00a0a38007ea6c7d4e0db9 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 13 Feb 2023 11:21:44 +0000 Subject: [PATCH 160/197] use pad --- src/transformers/models/clap/modeling_clap.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 39115ba01164..977e2b65fab0 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -422,20 +422,11 @@ def forward(self, hidden_states, is_longer_idx=None): local_hidden_states.size(3), ) local_hidden_states = local_hidden_states.permute((0, 2, 3, 1, 4)).contiguous().flatten(3) - output_batch_size, output_num_channels, output_height, _ = local_hidden_states.size() local_width = local_hidden_states.size(-1) if local_width < output_width: - padded_hidden_states = torch.zeros( - (output_batch_size, output_num_channels, output_height, output_width - local_width) - ).to(global_hidden_states.device) - - local_hidden_states = torch.cat( - [ - local_hidden_states, - padded_hidden_states, - ], - dim=-1, + local_hidden_states = torch.nn.functional.pad( + local_hidden_states, (0, output_width - local_width), "constant", 0 ) else: local_hidden_states = local_hidden_states[:, :, :, :output_width] From 21f60ceee0fddd60e6ab0055098e1038e8c9d3d0 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 11:22:02 +0000 Subject: [PATCH 161/197] fix repo-consistency --- src/transformers/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 5d6a6359ae76..6d9d2c540e01 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -546,7 +546,6 @@ "TranslationPipeline", "VideoClassificationPipeline", "VisualQuestionAnsweringPipeline", - "ZeroShotAudioClassificationPipeline", "ZeroShotClassificationPipeline", "ZeroShotImageClassificationPipeline", "ZeroShotObjectDetectionPipeline", @@ -4020,7 +4019,6 @@ TranslationPipeline, VideoClassificationPipeline, VisualQuestionAnsweringPipeline, - ZeroShotAudioClassificationPipeline, ZeroShotClassificationPipeline, ZeroShotImageClassificationPipeline, ZeroShotObjectDetectionPipeline, From 9ca17809403042e878e68b3f2959fe5923474b6f Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 12:00:31 +0000 Subject: [PATCH 162/197] update utils and add doc for audio utils --- docs/source/en/internal/audio_utils.mdx | 34 +++++++++++++++++++++++++ src/transformers/__init__.py | 1 + src/transformers/pipelines/__init__.py | 1 - 3 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 docs/source/en/internal/audio_utils.mdx diff --git a/docs/source/en/internal/audio_utils.mdx b/docs/source/en/internal/audio_utils.mdx new file mode 100644 index 000000000000..a5c7ecdf9d44 --- /dev/null +++ b/docs/source/en/internal/audio_utils.mdx @@ -0,0 +1,34 @@ + + +# Utilities for `FeatureExtractors` + +This page lists all the utility functions that can be used by the audio `FeatureExtractor` in order to compute special 
features from a raw audio using comon algorightms such as `Short Time Fourier Transform` or `Mel log spectrogram`. + + +Most of those are only useful if you are studying the code of the image processors in the library. + +## Audio Transformations + +[[autodoc]] audio_utils.hertz_to_mel + +[[autodoc]] audio_utils.mel_to_hertz + +[[autodoc]] audio_utils.get_mel_filter_banks + +[[autodoc]] audio_utils._stft + +[[autodoc]] audio_utils._power_to_db + +[[autodoc]] audio_utils._fram_wave + + diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 6d9d2c540e01..5b8a6b6460a4 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -47,6 +47,7 @@ # Base objects, independent of any specific backend _import_structure = { + "audio_utils": [], "benchmark": [], "commands": [], "configuration_utils": ["PretrainedConfig"], diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index 0bb26c80015e..0446064939c0 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -532,7 +532,6 @@ def pipeline( - `"translation_xx_to_yy"`: will return a [`TranslationPipeline`]. - `"video-classification"`: will return a [`VideoClassificationPipeline`]. - `"visual-question-answering"`: will return a [`VisualQuestionAnsweringPipeline`]. - - `"zero-shot-audio-classification"`: will return a [`ZeroShotAudioClassificationPipeline`]. - `"zero-shot-classification"`: will return a [`ZeroShotClassificationPipeline`]. - `"zero-shot-image-classification"`: will return a [`ZeroShotImageClassificationPipeline`]. - `"zero-shot-object-detection"`: will return a [`ZeroShotObjectDetectionPipeline`]. From 0e0990ace59005a6a510db71102f97a3886b819a Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 12:33:18 +0000 Subject: [PATCH 163/197] clean up resize by using torch. 
update inits accordingly --- docs/source/en/internal/audio_utils.mdx | 6 +- src/transformers/__init__.py | 4 +- src/transformers/audio_utils.py | 6 +- src/transformers/image_transforms.py | 93 ------------------- src/transformers/models/clap/__init__.py | 6 +- .../models/clap/feature_extraction_clap.py | 24 ++--- src/transformers/utils/dummy_pt_objects.py | 5 + 7 files changed, 29 insertions(+), 115 deletions(-) diff --git a/docs/source/en/internal/audio_utils.mdx b/docs/source/en/internal/audio_utils.mdx index a5c7ecdf9d44..9c603b605777 100644 --- a/docs/source/en/internal/audio_utils.mdx +++ b/docs/source/en/internal/audio_utils.mdx @@ -25,10 +25,10 @@ Most of those are only useful if you are studying the code of the image processo [[autodoc]] audio_utils.get_mel_filter_banks -[[autodoc]] audio_utils._stft +[[autodoc]] audio_utils.stft -[[autodoc]] audio_utils._power_to_db +[[autodoc]] audio_utils.power_to_db -[[autodoc]] audio_utils._fram_wave +[[autodoc]] audio_utils.fram_wave diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 5b8a6b6460a4..b7263cc6ac0f 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -211,7 +211,6 @@ "CLAP_PRETRAINED_MODEL_ARCHIVE_LIST", "ClapAudioConfig", "ClapConfig", - "ClapFeatureExtractor", "ClapProcessor", "ClapTextConfig", ], @@ -1233,6 +1232,7 @@ _import_structure["models.clap"].extend( [ "CLAP_PRETRAINED_MODEL_ARCHIVE_LIST", + "ClapFeatureExtractor", "ClapAudioModel", "ClapAudioModelWithProjection", "ClapModel", @@ -3710,7 +3710,6 @@ CLAP_PRETRAINED_MODEL_ARCHIVE_LIST, ClapAudioConfig, ClapConfig, - ClapFeatureExtractor, ClapProcessor, ClapTextConfig, ) @@ -4603,6 +4602,7 @@ ) from .models.clap import ( CLAP_PRETRAINED_MODEL_ARCHIVE_LIST, + ClapFeatureExtractor, ClapAudioModel, ClapAudioModelWithProjection, ClapModel, diff --git a/src/transformers/audio_utils.py b/src/transformers/audio_utils.py index 8b0a95824e87..c0e7b0b4a566 100644 --- a/src/transformers/audio_utils.py +++ b/src/transformers/audio_utils.py @@ -213,7 +213,7 @@ def get_mel_filter_banks( return filterbank -def _stft(frames: np.array, window: np.array, fft_size: int = None): +def stft(frames: np.array, window: np.array, fft_size: int = None): """ Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same results as `torch.stft`. #TODO @Arthur batching this could allow more usage, good first issue. @@ -253,7 +253,7 @@ def _stft(frames: np.array, window: np.array, fft_size: int = None): return data.T -def _power_to_db(mel_spectrogram, top_db=None, a_min=1e-10, ref=1.0): +def power_to_db(mel_spectrogram, top_db=None, a_min=1e-10, ref=1.0): """ Convert a mel spectrogram from power to db scale, this function is the numpy implementation of librosa.power_to_lb. @@ -283,7 +283,7 @@ def _power_to_db(mel_spectrogram, top_db=None, a_min=1e-10, ref=1.0): return log_spec -def _fram_wave(waveform: np.array, hop_length: int = 160, n_fft: int = 400, center: bool = True): +def fram_wave(waveform: np.array, hop_length: int = 160, n_fft: int = 400, center: bool = True): """ In order to compute the short time fourier transform, the waveform needs to be split in overlapping windowed segments called `frames`. 
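To make the framing step above concrete, here is a rough standalone NumPy sketch of the idea behind `fram_wave` (non-centered case only): the waveform is cut into `frame_size`-sample windows that start every `hop_length` samples, with the last window zero-padded. The function name and the 16 kHz / 400 / 160 values below are illustrative, not the library API.

```python
import numpy as np


def frame_waveform(waveform: np.ndarray, hop_length: int = 160, frame_size: int = 400) -> np.ndarray:
    """Split a 1-D waveform into overlapping frames (no centering), zero-padding the last one."""
    frames = []
    for start in range(0, len(waveform), hop_length):
        frame = waveform[start : start + frame_size]
        if len(frame) < frame_size:
            frame = np.pad(frame, (0, frame_size - len(frame)))  # pad the tail with silence
        frames.append(frame)
    return np.stack(frames)


waveform = np.random.rand(16_000)  # one second of fake audio at 16 kHz
frames = frame_waveform(waveform)
print(frames.shape)  # (100, 400): a 25 ms window every 10 ms
```

The real helper additionally supports a `center` mode that reflect-pads the signal so that each frame is centered on its hop position.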
diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 7269c31078af..1323f3a53192 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -708,96 +708,3 @@ def convert_to_rgb(image: ImageInput) -> ImageInput: image = image.convert("RGB") return image - - -def bilinear_interpolation(image: np.ndarray, y: float, x: float): - # docstyle-ignore - """ - A bilinear interpolation of the estimated values of the `image` at non integer indexes `y` and `x`. - - - Original Image at Original Image at - x_1, y_1 x_1, y_2 - +---+ +---+ - | +-|-------------------------------|-+ | - +---+ +---+ - | | - | Pixel at (x,y) where | - | x and y non integers | - | +---+ | - | | | | - | +---+ | - +---+ +---+ - | +-|-------------------------------|-+ | - +---+ +---+ - - Original Image at Original Image at - x_1, y_2 x_2, y_2 - - The estimated value of the pixel is computed using the following equation : - - $$ \text{Image}_{x,y} = \frac{1}{(x_1 - x_2)(y_2-y_1)} \begin{bmatrix} x_2 - x & x - x_1\end{bmatrix} - \begin{bmatrix} \text{Image}_{x_1,y_1} & \text{Image}_{x_2,y_1}\\ \text{Image}_{x_1,y_2} & \text{Image}_{x_2,y_2}\\ - \end{bmatrix} \begin{bmatrix} y_2 - y \\ y-y_2\end{bmatrix} $$ - - For more details about bilinear interplation, see [on the wikipedia - page](https://en.wikipedia.org/wiki/Bilinear_interpolation) - """ - height = image.shape[0] - width = image.shape[1] - - x1 = max(min(math.floor(x), width - 1), 0) - y1 = max(min(math.floor(y), height - 1), 0) - x2 = max(min(math.ceil(x), width - 1), 0) - y2 = max(min(math.ceil(y), height - 1), 0) - - a = image[y1, x1] - b = image[y2, x1] - c = image[y1, x2] - d = image[y2, x2] - - dx = x - x1 - dy = y - y1 - - new_pixel = a * (1 - dx) * (1 - dy) - new_pixel += b * dy * (1 - dx) - new_pixel += c * dx * (1 - dy) - new_pixel += d * dx * dy - return new_pixel - - -def np_bilinear_resize(image: np.ndarray, new_height: int, new_width: int): - """ - Taken from `[here](https://stackoverflow.com/questions/70024313/resize-using-bilinear-interpolation-in-python)` - this is the equivalent of the `torchvision.transforms.Resize(size=[chunk_frames, self.feature_size])`. This - function is not optimal in terms of performances, but has the same results as `torchvision` counterpart when called - with the default `bilinear` interpolation. - """ - # new_image = [[0 for _ in range(new_width)] for _ in range(new_height)] - new_image = np.zeros((new_height, new_width), image.dtype) - - orig_height = image.shape[0] - orig_width = image.shape[1] - - # Compute center column and center row - x_orig_center = (orig_width - 1) / 2 - y_orig_center = (orig_height - 1) / 2 - - # Compute center of resized image - x_scaled_center = (new_width - 1) / 2 - y_scaled_center = (new_height - 1) / 2 - - # Compute the scale in both axes - scale_x = orig_width / new_width - scale_y = orig_height / new_height - - for y in range(new_height): - for x in range(new_width): - # compute the coordinates of the `new pixel` at `(x, y)` in the original image. - x_ = (x - x_scaled_center) * scale_x + x_orig_center - y_ = (y - y_scaled_center) * scale_y + y_orig_center - - # compute the coordinates of the 4 neighboring points and then compute the bilinear estimate. 
- new_image[y, x] = bilinear_interpolation(image, y_, x_) - - return new_image diff --git a/src/transformers/models/clap/__init__.py b/src/transformers/models/clap/__init__.py index e7bd202888da..438eed862d63 100644 --- a/src/transformers/models/clap/__init__.py +++ b/src/transformers/models/clap/__init__.py @@ -23,7 +23,6 @@ "ClapConfig", "ClapTextConfig", ], - "feature_extraction_clap": ["ClapFeatureExtractor"], "processing_clap": ["ClapProcessor"], "tokenization_clap": ["ClapTokenizer"], } @@ -43,6 +42,7 @@ "ClapAudioModel", "ClapAudioModelWithProjection", ] + _import_structure["feature_extraction_clap"] = ["ClapFeatureExtractor"] if TYPE_CHECKING: from .configuration_clap import ( @@ -51,9 +51,7 @@ ClapConfig, ClapTextConfig, ) - from .feature_extraction_clap import ClapFeatureExtractor from .processing_clap import ClapProcessor - from .tokenization_clap import ClapTokenizer try: if not is_torch_available(): @@ -70,6 +68,8 @@ ClapTextModel, ClapTextModelWithProjection, ) + from .feature_extraction_clap import ClapFeatureExtractor + else: import sys diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 4530229700bf..103f74f3fae5 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -20,10 +20,10 @@ import numpy as np -from ...audio_utils import _fram_wave, _power_to_db, _stft, get_mel_filter_banks +import torch +from ...audio_utils import fram_wave, power_to_db, stft, get_mel_filter_banks from ...feature_extraction_sequence_utils import SequenceFeatureExtractor from ...feature_extraction_utils import BatchFeature -from ...image_transforms import np_bilinear_resize # TODO this has to be removed from ...utils import TensorType, logging @@ -65,7 +65,7 @@ class ClapFeatureExtractor(SequenceFeatureExtractor): The highest frequency of interest. The STFT will not be computed for values above this. top_db (`float`, *optional*): The highest decibel value used to convert the mel spectrogram to the log scale. For more details see the - `SequenceFeatureExtractor._power_to_db` function + `audio_utils.power_to_db` function truncation (`str`, *optional*, default to `"fusions"`): Truncation pattern for long audio inputs. Two patterns are available: - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and a @@ -162,14 +162,14 @@ def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[n implementation when the truncation mode is not `"fusion"`. 
""" window = np.hanning(self.n_fft + 1)[:-1] - frames = _fram_wave(waveform, self.hop_length, self.n_fft) - stft = _stft(frames, window=window, fft_size=self.n_fft) + frames = fram_wave(waveform, self.hop_length, self.n_fft) + spectrogram = stft(frames, window=window, fft_size=self.n_fft) - magnitudes = np.abs(stft) ** 2 - mel_spec = np.matmul(mel_filters.T, magnitudes) - log_mel_spec = _power_to_db(mel_spec).T - log_mel_spec = np.asarray(log_mel_spec, np.float32) - return log_mel_spec + magnitudes = np.abs(spectrogram) ** 2 + mel_spectrogram = np.matmul(mel_filters.T, magnitudes) + log_mel_spectrogram = power_to_db(mel_spectrogram).T + log_mel_spectrogram = np.asarray(log_mel_spectrogram, np.float32) + return log_mel_spectrogram def _random_mel_fusion(self, mel, total_frames, chunk_frames): ranges = np.array_split(list(range(0, total_frames - chunk_frames + 1)), 3) @@ -188,7 +188,9 @@ def _random_mel_fusion(self, mel, total_frames, chunk_frames): mel_chunk_middle = mel[idx_middle : idx_middle + chunk_frames, :] mel_chunk_back = mel[idx_back : idx_back + chunk_frames, :] - mel_shrink = np_bilinear_resize(mel, chunk_frames, self.feature_size) + mel = torch.tensor(mel[None, None, :]) + mel_shrink = torch.nn.functional.interpolate(mel, size=[chunk_frames, 64], mode = "bilinear", align_corners = False, antialias = False) + mel_shrink = mel_shrink[0][0].numpy() mel_fusion = np.stack([mel_chunk_front, mel_chunk_middle, mel_chunk_back, mel_shrink], axis=0) return mel_fusion diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 6236b57a5170..cf781dc886d1 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1466,6 +1466,11 @@ def __init__(self, *args, **kwargs): CLAP_PRETRAINED_MODEL_ARCHIVE_LIST = None +class ClapFeatureExtractor(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) class ClapAudioModel(metaclass=DummyObject): _backends = ["torch"] From 1c82889180c111b2bceda251d55372827fe115ed Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 12:34:13 +0000 Subject: [PATCH 164/197] style --- src/transformers/__init__.py | 4 ++-- src/transformers/image_transforms.py | 1 - src/transformers/models/clap/__init__.py | 4 ++-- src/transformers/models/clap/feature_extraction_clap.py | 8 +++++--- src/transformers/utils/dummy_pt_objects.py | 2 ++ 5 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index b7263cc6ac0f..4ee1aa83406f 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1232,9 +1232,9 @@ _import_structure["models.clap"].extend( [ "CLAP_PRETRAINED_MODEL_ARCHIVE_LIST", - "ClapFeatureExtractor", "ClapAudioModel", "ClapAudioModelWithProjection", + "ClapFeatureExtractor", "ClapModel", "ClapPreTrainedModel", "ClapTextModel", @@ -4602,9 +4602,9 @@ ) from .models.clap import ( CLAP_PRETRAINED_MODEL_ARCHIVE_LIST, - ClapFeatureExtractor, ClapAudioModel, ClapAudioModelWithProjection, + ClapFeatureExtractor, ClapModel, ClapPreTrainedModel, ClapTextModel, diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 1323f3a53192..d09f29b79044 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import math import warnings from typing import Iterable, List, Optional, Tuple, Union diff --git a/src/transformers/models/clap/__init__.py b/src/transformers/models/clap/__init__.py index 438eed862d63..f7b79c6675cd 100644 --- a/src/transformers/models/clap/__init__.py +++ b/src/transformers/models/clap/__init__.py @@ -59,6 +59,7 @@ except OptionalDependencyNotAvailable: pass else: + from .feature_extraction_clap import ClapFeatureExtractor from .modeling_clap import ( CLAP_PRETRAINED_MODEL_ARCHIVE_LIST, ClapAudioModel, @@ -68,8 +69,7 @@ ClapTextModel, ClapTextModelWithProjection, ) - from .feature_extraction_clap import ClapFeatureExtractor - + else: import sys diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 103f74f3fae5..62da30e85d30 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -19,9 +19,9 @@ from typing import Any, Dict, List, Optional, Union import numpy as np - import torch -from ...audio_utils import fram_wave, power_to_db, stft, get_mel_filter_banks + +from ...audio_utils import fram_wave, get_mel_filter_banks, power_to_db, stft from ...feature_extraction_sequence_utils import SequenceFeatureExtractor from ...feature_extraction_utils import BatchFeature from ...utils import TensorType, logging @@ -189,7 +189,9 @@ def _random_mel_fusion(self, mel, total_frames, chunk_frames): mel_chunk_back = mel[idx_back : idx_back + chunk_frames, :] mel = torch.tensor(mel[None, None, :]) - mel_shrink = torch.nn.functional.interpolate(mel, size=[chunk_frames, 64], mode = "bilinear", align_corners = False, antialias = False) + mel_shrink = torch.nn.functional.interpolate( + mel, size=[chunk_frames, 64], mode="bilinear", align_corners=False, antialias=False + ) mel_shrink = mel_shrink[0][0].numpy() mel_fusion = np.stack([mel_chunk_front, mel_chunk_middle, mel_chunk_back, mel_shrink], axis=0) return mel_fusion diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index cf781dc886d1..938f96d9dca7 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1466,12 +1466,14 @@ def __init__(self, *args, **kwargs): CLAP_PRETRAINED_MODEL_ARCHIVE_LIST = None + class ClapFeatureExtractor(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) + class ClapAudioModel(metaclass=DummyObject): _backends = ["torch"] From 626a664a52e9be47b2bd159befb59f781ad75d3e Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 12:36:55 +0000 Subject: [PATCH 165/197] CLap's tokenizer is RobertA --- src/transformers/models/clap/processing_clap.py | 10 +++++----- src/transformers/utils/dummy_pt_objects.py | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/clap/processing_clap.py b/src/transformers/models/clap/processing_clap.py index 36d9ff24ea77..603fba91d3b2 100644 --- a/src/transformers/models/clap/processing_clap.py +++ b/src/transformers/models/clap/processing_clap.py @@ -24,13 +24,13 @@ class ClapProcessor(ProcessorMixin): r""" Constructs a Clap processor which wraps a Clap feature extractor and a Clap tokenizer into a single processor. - [`ClapProcessor`] offers all the functionalities of [`ClapFeatureExtractor`] and [`ClapTokenizerFast`]. 
See the + [`ClapProcessor`] offers all the functionalities of [`ClapFeatureExtractor`] and [`RobertaTokenizerFast`]. See the [`~ClapProcessor.__call__`] and [`~ClapProcessor.decode`] for more information. Args: feature_extractor ([`ClapFeatureExtractor`]): The audio processor is a required input. - tokenizer ([`ClapTokenizerFast`]): + tokenizer ([`RobertaTokenizerFast`]): The tokenizer is a required input. """ feature_extractor_class = "ClapFeatureExtractor" @@ -42,7 +42,7 @@ def __init__(self, feature_extractor, tokenizer): def __call__(self, text=None, audios=None, return_tensors=None, **kwargs): """ Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `text` - and `kwargs` arguments to ClapTokenizerFast's [`~ClapTokenizerFast.__call__`] if `text` is not `None` to encode + and `kwargs` arguments to RobertaTokenizerFast's [`~RobertaTokenizerFast.__call__`] if `text` is not `None` to encode the text. To prepare the audio(s), this method forwards the `audios` and `kwrags` arguments to ClapFeatureExtractor's [`~ClapFeatureExtractor.__call__`] if `audios` is not `None`. Please refer to the doctsring of the above two methods for more information. @@ -97,14 +97,14 @@ def __call__(self, text=None, audios=None, return_tensors=None, **kwargs): def batch_decode(self, *args, **kwargs): """ - This method forwards all its arguments to ClapTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + This method forwards all its arguments to RobertaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of this method for more information. """ return self.tokenizer.batch_decode(*args, **kwargs) def decode(self, *args, **kwargs): """ - This method forwards all its arguments to ClapTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + This method forwards all its arguments to RobertaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to the docstring of this method for more information. 
""" return self.tokenizer.decode(*args, **kwargs) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 938f96d9dca7..2ac4e2bf65db 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1467,21 +1467,21 @@ def __init__(self, *args, **kwargs): CLAP_PRETRAINED_MODEL_ARCHIVE_LIST = None -class ClapFeatureExtractor(metaclass=DummyObject): +class ClapAudioModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ClapAudioModel(metaclass=DummyObject): +class ClapAudioModelWithProjection(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ClapAudioModelWithProjection(metaclass=DummyObject): +class ClapFeatureExtractor(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): From a1471b6832ff292f83dbd7ce0807f81394c33602 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 12:41:01 +0000 Subject: [PATCH 166/197] add audio utils to internal toctreee --- src/transformers/models/clap/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/clap/__init__.py b/src/transformers/models/clap/__init__.py index f7b79c6675cd..57e39b6e1fa6 100644 --- a/src/transformers/models/clap/__init__.py +++ b/src/transformers/models/clap/__init__.py @@ -24,7 +24,6 @@ "ClapTextConfig", ], "processing_clap": ["ClapProcessor"], - "tokenization_clap": ["ClapTokenizer"], } try: From 7ed4f0f5d71aab25b81d24ad38db7dfadc3d9d06 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 12:42:41 +0000 Subject: [PATCH 167/197] update totctree --- docs/source/en/_toctree.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 23a506853e23..f30cc0a40503 100755 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -610,6 +610,8 @@ title: Utilities for Generation - local: internal/image_processing_utils title: Utilities for Image Processors + - local: internal/audio_utils + title: Utilities for audio processing - local: internal/file_utils title: General Utilities title: Internal Helpers From c07ab1ea0494f7c18cd400caee1bbfd5dafcf938 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 12:59:24 +0000 Subject: [PATCH 168/197] style --- src/transformers/models/clap/processing_clap.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/clap/processing_clap.py b/src/transformers/models/clap/processing_clap.py index 603fba91d3b2..4bec589d731e 100644 --- a/src/transformers/models/clap/processing_clap.py +++ b/src/transformers/models/clap/processing_clap.py @@ -42,8 +42,8 @@ def __init__(self, feature_extractor, tokenizer): def __call__(self, text=None, audios=None, return_tensors=None, **kwargs): """ Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `text` - and `kwargs` arguments to RobertaTokenizerFast's [`~RobertaTokenizerFast.__call__`] if `text` is not `None` to encode - the text. To prepare the audio(s), this method forwards the `audios` and `kwrags` arguments to + and `kwargs` arguments to RobertaTokenizerFast's [`~RobertaTokenizerFast.__call__`] if `text` is not `None` to + encode the text. 
To prepare the audio(s), this method forwards the `audios` and `kwrags` arguments to ClapFeatureExtractor's [`~ClapFeatureExtractor.__call__`] if `audios` is not `None`. Please refer to the doctsring of the above two methods for more information. @@ -104,8 +104,8 @@ def batch_decode(self, *args, **kwargs): def decode(self, *args, **kwargs): """ - This method forwards all its arguments to RobertaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to - the docstring of this method for more information. + This method forwards all its arguments to RobertaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer + to the docstring of this method for more information. """ return self.tokenizer.decode(*args, **kwargs) From 291b8f8e71a2d1a093bab9cad1e508efebc4d4e7 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 14:32:36 +0000 Subject: [PATCH 169/197] update documentation and normalize naming accross audio utils and feature extraction clap --- src/transformers/audio_utils.py | 165 +++++++++--------- .../models/clap/feature_extraction_clap.py | 32 ++-- 2 files changed, 103 insertions(+), 94 deletions(-) diff --git a/src/transformers/audio_utils.py b/src/transformers/audio_utils.py index c0e7b0b4a566..4beceb679a3a 100644 --- a/src/transformers/audio_utils.py +++ b/src/transformers/audio_utils.py @@ -72,7 +72,7 @@ def mel_to_hertz(mels: np.array, mel_scale: str = "htk") -> np.array: Returns: freqs (`np.array`): - Mels converted in Hertz + Mels converted to Hertz """ if mel_scale not in ["slaney", "htk"]: @@ -105,33 +105,33 @@ def _create_triangular_filterbank( Args: - all_freqs (`np.array`): - STFT freq points of size (`n_freqs`). - f_pts (`np.array`): - Filter mid points of size (`n_filter`). + all_freqs (`np.array` of shape (`nb_frequency_bins`, )): + Discrete frequencies used when the STFT was computed. + f_pts (`np.array`, of shape (`nb_mel_filters`, )): + Coordinates of the middle points of the triangular filters to create. Returns: fb (np.array): - The filter bank of size (`n_freqs`, `n_filter`). + The filter bank of size (`nb_frequency_bins`, `nb_mel_filters`). """ # Adapted from Librosa # calculate the difference between each filter mid point and each stft freq point in hertz f_diff = f_pts[1:] - f_pts[:-1] # (n_filter + 1) - slopes = np.expand_dims(f_pts, 0) - np.expand_dims(all_freqs, 1) # (n_freqs, n_filter + 2) + slopes = np.expand_dims(f_pts, 0) - np.expand_dims(all_freqs, 1) # (nb_frequency_bins, n_filter + 2) # create overlapping triangles zero = np.zeros(1) - down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1] # (n_freqs, n_filter) - up_slopes = slopes[:, 2:] / f_diff[1:] # (n_freqs, n_filter) + down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1] # (nb_frequency_bins, n_filter) + up_slopes = slopes[:, 2:] / f_diff[1:] # (nb_frequency_bins, n_filter) fb = np.maximum(zero, np.minimum(down_slopes, up_slopes)) return fb def get_mel_filter_banks( - n_freqs: int, + nb_frequency_bins: int, + nb_mel_filters: int, frequency_min: float, frequency_max: float, - n_mels: int, sample_rate: int, norm: Optional[str] = None, mel_scale: str = "htk", @@ -144,7 +144,7 @@ def get_mel_filter_banks( This code is heavily inspired from the *torchaudio* implementation, see [here](https://pytorch.org/audio/stable/transforms.html) for more details. - + Note: Different banks of Mel filters were introduced in the litterature. 
The following variation are supported: - MFCC FB-20: introduced in 1980 by Davis and Mermelstein, it assumes a sampling frequency of 10 kHertz @@ -159,103 +159,63 @@ def get_mel_filter_banks( the `"slaney"` implementation. Args: - n_freqs (`int`): - Number of frequencies to highlight/apply. + nb_frequency_bins (`int`): + Number of frequencies used to compute the spectrogram (should be the same as in `stft`). + nb_mel_filters (`int`): + Number of Mel filers to generate. frequency_min (`float`): Minimum frequency of interest(Hertz). frequency_max (`float`): Maximum frequency of interest(Hertz). - n_mels (`int`): - Number of mel filterbanks. TODO 80 seems a bit high? sample_rate (`int`): - Sample rate of the audio waveform + Sample rate of the audio waveform. norm (`str`, *optional*): - If "slaney", divide the triangular mel weights by the width of the mel band (area normalization). + If "slaney", divide the triangular Mel weights by the width of the mel band (area normalization). mel_scale (`str`, *optional*, `"htk"`): Scale to use: `htk` or `slaney`. (Default: `htk`) Returns: - `numpy.ndarray`: Triangular filter banks (fb matrix) of size (`n_freqs`, `n_mels`) meaning number of - frequencies to highlight/apply to x the number of filterbanks. Each column is a filterbank so that assuming - there is a matrix A of size (..., `n_freqs`), the applied result would be `A * melscale_fbanks(A.size(-1), - ...)`. + `np.ndarray`: Triangular filter banks (fb matrix) of shape (`nb_frequency_bins`, `nb_mel_filters`). This matrix is a + projection matrix to go from a spectrogram to a Mel Spectrogram. """ if norm is not None and norm != "slaney": raise ValueError('norm must be one of None or "slaney"') - # freq bins - all_freqs = np.linspace(0, sample_rate // 2, n_freqs) + # freqency bins + all_freqs = np.linspace(0, sample_rate // 2, nb_frequency_bins) - # calculate mel freq bins + # Compute mim and max frequencies in mel scale m_min = hertz_to_mel(frequency_min, mel_scale=mel_scale) m_max = hertz_to_mel(frequency_max, mel_scale=mel_scale) - m_pts = np.linspace(m_min, m_max, n_mels + 2) + # create the centers of the triangular mel filters. + m_pts = np.linspace(m_min, m_max, nb_mel_filters + 2) f_pts = mel_to_hertz(m_pts, mel_scale=mel_scale) - # create filterbank + # create the filterbank filterbank = _create_triangular_filterbank(all_freqs, f_pts) if norm is not None and norm == "slaney": # Slaney-style mel is scaled to be approx constant energy per channel - enorm = 2.0 / (f_pts[2 : n_mels + 2] - f_pts[:n_mels]) + enorm = 2.0 / (f_pts[2 : nb_mel_filters + 2] - f_pts[:nb_mel_filters]) filterbank *= np.expand_dims(enorm, 0) if (filterbank.max(axis=0) == 0.0).any(): warnings.warn( "At least one mel filterbank has all zero values. " - f"The value for `n_mels` ({n_mels}) may be set too high. " - f"Or, the value for `n_freqs` ({n_freqs}) may be set too low." + f"The value for `nb_mel_filters` ({nb_mel_filters}) may be set too high. " + f"Or, the value for `nb_frequency_bins` ({nb_frequency_bins}) may be set too low." ) return filterbank -def stft(frames: np.array, window: np.array, fft_size: int = None): - """ - Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same results - as `torch.stft`. #TODO @Arthur batching this could allow more usage, good first issue. - - Args: - frames (`np.array` of dimension `(num_frames, self.n_fft)`): - A framed audio signal obtained using `self._fram_wav`. 
- window (`np.array` of dimension `(self.n_freqs, self.n_mels)`: - A array reprensenting the function that will be used to reduces the amplitude of the discontinuities at the - boundaries of each frame when computing the FFT. Each frame will be multiplied by the window. For more - information on this phenomena, called *Spectral leakage*, refer to [this - tutorial]https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf - fft_size (`int`, *optional*): - Defines the frequency resolution of the Fourier Transform. The number of frequency bins used for dividing - the window into equal strips A bin is a spectrum sample, and defines the frequency resolution of the - window. An increase of the FFT size slows the calculus time proportionnally. - """ - frame_size = frames.shape[1] - - if fft_size is None: - fft_size = frame_size - - if fft_size < frame_size: - raise ValueError("FFT size must greater or equal the frame size") - # number of FFT bins to store - num_fft_bins = (fft_size >> 1) + 1 - - data = np.empty((len(frames), num_fft_bins), dtype=np.complex64) - fft_signal = np.zeros(fft_size) - - for f, frame in enumerate(frames): - if window is not None: - np.multiply(frame, window, out=fft_signal[:frame_size]) - else: - fft_signal[:frame_size] = frame - data[f] = fft(fft_signal, axis=0)[:num_fft_bins] - return data.T - - def power_to_db(mel_spectrogram, top_db=None, a_min=1e-10, ref=1.0): """ Convert a mel spectrogram from power to db scale, this function is the numpy implementation of librosa.power_to_lb. + It computes 10 * log10(mel_spectrogram / ref), using basic log properties for stability. Note: The motivation behind applying the log function on the mel spectrogram is that humans do not hear loudness on a @@ -269,9 +229,9 @@ def power_to_db(mel_spectrogram, top_db=None, a_min=1e-10, ref=1.0): top_db (`int`, *optional*): The maximum decibel value. a_min (`int`, *optional*, default to 1e-10): - TODO + Minimum value to use when cliping the mel spectrogram. ref (`float`, *optional*, default to 1.0): - TODO + Maximum reference value used to scale the mel_spectrogram. """ log_spec = 10 * np.log10(np.clip(mel_spectrogram, a_min=a_min, a_max=None)) @@ -283,7 +243,7 @@ def power_to_db(mel_spectrogram, top_db=None, a_min=1e-10, ref=1.0): return log_spec -def fram_wave(waveform: np.array, hop_length: int = 160, n_fft: int = 400, center: bool = True): +def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int = 400, center: bool = True): """ In order to compute the short time fourier transform, the waveform needs to be split in overlapping windowed segments called `frames`. @@ -291,24 +251,28 @@ def fram_wave(waveform: np.array, hop_length: int = 160, n_fft: int = 400, cente The window length (window_length) defines how much of the signal is contained in each frame, while the hop length defines the step between the beginning of each new frame. - #TODO @Arthur **This method does not support batching yet as we are mainly focus on inference. If you want this to + TODO @Arthur **This method does not support batching yet as we are mainly focus on inference. If you want this to be added feel free to open an issue and ping @arthurzucker on Github** Args: waveform (`np.array`) of shape (sample_length,): The raw waveform which will be split into smaller chunks. + hop_length (`int`, *optional*, defaults to 160): + Step between each window of the waveform. + fft_window_size (`int`, *optional*, defaults to 400): + Defines the size of the window. 
center (`bool`, defaults to `True`): Whether or not to center each frame around the middle of the frame. Centering is done by reflecting the waveform on the left and on the right. Return: - framed_waveform (`np.array` of shape (`waveform.shape // hop_length , n_fft)`): + framed_waveform (`np.array` of shape `(waveform.shape // hop_length , fft_window_size)`): The framed waveforms that can be fed to `np.fft`. """ frames = [] for i in range(0, waveform.shape[0] + 1, hop_length): - half_window = (n_fft - 1) // 2 + 1 if center: + half_window = (fft_window_size - 1) // 2 + 1 start = i - half_window if i > half_window else 0 end = i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0] frame = waveform[start:end] @@ -321,11 +285,56 @@ def fram_wave(waveform: np.array, hop_length: int = 160, n_fft: int = 400, cente frame = np.pad(frame, pad_width=padd_width, mode="reflect") else: - frame = waveform[i : i + n_fft] + frame = waveform[i : i + fft_window_size] frame_width = frame.shape[0] if frame_width < waveform.shape[0]: - frame = np.lib.pad(frame, pad_width=(0, n_fft - frame_width), mode="constant", constant_values=0) + frame = np.lib.pad(frame, pad_width=(0, fft_window_size - frame_width), mode="constant", constant_values=0) frames.append(frame) frames = np.stack(frames, 0) return frames + + +def stft(frames: np.array, windowing_function: np.array, fft_window_size: int = None): + """ + Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same results + as `torch.stft`. TODO @Arthur batching this could allow more usage, good first issue. + + Args: + frames (`np.array` of dimension `(num_frames, fft_window_size)`): + A framed audio signal obtained using `audio_utils.fram_wav`. + windowing_function (`np.array` of dimension `(nb_frequency_bins, nb_mel_filters)`: + A array reprensenting the function that will be used to reduces the amplitude of the discontinuities at the + boundaries of each frame when computing the STFT. Each frame will be multiplied by the windowing_function. For more + information on the discontinuities, called *Spectral leakage*, refer to [this + tutorial]https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf + fft_window_size (`int`, *optional*): + Size of the window om which the Fourier transform is applied. This controls the frequency resolution of the spectrogram. + 400 means that the fourrier transform is computed on windows of 400 samples. The number of frequency bins (`nb_frequency_bins`) + used to divide the window into equal strips is equal to `(1+fft_window_size)//2`. An increase of the fft_window_size slows the + calculus time proportionnally. 
+ + Returns: + spectrogram (`np.ndarray`): + A spectrogram of shape `(num_frames, nb_frequency_bins)` obtained using the STFT algorithm + """ + frame_size = frames.shape[1] + + if fft_window_size is None: + fft_window_size = frame_size + + if fft_window_size < frame_size: + raise ValueError("FFT size must greater or equal the frame size") + # number of FFT bins to store + nb_frequency_bins = (fft_window_size >> 1) + 1 + + spectrogram = np.empty((len(frames), nb_frequency_bins), dtype=np.complex64) + fft_signal = np.zeros(fft_window_size) + + for f, frame in enumerate(frames): + if windowing_function is not None: + np.multiply(frame, windowing_function, out=fft_signal[:frame_size]) + else: + fft_signal[:frame_size] = frame + spectrogram[f] = fft(fft_signal, axis=0)[:nb_frequency_bins] + return spectrogram.T \ No newline at end of file diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 62da30e85d30..d18e7ea475c6 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -41,9 +41,8 @@ class ClapFeatureExtractor(SequenceFeatureExtractor): Fourier Transform` (STFT) which should match pytorch's `torch.stft` equivalent. Args: - feature_size (`int`, defaults to 80): - The feature dimension of the extracted Mel spectrograms. This corresponds to the number of frequency bins - (intervals) that are computed, for each Fourier step. + feature_size (`int`, defaults to 64): + The feature dimension of the extracted Mel spectrograms. This corresponds to the number of mel filters (`n_mels`). sampling_rate (`int`, defaults to 16000): The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). This only serves to warn users if the audio fed to the feature extractor does not have the same sampling rate. @@ -52,9 +51,9 @@ class ClapFeatureExtractor(SequenceFeatureExtractor): in smaller `frames` with a step of `hop_length` between each frame. max_length_s (`int`, defaults to 10): The maximum input lenght of the model in seconds. This is used to pad the audio. - n_fft (`int`, defaults to 400): - Size of the Fourier transform. This should be the length of a single frame in samples. 400 means that the - fourrier transform is computed on 400 samples. + fft_window_size (`int`, defaults to 400): + Size of the window (in samples) on which the Fourier transform is applied. This controls the frequency resolution of the spectrogram. + 400 means that the fourrier transform is computed on windows of 400 samples. padding_value (`float`, *optional*, defaults to 0.0): Padding value used to pad the audio. Should correspond to silences. 
return_attention_mask (`bool`, *optional*, defaults to `False`): @@ -84,11 +83,11 @@ class ClapFeatureExtractor(SequenceFeatureExtractor): def __init__( self, - feature_size=80, + feature_size=64, sampling_rate=48_000, hop_length=480, max_length_s=10, - n_fft=400, + fft_window_size=1024, padding_value=0.0, return_attention_mask=False, # pad inputs to max length with silence token (zero) and no attention mask frequency_min: float = 0, @@ -108,7 +107,8 @@ def __init__( self.top_db = top_db self.truncation = truncation self.padding = padding - self.n_fft = n_fft + self.fft_window_size = fft_window_size + self.nb_frequency_bins = (fft_window_size >> 1) + 1 self.hop_length = hop_length self.max_length_s = max_length_s self.nb_max_samples = max_length_s * sampling_rate @@ -116,8 +116,8 @@ def __init__( self.frequency_min = frequency_min self.frequency_max = frequency_max self.mel_filters = get_mel_filter_banks( - n_freqs=int(1 + n_fft // 2), - n_mels=feature_size, + nb_frequency_bins=self.nb_frequency_bins, + nb_mel_filters=feature_size, frequency_min=frequency_min, frequency_max=frequency_max, sample_rate=sampling_rate, @@ -125,8 +125,8 @@ def __init__( mel_scale="htk", ) self.mel_filters_slaney = get_mel_filter_banks( - n_freqs=int(1 + n_fft // 2), - n_mels=feature_size, + nb_frequency_bins=self.nb_frequency_bins, + nb_mel_filters=feature_size, frequency_min=frequency_min, frequency_max=frequency_max, sample_rate=sampling_rate, @@ -161,9 +161,9 @@ def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[n `librosa.filters.mel` when computing the mel spectrogram. These filters were only used in the original implementation when the truncation mode is not `"fusion"`. """ - window = np.hanning(self.n_fft + 1)[:-1] - frames = fram_wave(waveform, self.hop_length, self.n_fft) - spectrogram = stft(frames, window=window, fft_size=self.n_fft) + window = np.hanning(self.fft_window_size + 1)[:-1] + frames = fram_wave(waveform, self.hop_length, self.fft_window_size) + spectrogram = stft(frames, window, fft_window_size=self.fft_window_size) magnitudes = np.abs(spectrogram) ** 2 mel_spectrogram = np.matmul(mel_filters.T, magnitudes) From 80a6212d047e84056bece408f7f7ac6867d23fe4 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 14:32:58 +0000 Subject: [PATCH 170/197] style --- src/transformers/audio_utils.py | 28 ++++++++++--------- .../models/clap/feature_extraction_clap.py | 7 +++-- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/transformers/audio_utils.py b/src/transformers/audio_utils.py index 4beceb679a3a..8c22b135bc42 100644 --- a/src/transformers/audio_utils.py +++ b/src/transformers/audio_utils.py @@ -144,7 +144,7 @@ def get_mel_filter_banks( This code is heavily inspired from the *torchaudio* implementation, see [here](https://pytorch.org/audio/stable/transforms.html) for more details. - + Note: Different banks of Mel filters were introduced in the litterature. The following variation are supported: - MFCC FB-20: introduced in 1980 by Davis and Mermelstein, it assumes a sampling frequency of 10 kHertz @@ -175,8 +175,8 @@ def get_mel_filter_banks( Scale to use: `htk` or `slaney`. (Default: `htk`) Returns: - `np.ndarray`: Triangular filter banks (fb matrix) of shape (`nb_frequency_bins`, `nb_mel_filters`). This matrix is a - projection matrix to go from a spectrogram to a Mel Spectrogram. + `np.ndarray`: Triangular filter banks (fb matrix) of shape (`nb_frequency_bins`, `nb_mel_filters`). 
This matrix + is a projection matrix to go from a spectrogram to a Mel Spectrogram. """ @@ -288,7 +288,9 @@ def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int = frame = waveform[i : i + fft_window_size] frame_width = frame.shape[0] if frame_width < waveform.shape[0]: - frame = np.lib.pad(frame, pad_width=(0, fft_window_size - frame_width), mode="constant", constant_values=0) + frame = np.lib.pad( + frame, pad_width=(0, fft_window_size - frame_width), mode="constant", constant_values=0 + ) frames.append(frame) frames = np.stack(frames, 0) @@ -305,17 +307,17 @@ def stft(frames: np.array, windowing_function: np.array, fft_window_size: int = A framed audio signal obtained using `audio_utils.fram_wav`. windowing_function (`np.array` of dimension `(nb_frequency_bins, nb_mel_filters)`: A array reprensenting the function that will be used to reduces the amplitude of the discontinuities at the - boundaries of each frame when computing the STFT. Each frame will be multiplied by the windowing_function. For more - information on the discontinuities, called *Spectral leakage*, refer to [this + boundaries of each frame when computing the STFT. Each frame will be multiplied by the windowing_function. + For more information on the discontinuities, called *Spectral leakage*, refer to [this tutorial]https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf fft_window_size (`int`, *optional*): - Size of the window om which the Fourier transform is applied. This controls the frequency resolution of the spectrogram. - 400 means that the fourrier transform is computed on windows of 400 samples. The number of frequency bins (`nb_frequency_bins`) - used to divide the window into equal strips is equal to `(1+fft_window_size)//2`. An increase of the fft_window_size slows the - calculus time proportionnally. - + Size of the window om which the Fourier transform is applied. This controls the frequency resolution of the + spectrogram. 400 means that the fourrier transform is computed on windows of 400 samples. The number of + frequency bins (`nb_frequency_bins`) used to divide the window into equal strips is equal to + `(1+fft_window_size)//2`. An increase of the fft_window_size slows the calculus time proportionnally. + Returns: - spectrogram (`np.ndarray`): + spectrogram (`np.ndarray`): A spectrogram of shape `(num_frames, nb_frequency_bins)` obtained using the STFT algorithm """ frame_size = frames.shape[1] @@ -337,4 +339,4 @@ def stft(frames: np.array, windowing_function: np.array, fft_window_size: int = else: fft_signal[:frame_size] = frame spectrogram[f] = fft(fft_signal, axis=0)[:nb_frequency_bins] - return spectrogram.T \ No newline at end of file + return spectrogram.T diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index d18e7ea475c6..ed9d974e4154 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -42,7 +42,8 @@ class ClapFeatureExtractor(SequenceFeatureExtractor): Args: feature_size (`int`, defaults to 64): - The feature dimension of the extracted Mel spectrograms. This corresponds to the number of mel filters (`n_mels`). + The feature dimension of the extracted Mel spectrograms. This corresponds to the number of mel filters + (`n_mels`). sampling_rate (`int`, defaults to 16000): The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). 
This only serves to warn users if the audio fed to the feature extractor does not have the same sampling rate. @@ -52,8 +53,8 @@ class ClapFeatureExtractor(SequenceFeatureExtractor): max_length_s (`int`, defaults to 10): The maximum input lenght of the model in seconds. This is used to pad the audio. fft_window_size (`int`, defaults to 400): - Size of the window (in samples) on which the Fourier transform is applied. This controls the frequency resolution of the spectrogram. - 400 means that the fourrier transform is computed on windows of 400 samples. + Size of the window (in samples) on which the Fourier transform is applied. This controls the frequency + resolution of the spectrogram. 400 means that the fourrier transform is computed on windows of 400 samples. padding_value (`float`, *optional*, defaults to 0.0): Padding value used to pad the audio. Should correspond to silences. return_attention_mask (`bool`, *optional*, defaults to `False`): From b29ee047fe908d66a9895982bad41380aa00f601 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 13 Feb 2023 14:55:13 +0000 Subject: [PATCH 171/197] clean up --- src/transformers/models/clap/modeling_clap.py | 54 ++++++++++--------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index f905c1c69a33..581cb81e8a56 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -924,12 +924,12 @@ def __init__(self, config): ) self.head = nn.Linear(config.num_classes, config.num_classes) - def reshape_mel2img(self, normalixed_input_features): + def reshape_mel2img(self, normalized_input_features): """ The input is 4 normalized log mel spectrograms. It is reshape to the common shape of images. Each channel should represent 1 of the 4 crops of the spectrogram. For more details, refer to the [`ClapFeatureExtractor`]. 
""" - _, _, time_steps, freq_steps = normalixed_input_features.shape + _, _, time_steps, freq_steps = normalized_input_features.shape target_T = int(self.spec_size * self.freq_ratio) target_F = self.spec_size // self.freq_ratio @@ -939,38 +939,44 @@ def reshape_mel2img(self, normalixed_input_features): # to avoid bicubic zero error if time_steps < target_T: - normalixed_input_features = nn.functional.interpolate( - normalixed_input_features, - (target_T, normalixed_input_features.shape[3]), + normalized_input_features = nn.functional.interpolate( + normalized_input_features, + (target_T, normalized_input_features.shape[3]), mode="bicubic", align_corners=True, ) if freq_steps < target_F: - normalixed_input_features = nn.functional.interpolate( - normalixed_input_features, - (normalixed_input_features.shape[2], target_F), + normalized_input_features = nn.functional.interpolate( + normalized_input_features, + (normalized_input_features.shape[2], target_F), mode="bicubic", align_corners=True, ) - normalixed_input_features = normalixed_input_features.permute(0, 1, 3, 2).contiguous() - normalixed_input_features = normalixed_input_features.reshape( - normalixed_input_features.shape[0], - normalixed_input_features.shape[1], - normalixed_input_features.shape[2], + # batch_size, num_channels, target_T, target_F --> batch_size, num_channels, target_F, target_T + normalized_input_features = normalized_input_features.permute(0, 1, 3, 2).contiguous() + + # batch_size, num_channels, target_F, target_T --> batch_size, num_channels, target_F, freq_ratio, target_T/freq_ratio + normalized_input_features = normalized_input_features.reshape( + normalized_input_features.shape[0], + normalized_input_features.shape[1], + normalized_input_features.shape[2], self.freq_ratio, - normalixed_input_features.shape[3] // self.freq_ratio, + normalized_input_features.shape[3] // self.freq_ratio, ) - normalixed_input_features = normalixed_input_features.permute(0, 1, 3, 2, 4).contiguous() - normalixed_input_features = normalixed_input_features.reshape( - normalixed_input_features.shape[0], - normalixed_input_features.shape[1], - normalixed_input_features.shape[2] * normalixed_input_features.shape[3], - normalixed_input_features.shape[4], + # batch_size, num_channels, target_F, freq_ratio, target_T/freq_ratio --> batch_size, num_channels, target_F, freq_ratio, target_T/freq_ratio + normalized_input_features = normalized_input_features.permute(0, 1, 3, 2, 4).contiguous() + + # batch_size, num_channels, target_F/freq_ratio, freq_ratio, target_T/freq_ratio --> batch_size, num_channels, target_F * freq_ratio, target_T/freq_ratio + normalized_input_features = normalized_input_features.reshape( + normalized_input_features.shape[0], + normalized_input_features.shape[1], + normalized_input_features.shape[2] * normalized_input_features.shape[3], + normalized_input_features.shape[4], ) - return normalixed_input_features + return normalized_input_features def forward( self, @@ -984,15 +990,15 @@ def forward( return_dict: Optional[bool] = True, ) -> Union[Tuple, ClapAudioModelOutput]: input_features = input_features.transpose(1, 3) - normalixed_input_features = self.bn0(input_features) - normalixed_input_features = normalixed_input_features.transpose(1, 3) + normalized_input_features = self.bn0(input_features) + normalized_input_features = normalized_input_features.transpose(1, 3) is_longer_list_idx = None if self.enable_fusion: is_longer_list = is_longer.to(input_features.device) is_longer_list_idx = torch.where(is_longer_list == 1)[0] 
- hidden_states = self.reshape_mel2img(normalixed_input_features) + hidden_states = self.reshape_mel2img(normalized_input_features) frames_num = hidden_states.shape[2] From 56d2b00e6768dcd707742d7203b1b6e91e4feace Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 15:00:29 +0000 Subject: [PATCH 172/197] update doc and typos --- docs/source/en/_toctree.yml | 2 +- src/transformers/audio_utils.py | 24 +++++++++++++------ .../models/clap/configuration_clap.py | 2 +- .../models/clap/feature_extraction_clap.py | 2 +- 4 files changed, 20 insertions(+), 10 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index f30cc0a40503..9001e3eba3da 100755 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -611,7 +611,7 @@ - local: internal/image_processing_utils title: Utilities for Image Processors - local: internal/audio_utils - title: Utilities for audio processing + title: Utilities for Audio processing - local: internal/file_utils title: General Utilities title: Internal Helpers diff --git a/src/transformers/audio_utils.py b/src/transformers/audio_utils.py index 8c22b135bc42..20dae32a7fca 100644 --- a/src/transformers/audio_utils.py +++ b/src/transformers/audio_utils.py @@ -142,7 +142,7 @@ def get_mel_filter_banks( filters are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these features is to approximate the non-linear human perception of the variation in pitch with respect to the frequency. This code is heavily inspired from the *torchaudio* implementation, see - [here](https://pytorch.org/audio/stable/transforms.html) for more details. + ![here](https://pytorch.org/audio/stable/transforms.html) for more details. Note: @@ -215,7 +215,7 @@ def get_mel_filter_banks( def power_to_db(mel_spectrogram, top_db=None, a_min=1e-10, ref=1.0): """ Convert a mel spectrogram from power to db scale, this function is the numpy implementation of librosa.power_to_lb. - It computes 10 * log10(mel_spectrogram / ref), using basic log properties for stability. + It computes `10 * log10(mel_spectrogram / ref)`, using basic log properties for stability. Note: The motivation behind applying the log function on the mel spectrogram is that humans do not hear loudness on a @@ -242,7 +242,7 @@ def power_to_db(mel_spectrogram, top_db=None, a_min=1e-10, ref=1.0): log_spec = np.clip(log_spec, min=np.maximum(log_spec) - top_db, max=np.inf) return log_spec - +#TODO @ArthurZucker: This method does not support batching yet as we are mainly focus on inference. def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int = 400, center: bool = True): """ In order to compute the short time fourier transform, the waveform needs to be split in overlapping windowed @@ -251,8 +251,6 @@ def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int = The window length (window_length) defines how much of the signal is contained in each frame, while the hop length defines the step between the beginning of each new frame. - TODO @Arthur **This method does not support batching yet as we are mainly focus on inference. 
If you want this to - be added feel free to open an issue and ping @arthurzucker on Github** Args: waveform (`np.array`) of shape (sample_length,): @@ -296,12 +294,13 @@ def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int = frames = np.stack(frames, 0) return frames +#TODO @ArthurZucker: This method does not support batching yet as we are mainly focus on inference. def stft(frames: np.array, windowing_function: np.array, fft_window_size: int = None): """ Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same results - as `torch.stft`. TODO @Arthur batching this could allow more usage, good first issue. - + as `torch.stft`. + Args: frames (`np.array` of dimension `(num_frames, fft_window_size)`): A framed audio signal obtained using `audio_utils.fram_wav`. @@ -316,6 +315,17 @@ def stft(frames: np.array, windowing_function: np.array, fft_window_size: int = frequency bins (`nb_frequency_bins`) used to divide the window into equal strips is equal to `(1+fft_window_size)//2`. An increase of the fft_window_size slows the calculus time proportionnally. + Example: + + ```python + >>> from transformers.audio_utils import stft, fram_wave + >>> import numpy as np + >>> audio = np.random.rand(50) + >>> fft_window_size = 10; hop_length = 2 + >>> framed_audio = fram_wave(audio, hop_length, fft_window_size) + >>> spectrogram = stft(framed_audio, np.hanning(fft_window_size + 1)) + ``` + Returns: spectrogram (`np.ndarray`): A spectrogram of shape `(num_frames, nb_frequency_bins)` obtained using the STFT algorithm diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index ef2a0a6d525f..a611e377b8f0 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -199,7 +199,7 @@ class ClapAudioConfig(PretrainedConfig): Number of attention heads used for the Swin Layers of the audio model enable_fusion (`bool`, *optional*, defaults to `False`): Whether or not to enable patch fusion. This is the main contribution of the authors, and should give the - best results. Patch fusion will #TODO describe what it does + best results. hidden_dropout_prob (`float`, *optional*, defaults to 0.1): The dropout probabilitiy for all fully connected layers in the encoder. fusion_type (`[type]`, *optional*): diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index ed9d974e4154..6895a1f5b056 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -157,7 +157,7 @@ def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[n filter banks are used depending on the truncation pattern: - `self.mel_filters`: they correspond to the defaults parameters of `torchaduio` which can be obtained from calling `torchaudio.transforms.MelSpectrogram().mel_scale.fb`. These filters are used when `truncation` - is set to `fuison`. + is set to `"fusion"`. - `self.mel_filteres_slaney` : they correspond to the defaults parameters of `torchlibrosa` which used `librosa.filters.mel` when computing the mel spectrogram. These filters were only used in the original implementation when the truncation mode is not `"fusion"`. 
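Putting the helpers documented in this patch together, the log-mel pipeline used by `ClapFeatureExtractor._np_extract_fbank_features` can be sketched end to end as below. This assumes the functions land in `transformers.audio_utils` with the signatures shown in the hunks above; the 48 kHz / 1024 / 480 / 64 values mirror the feature-extractor defaults in this series, while the mel frequency range is only an illustrative choice.

```python
import numpy as np

from transformers.audio_utils import fram_wave, get_mel_filter_banks, power_to_db, stft

sampling_rate, fft_window_size, hop_length, n_mels = 48_000, 1024, 480, 64
waveform = np.random.rand(sampling_rate)  # one second of fake audio

# periodic Hann window, as built inside the feature extractor
window = np.hanning(fft_window_size + 1)[:-1]

frames = fram_wave(waveform, hop_length, fft_window_size)  # (num_frames, fft_window_size)
spectrogram = stft(frames, window, fft_window_size=fft_window_size)  # (nb_frequency_bins, num_frames)
magnitudes = np.abs(spectrogram) ** 2

mel_filters = get_mel_filter_banks(
    nb_frequency_bins=(fft_window_size >> 1) + 1,
    nb_mel_filters=n_mels,
    frequency_min=0,
    frequency_max=sampling_rate // 2,  # illustrative upper bound, not necessarily the checkpoint value
    sample_rate=sampling_rate,
    norm=None,
    mel_scale="htk",
)  # (nb_frequency_bins, n_mels)

log_mel = power_to_db(np.matmul(mel_filters.T, magnitudes)).T  # (num_frames, n_mels)
print(log_mel.shape)
```

The resulting `(num_frames, 64)` log-mel matrix is what the feature extractor then pads, truncates or fuses depending on the `truncation` and `padding` settings described earlier.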
From a5d7cd1c0b292421045f37f5686c5bfd21b290c8 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 13 Feb 2023 15:42:41 +0000 Subject: [PATCH 173/197] fix doctest --- .../models/clap/configuration_clap.py | 3 +- src/transformers/models/clap/modeling_clap.py | 30 ++++++++++--------- .../models/clap/processing_clap.py | 2 +- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index ef2a0a6d525f..1199c6fc3f22 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -25,7 +25,8 @@ logger = logging.get_logger(__name__) CLAP_PRETRAINED_MODEL_ARCHIVE_LIST = { - "laion-ai/base": "https://huggingface.co/laion-ai/base/resolve/main/config.json", + "laion-ai/clap-htsat-fused": "https://huggingface.co/laion-ai/clap-htsat-fused/resolve/main/config.json", + "laion-ai/clap-htsat-unfused": "https://huggingface.co/laion-ai/clap-htsat-unfused/resolve/main/config.json", } diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 581cb81e8a56..2156237a3fea 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -47,7 +47,7 @@ CLAP_PRETRAINED_MODEL_ARCHIVE_LIST = [ "laion-ai/clap-htsat-fused", - "laion-ai/clap-htsat-unfused", + "ybelkada/clap-htsat-unfused", # See all clap models at https://huggingface.co/models?filter=clap ] @@ -1907,8 +1907,8 @@ def forward( >>> dataset = load_dataset("ashraq/esc50") >>> audio_sample = dataset["train"]["audio"][0]["array"] - >>> model = ClapAudioModel.from_pretrained("laionai/clap-hsat-fused") - >>> processor = AutoProcessor.from_pretrained("laionai/clap-hsat-fused") + >>> model = ClapAudioModel.from_pretrained("ybelkada/clap-htsat-fused") + >>> processor = AutoProcessor.from_pretrained("ybelkada/clap-htsat-fused") >>> inputs = processor(audios=audio_sample, return_tensors="pt") @@ -2159,8 +2159,8 @@ def get_text_features( ```python >>> from transformers import AutoTokenizer, ClapModel - >>> model = ClapModel.from_pretrained("laion-ai/clap-htsat-unfused") - >>> tokenizer = AutoTokenizer.from_pretrained("laion-ai/clap-htsat-unfused") + >>> model = ClapModel.from_pretrained("ybelkada/clap-htsat-unfused") + >>> tokenizer = AutoTokenizer.from_pretrained("ybelkada/clap-htsat-unfused") >>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt") >>> text_features = model.get_text_features(**inputs) @@ -2236,18 +2236,18 @@ def forward( Examples: ```python - >>> from dataset import load_dataset + >>> from datasets import load_dataset >>> from transformers import AutoProcessor, ClapModel >>> dataset = load_dataset("ashraq/esc50") >>> audio_sample = dataset["train"]["audio"][0]["array"] - >>> model = ClapModel.from_pretrained("laion-ai/clap-htst-unfused-base") - >>> processor = AutoProcessor.from_pretrained("laion-ai/clap-htst-unfused-base") + >>> model = ClapModel.from_pretrained("ybelkada/clap-htsat-unfused") + >>> processor = AutoProcessor.from_pretrained("ybelkada/clap-htsat-unfused") >>> input_text = ["Sound of a dog", "Sound of vaccum cleaner"] - >>> inputs = processor(text=input_text, audio=audio_sample, return_tensors="pt", padding=True) + >>> inputs = processor(text=input_text, audios=audio_sample, return_tensors="pt", padding=True) >>> outputs = model(**inputs) >>> logits_per_audio = outputs.logits_per_audio # this is the audio-text 
similarity score @@ -2354,8 +2354,8 @@ def forward( ```python >>> from transformers import AutoTokenizer, ClapTextModelWithProjection - >>> model = ClapTextModelWithProjection.from_pretrained("laion-ai/clap-htsat-unfused") - >>> tokenizer = AutoTokenizer.from_pretrained("laion-ai/clap-htsat-unfused") + >>> model = ClapTextModelWithProjection.from_pretrained("ybelkada/clap-htsat-unfused") + >>> tokenizer = AutoTokenizer.from_pretrained("ybelkada/clap-htsat-unfused") >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") @@ -2416,6 +2416,7 @@ def get_input_embeddings(self) -> nn.Module: def forward( self, input_features: Optional[torch.FloatTensor] = None, + is_longer: Optional[torch.BoolTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -2429,13 +2430,13 @@ def forward( >>> from datasets import load_dataset >>> from transformers import ClapAudioModelWithProjection, ClapProcessor - >>> model = ClapAudioModelWithProjection.from_pretrained("laion-ai/clap-htsat-unfused") - >>> processor = ClapProcessor.from_pretrained("laion-ai/clap-htsat-unfused") + >>> model = ClapAudioModelWithProjection.from_pretrained("ybelkada/clap-htsat-fused") + >>> processor = ClapProcessor.from_pretrained("ybelkada/clap-htsat-fused") >>> dataset = load_dataset("ashraq/esc50") >>> audio_sample = dataset["train"]["audio"][0]["array"] - >>> inputs = processor(audio=audio_sample, return_tensors="pt") + >>> inputs = processor(audios=audio_sample, return_tensors="pt") >>> outputs = model(**inputs) >>> audio_embeds = outputs.audio_embeds ```""" @@ -2447,6 +2448,7 @@ def forward( audio_outputs = self.audio_model( input_features=input_features, + is_longer=is_longer, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, diff --git a/src/transformers/models/clap/processing_clap.py b/src/transformers/models/clap/processing_clap.py index 4bec589d731e..a9c0c6e5b1a8 100644 --- a/src/transformers/models/clap/processing_clap.py +++ b/src/transformers/models/clap/processing_clap.py @@ -88,7 +88,7 @@ def __call__(self, text=None, audios=None, return_tensors=None, **kwargs): ) if text is not None and audios is not None: - encoding["audio_features"] = audio_features.input_features + encoding["input_features"] = audio_features.input_features return encoding elif text is not None: return encoding From d5376eabc3faa89f4c8030f4257aed90ff94d47a Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 17:07:34 +0000 Subject: [PATCH 174/197] update modelin code, got rid of a lot of reshaping --- src/transformers/models/clap/modeling_clap.py | 32 ++++--------------- 1 file changed, 7 insertions(+), 25 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 581cb81e8a56..96f7e0bde169 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -940,40 +940,22 @@ def reshape_mel2img(self, normalized_input_features): # to avoid bicubic zero error if time_steps < target_T: normalized_input_features = nn.functional.interpolate( - normalized_input_features, - (target_T, normalized_input_features.shape[3]), - mode="bicubic", - align_corners=True, + normalized_input_features, (target_T, freq_steps), mode="bicubic", align_corners=True ) if freq_steps < target_F: normalized_input_features = nn.functional.interpolate( - 
normalized_input_features, - (normalized_input_features.shape[2], target_F), - mode="bicubic", - align_corners=True, + normalized_input_features, (time_steps, target_F), mode="bicubic", align_corners=True ) - # batch_size, num_channels, target_T, target_F --> batch_size, num_channels, target_F, target_T - normalized_input_features = normalized_input_features.permute(0, 1, 3, 2).contiguous() + batch, channels, time, freq = normalized_input_features.shape - # batch_size, num_channels, target_F, target_T --> batch_size, num_channels, target_F, freq_ratio, target_T/freq_ratio + # batch_size, channels, target_T, target_F --> batch_size, channels, target_F * freq_ratio, target_T // freq_ratio normalized_input_features = normalized_input_features.reshape( - normalized_input_features.shape[0], - normalized_input_features.shape[1], - normalized_input_features.shape[2], - self.freq_ratio, - normalized_input_features.shape[3] // self.freq_ratio, + batch, channels * self.freq_ratio, time // self.freq_ratio, freq ) - - # batch_size, num_channels, target_F, freq_ratio, target_T/freq_ratio --> batch_size, num_channels, target_F, freq_ratio, target_T/freq_ratio - normalized_input_features = normalized_input_features.permute(0, 1, 3, 2, 4).contiguous() - - # batch_size, num_channels, target_F/freq_ratio, freq_ratio, target_T/freq_ratio --> batch_size, num_channels, target_F * freq_ratio, target_T/freq_ratio + normalized_input_features = normalized_input_features.permute(0, 1, 3, 2).contiguous() normalized_input_features = normalized_input_features.reshape( - normalized_input_features.shape[0], - normalized_input_features.shape[1], - normalized_input_features.shape[2] * normalized_input_features.shape[3], - normalized_input_features.shape[4], + batch, channels, freq * self.freq_ratio, time // self.freq_ratio ) return normalized_input_features From f4b0441cd9492b9600ef586adc84439133ab7ef5 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 17:07:53 +0000 Subject: [PATCH 175/197] style on added doc audio utils --- src/transformers/audio_utils.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/transformers/audio_utils.py b/src/transformers/audio_utils.py index 20dae32a7fca..50c80c03a476 100644 --- a/src/transformers/audio_utils.py +++ b/src/transformers/audio_utils.py @@ -242,7 +242,8 @@ def power_to_db(mel_spectrogram, top_db=None, a_min=1e-10, ref=1.0): log_spec = np.clip(log_spec, min=np.maximum(log_spec) - top_db, max=np.inf) return log_spec -#TODO @ArthurZucker: This method does not support batching yet as we are mainly focus on inference. + +# TODO @ArthurZucker: This method does not support batching yet as we are mainly focus on inference. def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int = 400, center: bool = True): """ In order to compute the short time fourier transform, the waveform needs to be split in overlapping windowed @@ -294,13 +295,15 @@ def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int = frames = np.stack(frames, 0) return frames -#TODO @ArthurZucker: This method does not support batching yet as we are mainly focus on inference. + +# TODO @ArthurZucker: This method does not support batching yet as we are mainly focus on inference. + def stft(frames: np.array, windowing_function: np.array, fft_window_size: int = None): """ Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same results - as `torch.stft`. - + as `torch.stft`. 
+ Args: frames (`np.array` of dimension `(num_frames, fft_window_size)`): A framed audio signal obtained using `audio_utils.fram_wav`. @@ -316,16 +319,18 @@ def stft(frames: np.array, windowing_function: np.array, fft_window_size: int = `(1+fft_window_size)//2`. An increase of the fft_window_size slows the calculus time proportionnally. Example: - + ```python >>> from transformers.audio_utils import stft, fram_wave >>> import numpy as np + >>> audio = np.random.rand(50) - >>> fft_window_size = 10; hop_length = 2 + >>> fft_window_size = 10 + >>> hop_length = 2 >>> framed_audio = fram_wave(audio, hop_length, fft_window_size) >>> spectrogram = stft(framed_audio, np.hanning(fft_window_size + 1)) ``` - + Returns: spectrogram (`np.ndarray`): A spectrogram of shape `(num_frames, nb_frequency_bins)` obtained using the STFT algorithm From ad8219825454ac01fabc78886d2a2a19129ce73c Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 17:30:44 +0000 Subject: [PATCH 176/197] update modeling clap --- src/transformers/models/clap/modeling_clap.py | 22 ++++++------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index e19f4530068c..76235dc3c48d 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -411,25 +411,17 @@ def forward(self, hidden_states, is_longer_idx=None): # local processing local_hidden_states = hidden_states[is_longer_idx, 1:, :, :].contiguous() batch_size, num_channels, height, width = local_hidden_states.shape - local_hidden_states = local_hidden_states.view(batch_size * num_channels, 1, height, width) + local_hidden_states = self.mel_conv2d(local_hidden_states) - local_hidden_states = local_hidden_states.view( - batch_size, - num_channels, - local_hidden_states.size(1), - local_hidden_states.size(2), - local_hidden_states.size(3), - ) + + _, features, height, width = local_hidden_states.shape + local_hidden_states = local_hidden_states.view(batch_size,num_channels,features,height,width) local_hidden_states = local_hidden_states.permute((0, 2, 3, 1, 4)).contiguous().flatten(3) - + local_width = local_hidden_states.size(-1) - if local_width < output_width: - local_hidden_states = torch.nn.functional.pad( - local_hidden_states, (0, output_width - local_width), "constant", 0 - ) - else: - local_hidden_states = local_hidden_states[:, :, :, :output_width] + local_hidden_states = torch.nn.functional.pad(local_hidden_states, (0, output_width - local_width), "constant", 0) + global_hidden_states[is_longer_idx] = self.fusion_model( global_hidden_states[is_longer_idx], local_hidden_states From 6ed76faecd6f8b81e2b267cc1907953e33ac824b Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 13 Feb 2023 17:31:12 +0000 Subject: [PATCH 177/197] style --- src/transformers/models/clap/modeling_clap.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 76235dc3c48d..620fd17768db 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -412,16 +412,17 @@ def forward(self, hidden_states, is_longer_idx=None): local_hidden_states = hidden_states[is_longer_idx, 1:, :, :].contiguous() batch_size, num_channels, height, width = local_hidden_states.shape local_hidden_states = local_hidden_states.view(batch_size * num_channels, 1, height, width) - 
+ local_hidden_states = self.mel_conv2d(local_hidden_states) - + _, features, height, width = local_hidden_states.shape - local_hidden_states = local_hidden_states.view(batch_size,num_channels,features,height,width) + local_hidden_states = local_hidden_states.view(batch_size, num_channels, features, height, width) local_hidden_states = local_hidden_states.permute((0, 2, 3, 1, 4)).contiguous().flatten(3) - - local_width = local_hidden_states.size(-1) - local_hidden_states = torch.nn.functional.pad(local_hidden_states, (0, output_width - local_width), "constant", 0) + local_width = local_hidden_states.size(-1) + local_hidden_states = torch.nn.functional.pad( + local_hidden_states, (0, output_width - local_width), "constant", 0 + ) global_hidden_states[is_longer_idx] = self.fusion_model( global_hidden_states[is_longer_idx], local_hidden_states From af97adec17b37c9d09331b9c1fa88672e7f11bb7 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Tue, 14 Feb 2023 03:28:14 +0100 Subject: [PATCH 178/197] Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- docs/source/en/internal/audio_utils.mdx | 2 +- src/transformers/audio_utils.py | 8 ++++---- src/transformers/models/clap/configuration_clap.py | 2 +- .../models/clap/feature_extraction_clap.py | 8 ++++---- src/transformers/models/clap/modeling_clap.py | 10 +++++----- src/transformers/models/clap/processing_clap.py | 2 +- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/source/en/internal/audio_utils.mdx b/docs/source/en/internal/audio_utils.mdx index 9c603b605777..8f1d6597149d 100644 --- a/docs/source/en/internal/audio_utils.mdx +++ b/docs/source/en/internal/audio_utils.mdx @@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License. # Utilities for `FeatureExtractors` -This page lists all the utility functions that can be used by the audio `FeatureExtractor` in order to compute special features from a raw audio using comon algorightms such as `Short Time Fourier Transform` or `Mel log spectrogram`. +This page lists all the utility functions that can be used by the audio [`FeatureExtractor`] in order to compute special features from a raw audio using common algorithms such as *Short Time Fourier Transform* or *Mel log spectrogram*. Most of those are only useful if you are studying the code of the image processors in the library. diff --git a/src/transformers/audio_utils.py b/src/transformers/audio_utils.py index 50c80c03a476..379a79bc749c 100644 --- a/src/transformers/audio_utils.py +++ b/src/transformers/audio_utils.py @@ -142,7 +142,7 @@ def get_mel_filter_banks( filters are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these features is to approximate the non-linear human perception of the variation in pitch with respect to the frequency. This code is heavily inspired from the *torchaudio* implementation, see - ![here](https://pytorch.org/audio/stable/transforms.html) for more details. + [here](https://pytorch.org/audio/stable/transforms.html) for more details. Note: @@ -171,8 +171,8 @@ def get_mel_filter_banks( Sample rate of the audio waveform. norm (`str`, *optional*): If "slaney", divide the triangular Mel weights by the width of the mel band (area normalization). - mel_scale (`str`, *optional*, `"htk"`): - Scale to use: `htk` or `slaney`. 
(Default: `htk`) + mel_scale (`str`, *optional*, defaults to `"htk"`): + Scale to use: `"htk"` or `"slaney"`. Returns: `np.ndarray`: Triangular filter banks (fb matrix) of shape (`nb_frequency_bins`, `nb_mel_filters`). This matrix @@ -254,7 +254,7 @@ def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int = Args: - waveform (`np.array`) of shape (sample_length,): + waveform (`np.array` of shape `(sample_length,)`): The raw waveform which will be split into smaller chunks. hop_length (`int`, *optional*, defaults to 160): Step between each window of the waveform. diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 24879afebc6e..7b608db49dcd 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Clap model configuration""" +""" CLAP model configuration""" import copy import os diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 6895a1f5b056..a43549318821 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Feature extractor class for Clap.""" +"""Feature extractor class for CLAP.""" import copy @@ -32,13 +32,13 @@ class ClapFeatureExtractor(SequenceFeatureExtractor): r""" - Constructs a Clap feature extractor. + Constructs a CLAP feature extractor. This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. - This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time - Fourier Transform` (STFT) which should match pytorch's `torch.stft` equivalent. + This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the *Short Time + Fourier Transform* (STFT) which should match pytorch's `torch.stft` equivalent. Args: feature_size (`int`, defaults to 64): diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 620fd17768db..c21c53a53792 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Clap model.""" +""" PyTorch CLAP model.""" import collections import math from dataclasses import dataclass @@ -1099,7 +1099,7 @@ def custom_forward(*inputs): ) -Clap_START_DOCSTRING = r""" +CLAP_START_DOCSTRING = r""" This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) @@ -1114,7 +1114,7 @@ def custom_forward(*inputs): configuration. 
Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ -Clap_TEXT_INPUTS_DOCSTRING = r""" +CLAP_TEXT_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide @@ -2290,7 +2290,7 @@ def forward( @add_start_docstrings( """ - Clap Text Model with a projection layer on top (a linear layer on top of the pooled output). + CLAP Text Model with a projection layer on top (a linear layer on top of the pooled output). """, Clap_START_DOCSTRING, ) @@ -2366,7 +2366,7 @@ def forward( @add_start_docstrings( """ - Clap Audio Model with a projection layer on top (a linear layer on top of the pooled output). + CLAP Audio Model with a projection layer on top (a linear layer on top of the pooled output). """, Clap_START_DOCSTRING, ) diff --git a/src/transformers/models/clap/processing_clap.py b/src/transformers/models/clap/processing_clap.py index a9c0c6e5b1a8..ea9185259771 100644 --- a/src/transformers/models/clap/processing_clap.py +++ b/src/transformers/models/clap/processing_clap.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Audio/Text processor class for Clap +Audio/Text processor class for CLAP """ from ...processing_utils import ProcessorMixin From c5749dac85dd3bb6eb15962e4ab013d3247d493a Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 14 Feb 2023 09:50:07 +0000 Subject: [PATCH 179/197] docstringvariables with CLAP --- src/transformers/models/clap/modeling_clap.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index c21c53a53792..d69a98ee4fe7 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -2074,7 +2074,7 @@ def forward( ) -@add_start_docstrings(Clap_START_DOCSTRING) +@add_start_docstrings(CLAP_START_DOCSTRING) class ClapModel(ClapPreTrainedModel): config_class = ClapConfig @@ -2114,7 +2114,7 @@ def __init__(self, config: ClapConfig): # Initialize weights and apply final processing self.post_init() - @add_start_docstrings_to_model_forward(Clap_TEXT_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(CLAP_TEXT_INPUTS_DOCSTRING) def get_text_features( self, input_ids: Optional[torch.Tensor] = None, @@ -2292,7 +2292,7 @@ def forward( """ CLAP Text Model with a projection layer on top (a linear layer on top of the pooled output). """, - Clap_START_DOCSTRING, + CLAP_START_DOCSTRING, ) class ClapTextModelWithProjection(ClapPreTrainedModel): config_class = ClapTextConfig @@ -2310,7 +2310,7 @@ def get_input_embeddings(self) -> nn.Module: def set_input_embeddings(self, value): self.text_model.embeddings.word_embeddings = value - @add_start_docstrings_to_model_forward(Clap_TEXT_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(CLAP_TEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=ClapTextModelOutput, config_class=ClapTextConfig) def forward( self, @@ -2368,7 +2368,7 @@ def forward( """ CLAP Audio Model with a projection layer on top (a linear layer on top of the pooled output). 
""", - Clap_START_DOCSTRING, + CLAP_START_DOCSTRING, ) class ClapAudioModelWithProjection(ClapPreTrainedModel): config_class = ClapAudioConfig From fd0bd4726f7125a26aff4cee9a21e592049bad6d Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 14 Feb 2023 09:55:48 +0000 Subject: [PATCH 180/197] rename key --- .../models/clap/convert_clap_original_pytorch_to_hf.py | 1 + src/transformers/models/clap/modeling_clap.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py b/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py index 21f79face787..528c48aea2fc 100644 --- a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py +++ b/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py @@ -32,6 +32,7 @@ "mlp.fc2": "output.dense", "norm1": "layernorm_before", "norm2": "layernorm_after", + "bn0": "batch_norm", } processor = AutoFeatureExtractor.from_pretrained("ybelkada/clap-htsat-unfused", truncation="rand_trunc") diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index d69a98ee4fe7..d2825649a6ec 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -901,7 +901,7 @@ def __init__(self, config): self.gradient_checkpointing = False - self.bn0 = nn.BatchNorm2d(config.num_mel_bins) + self.batch_norm = nn.BatchNorm2d(config.num_mel_bins) self.norm = nn.LayerNorm(self.num_features) self.depths = config.depths @@ -965,7 +965,7 @@ def forward( return_dict: Optional[bool] = True, ) -> Union[Tuple, ClapAudioModelOutput]: input_features = input_features.transpose(1, 3) - normalized_input_features = self.bn0(input_features) + normalized_input_features = self.batch_norm(input_features) normalized_input_features = normalized_input_features.transpose(1, 3) is_longer_list_idx = None From 13f41eb47f79c0fadf0a11fb3941a0e9e82f4a16 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 14 Feb 2023 09:57:15 +0000 Subject: [PATCH 181/197] update modeling CLAP --- src/transformers/models/clap/modeling_clap.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index d69a98ee4fe7..936e8e837577 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -58,14 +58,14 @@ def interpolate(hidden_states, ratio): Interpolate data in time domain. This is used to compensate the resolution reduction in downsampling of a CNN. Args: - hidden_states (`torch.FloatTensor` of shape (batch_size, time_steps, classes_num)): + hidden_states (`torch.FloatTensor` of shape (batch_size, time_length, classes_num)): Input hidden states ratio (`int`): The ratio of the length of the output to the length of the input. """ - (batch_size, time_steps, classes_num) = hidden_states.shape + (batch_size, time_length, classes_num) = hidden_states.shape upsampled = hidden_states[:, :, None, :].repeat(1, 1, ratio, 1) - upsampled = upsampled.reshape(batch_size, time_steps * ratio, classes_num) + upsampled = upsampled.reshape(batch_size, time_length * ratio, classes_num) return upsampled @@ -922,27 +922,27 @@ def reshape_mel2img(self, normalized_input_features): The input is 4 normalized log mel spectrograms. It is reshape to the common shape of images. Each channel should represent 1 of the 4 crops of the spectrogram. 
For more details, refer to the [`ClapFeatureExtractor`]. """ - _, _, time_steps, freq_steps = normalized_input_features.shape + _, _, time_length, freq_length = normalized_input_features.shape - target_T = int(self.spec_size * self.freq_ratio) - target_F = self.spec_size // self.freq_ratio + spec_width = int(self.spec_size * self.freq_ratio) + spec_heigth = self.spec_size // self.freq_ratio - if time_steps > target_T or freq_steps > target_F: + if time_length > spec_width or freq_length > spec_heigth: raise ValueError("the wav size should be less than or equal to the swin input size") # to avoid bicubic zero error - if time_steps < target_T: + if time_length < spec_width: normalized_input_features = nn.functional.interpolate( - normalized_input_features, (target_T, freq_steps), mode="bicubic", align_corners=True + normalized_input_features, (spec_width, freq_length), mode="bicubic", align_corners=True ) - if freq_steps < target_F: + if freq_length < spec_heigth: normalized_input_features = nn.functional.interpolate( - normalized_input_features, (time_steps, target_F), mode="bicubic", align_corners=True + normalized_input_features, (time_length, spec_heigth), mode="bicubic", align_corners=True ) batch, channels, time, freq = normalized_input_features.shape - # batch_size, channels, target_T, target_F --> batch_size, channels, target_F * freq_ratio, target_T // freq_ratio + # batch_size, channels, spec_width, spec_heigth --> batch_size, channels, spec_heigth * freq_ratio, spec_width // freq_ratio normalized_input_features = normalized_input_features.reshape( batch, channels * self.freq_ratio, time // self.freq_ratio, freq ) From 2655e136e9055ec187d572ec7f3e5ac5a88ae39a Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 14 Feb 2023 10:03:14 +0000 Subject: [PATCH 182/197] update audio utils docstring --- src/transformers/audio_utils.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/transformers/audio_utils.py b/src/transformers/audio_utils.py index 379a79bc749c..73bc041d6961 100644 --- a/src/transformers/audio_utils.py +++ b/src/transformers/audio_utils.py @@ -145,18 +145,18 @@ def get_mel_filter_banks( [here](https://pytorch.org/audio/stable/transforms.html) for more details. - Note: - Different banks of Mel filters were introduced in the litterature. The following variation are supported: + Tips: + - Different banks of Mel filters were introduced in the litterature. The following variation are supported: - MFCC FB-20: introduced in 1980 by Davis and Mermelstein, it assumes a sampling frequency of 10 kHertz - and a speech bandwidth of `[0, 4600]` Hertz + and a speech bandwidth of `[0, 4600]` Hertz - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) (1995) uses a filter bank of 24 filters for a - speech bandwidth `[0, 8000]` Hertz (sampling rate ≥ 16 kHertz). + speech bandwidth `[0, 8000]` Hertz (sampling rate ≥ 16 kHertz). - MFCC FB-40: from the Auditory Toolbox for MATLAB written by Slaney in 1998, assumes a sampling rate - of 16 kHertz, and speech bandwidth [133, 6854] Hertz. This version also includes an area normalization. + of 16 kHertz, and speech bandwidth [133, 6854] Hertz. This version also includes an area normalization. - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris (2004), assumes sampling - rate of 12.5 kHertz and speech bandwidth [0, 6250] Hertz - The default parameters of `torchaudio`'s mel filterbanks implement the `"htk"` filers while `torchlibrosa` uses - the `"slaney"` implementation. 
+ rate of 12.5 kHertz and speech bandwidth [0, 6250] Hertz + - The default parameters of `torchaudio`'s mel filterbanks implement the `"htk"` filers while `torchlibrosa` + uses the `"slaney"` implementation. Args: nb_frequency_bins (`int`): @@ -217,10 +217,12 @@ def power_to_db(mel_spectrogram, top_db=None, a_min=1e-10, ref=1.0): Convert a mel spectrogram from power to db scale, this function is the numpy implementation of librosa.power_to_lb. It computes `10 * log10(mel_spectrogram / ref)`, using basic log properties for stability. - Note: - The motivation behind applying the log function on the mel spectrogram is that humans do not hear loudness on a + Tips: + - The motivation behind applying the log function on the mel spectrogram is that humans do not hear loudness on + a linear scale. Generally to double the percieved volume of a sound we need to put 8 times as much energy into - it. This means that large variations in energy may not sound all that different if the sound is loud to begin + it. + - This means that large variations in energy may not sound all that different if the sound is loud to begin with. This compression operation makes the mel features match more closely what humans actually hear. Args: From dbe50a7f07b895f3fb6fdeeccd0a4630fcb20a9b Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 14 Feb 2023 10:07:06 +0000 Subject: [PATCH 183/197] update processing clap --- src/transformers/models/clap/processing_clap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/clap/processing_clap.py b/src/transformers/models/clap/processing_clap.py index ea9185259771..7492f102b4b2 100644 --- a/src/transformers/models/clap/processing_clap.py +++ b/src/transformers/models/clap/processing_clap.py @@ -22,7 +22,7 @@ class ClapProcessor(ProcessorMixin): r""" - Constructs a Clap processor which wraps a Clap feature extractor and a Clap tokenizer into a single processor. + Constructs a CLAP processor which wraps a CLAP feature extractor and a RoBerta tokenizer into a single processor. [`ClapProcessor`] offers all the functionalities of [`ClapFeatureExtractor`] and [`RobertaTokenizerFast`]. See the [`~ClapProcessor.__call__`] and [`~ClapProcessor.decode`] for more information. From 34255aa73ee27c1bc25dc15cb7f9ac0c565c639d Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 14 Feb 2023 10:08:22 +0000 Subject: [PATCH 184/197] fix readmes --- README.md | 3 ++- README_es.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/index.mdx | 2 +- 8 files changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b55d5023b665..bacf158a0cf5 100644 --- a/README.md +++ b/README.md @@ -295,7 +295,8 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. 
**[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. -1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. +1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. +1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. diff --git a/README_es.md b/README_es.md index 9201a11110fb..0eb5ca9b3fbb 100644 --- a/README_es.md +++ b/README_es.md @@ -289,6 +289,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. 1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. +1. 
**[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. diff --git a/README_hd.md b/README_hd.md index 64bb763ca077..94195eaf0d6e 100644 --- a/README_hd.md +++ b/README_hd.md @@ -261,6 +261,7 @@ conda install -c huggingface transformers 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google रिसर्च से) साथ में दिया गया पेपर [कैनाइन: प्री-ट्रेनिंग ए एफिशिएंट टोकनाइजेशन-फ्री एनकोडर फॉर लैंग्वेज रिप्रेजेंटेशन]( https://arxiv.org/abs/2103.06874) जोनाथन एच क्लार्क, डैन गैरेट, यूलिया टर्क, जॉन विएटिंग द्वारा। 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. 1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI से) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. द्वाराअनुसंधान पत्र [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) के साथ जारी किया गया +1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI से) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. द्वाराअनुसंधान पत्र [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) के साथ जारी किया गया 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI से) साथ वाला पेपर [लर्निंग ट्रांसफरेबल विजुअल मॉडल फ्रॉम नेचुरल लैंग्वेज सुपरविजन](https://arxiv.org /abs/2103.00020) एलेक रैडफोर्ड, जोंग वूक किम, क्रिस हैलासी, आदित्य रमेश, गेब्रियल गोह, संध्या अग्रवाल, गिरीश शास्त्री, अमांडा एस्केल, पामेला मिश्किन, जैक क्लार्क, ग्रेचेन क्रुएगर, इल्या सुत्स्केवर द्वारा। 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. 
**[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (सेल्सफोर्स से) साथ में पेपर [प्रोग्राम सिंथेसिस के लिए एक संवादात्मक प्रतिमान](https://arxiv.org/abs/2203.13474) एरिक निजकैंप, बो पैंग, हिरोआकी हयाशी, लिफू तू, हुआन वांग, यिंगबो झोउ, सिल्वियो सावरेस, कैमिंग जिओंग रिलीज। diff --git a/README_ja.md b/README_ja.md index ff745247d7b8..970b8a194a5a 100644 --- a/README_ja.md +++ b/README_ja.md @@ -323,6 +323,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research から) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting から公開された研究論文: [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys から) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou から公開された研究論文: [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI から) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. から公開された研究論文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) +1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI から) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. から公開された研究論文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI から) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever から公開された研究論文: [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen から) Timo Lüddecke and Alexander Ecker から公開された研究論文: [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (Salesforce から) Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong から公開された研究論文: [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) diff --git a/README_ko.md b/README_ko.md index 83af0bc684d6..42c99ad44ae0 100644 --- a/README_ko.md +++ b/README_ko.md @@ -238,6 +238,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research 에서) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 의 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 논문과 함께 발표했습니다. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys 에서) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 의 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 논문과 함께 발표했습니다. 1. 
**[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI 에서 제공)은 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.의 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687)논문과 함께 발표했습니다. +1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI 에서 제공)은 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.의 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687)논문과 함께 발표했습니다. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI 에서) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 의 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 논문과 함께 발표했습니다. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen 에서) Timo Lüddecke and Alexander Ecker 의 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 논문과 함께 발표했습니다. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (Salesforce 에서) Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 의 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index a3ed275f5fe2..03fb67b7af75 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -262,6 +262,7 @@ conda install -c huggingface transformers 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (来自 OFA-Sys) 伴随论文 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 由 An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 发布。 1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (来自 LAION-AI) 伴随论文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) 由 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov 发布。 +1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (来自 LAION-AI) 伴随论文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) 由 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov 发布。 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 1. 
**[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (来自 University of Göttingen) 伴随论文 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 由 Timo Lüddecke and Alexander Ecker 发布。 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 84da51d06dad..4aeee801b9be 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -274,6 +274,7 @@ conda install -c huggingface transformers 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. 1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. +1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 2087140f281f..19c3dfea350a 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -74,7 +74,7 @@ The documentation is organized into five sections: 1. 
**[CamemBERT](model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. -1. **[Clap](model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. +1. **[CLAP](model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 
From de162ebcdee182e35d0d795bcb6a317b75b6d116 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 14 Feb 2023 10:09:20 +0000 Subject: [PATCH 185/197] fix toctree --- docs/source/en/_toctree.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 9001e3eba3da..a2822bfc974a 100755 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -484,7 +484,7 @@ - local: model_doc/audio-spectrogram-transformer title: Audio Spectrogram Transformer - local: model_doc/clap - title: Clap + title: CLAP - local: model_doc/hubert title: Hubert - local: model_doc/mctct From 83d07162acc3cbe8f3e7e5be291fd07b71ec3933 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 14 Feb 2023 10:09:40 +0000 Subject: [PATCH 186/197] udpate configuration clap --- src/transformers/models/clap/configuration_clap.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 7b608db49dcd..9ee5d6855d5e 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -165,8 +165,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], class ClapAudioConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`ClapAudioModel`]. It is used to instantiate a - Clap audio encoder according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the audio encoder of the Clap + CLAP audio encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the audio encoder of the CLAP [laion-ai/base](https://huggingface.co/laion-ai/base) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the @@ -332,8 +332,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], class ClapConfig(PretrainedConfig): r""" [`ClapConfig`] is the configuration class to store the configuration of a [`ClapModel`]. It is used to instantiate - a Clap model according to the specified arguments, defining the text model and audio model configs. Instantiating a - configuration with the defaults will yield a similar configuration to that of the Clap + a CLAP model according to the specified arguments, defining the text model and audio model configs. Instantiating a + configuration with the defaults will yield a similar configuration to that of the CLAP [laion-ai/base](https://huggingface.co/laion-ai/base) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the @@ -347,7 +347,7 @@ class ClapConfig(PretrainedConfig): projection_dim (`int`, *optional*, defaults to 512): Dimentionality of text and audio projection layers. logit_scale_init_value (`float`, *optional*, defaults to 2.6592): - The inital value of the *logit_scale* paramter. Default is used as per the original Clap implementation. + The inital value of the *logit_scale* paramter. Default is used as per the original CLAP implementation. fusion_num_hidden_layers (`int`, *optional*, defaults to 2): Number of hidden layers in the fusion layer. 
projection_dim (`int`, *optional*, defaults to 512): From 230b5167942da06526c479ffdfd28b4892759bfb Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 14 Feb 2023 10:14:57 +0000 Subject: [PATCH 187/197] fix init --- src/transformers/models/clap/modeling_clap.py | 55 +++---------------- 1 file changed, 8 insertions(+), 47 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 016c8a302352..5bcff311e888 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1785,62 +1785,23 @@ def _init_weights(self, module): factor = self.config.initializer_factor if isinstance(module, ClapTextEmbeddings): - module.word_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02) module.position_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02) module.token_type_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02) - elif isinstance(module, ClapTextSelfAttention): - in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor - nn.init.normal_(module.query.weight, std=in_proj_std) - nn.init.normal_(module.key.weight, std=in_proj_std) - nn.init.normal_(module.value.weight, std=in_proj_std) - elif isinstance( - module, - ( - ClapTextSelfOutput, - ClapTextOutput, - ClapTextIntermediate, - ClapTextPooler, - ClapAudioSelfOutput, - ClapAudioIntermediate, - ClapAudioOutput, - ), - ): - in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor - nn.init.normal_(module.dense.weight, std=in_proj_std) - elif isinstance(module, ClapProjectionLayer): - in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor - nn.init.normal_(module.linear1.weight, std=in_proj_std) - nn.init.normal_(module.linear2.weight, std=in_proj_std) - elif isinstance(module, ClapAudioPatchEmbed): - in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor - nn.init.normal_(module.proj.weight, std=in_proj_std) - elif isinstance(module, ClapAudioSelfAttention): - in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor - nn.init.normal_(module.query.weight, std=in_proj_std) - nn.init.normal_(module.key.weight, std=in_proj_std) - nn.init.normal_(module.value.weight, std=in_proj_std) - elif isinstance(module, ClapAudioPatchMerging): - in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor - nn.init.normal_(module.reduction.weight, std=in_proj_std) - elif isinstance(module, ClapAudioEncoder): - in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor - nn.init.normal_(module.head.weight, std=in_proj_std) - elif isinstance(module, ClapFusionBlock): - nn.init.normal_(module.linear.weight, std=factor * 0.02) + elif isinstance(module, ClapModel): + nn.init.normal_(module.logit_scale_a, std=factor * 0.02) + nn.init.normal_(module.logit_scale_t, std=factor * 0.02) + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=factor * 0.02) - if isinstance(module, nn.LayerNorm): + elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() - if isinstance(module, nn.Conv2d): + elif isinstance(module, (nn.Conv2d, nn.Linear)): in_proj_std = 
(self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor nn.init.normal_(module.weight, std=in_proj_std) if module.bias is not None: module.bias.data.zero_() - if isinstance(module, ClapModel): - nn.init.normal_(module.logit_scale_a, std=factor * 0.02) - nn.init.normal_(module.logit_scale_t, std=factor * 0.02) + def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, ClapTextEncoder): From f379031f5c316a984598f870bd11f225d754b246 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 14 Feb 2023 10:15:12 +0000 Subject: [PATCH 188/197] make fixup --- src/transformers/models/clap/modeling_clap.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 5bcff311e888..3da1c2dc845b 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1801,7 +1801,6 @@ def _init_weights(self, module): nn.init.normal_(module.weight, std=in_proj_std) if module.bias is not None: module.bias.data.zero_() - def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, ClapTextEncoder): From fe1fbe32d34e873d68b2a7c24d4c18531a2384a2 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 14 Feb 2023 10:22:28 +0000 Subject: [PATCH 189/197] fix --- README.md | 1 - README_es.md | 1 - README_hd.md | 1 - README_ja.md | 1 - README_ko.md | 1 - README_zh-hans.md | 1 - docs/source/en/index.mdx | 2 +- 7 files changed, 1 insertion(+), 7 deletions(-) diff --git a/README.md b/README.md index bacf158a0cf5..7244b380f680 100644 --- a/README.md +++ b/README.md @@ -296,7 +296,6 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. -1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. 
**[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. diff --git a/README_es.md b/README_es.md index 0eb5ca9b3fbb..e046ffb4a073 100644 --- a/README_es.md +++ b/README_es.md @@ -288,7 +288,6 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. -1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. diff --git a/README_hd.md b/README_hd.md index 94195eaf0d6e..8959088672b1 100644 --- a/README_hd.md +++ b/README_hd.md @@ -260,7 +260,6 @@ conda install -c huggingface transformers 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (इनरिया/फेसबुक/सोरबोन से) साथ में कागज [CamemBERT: एक टेस्टी फ्रेंच लैंग्वेज मॉडल](https:// arxiv.org/abs/1911.03894) लुई मार्टिन*, बेंजामिन मुलर*, पेड्रो जेवियर ऑर्टिज़ सुआरेज़*, योआन ड्यूपॉन्ट, लॉरेंट रोमरी, एरिक विलेमोन्टे डे ला क्लर्जरी, जैमे सेडाह और बेनोइट सगोट द्वारा। 1. 
**[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google रिसर्च से) साथ में दिया गया पेपर [कैनाइन: प्री-ट्रेनिंग ए एफिशिएंट टोकनाइजेशन-फ्री एनकोडर फॉर लैंग्वेज रिप्रेजेंटेशन]( https://arxiv.org/abs/2103.06874) जोनाथन एच क्लार्क, डैन गैरेट, यूलिया टर्क, जॉन विएटिंग द्वारा। 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. -1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI से) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. द्वाराअनुसंधान पत्र [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) के साथ जारी किया गया 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI से) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. द्वाराअनुसंधान पत्र [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) के साथ जारी किया गया 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI से) साथ वाला पेपर [लर्निंग ट्रांसफरेबल विजुअल मॉडल फ्रॉम नेचुरल लैंग्वेज सुपरविजन](https://arxiv.org /abs/2103.00020) एलेक रैडफोर्ड, जोंग वूक किम, क्रिस हैलासी, आदित्य रमेश, गेब्रियल गोह, संध्या अग्रवाल, गिरीश शास्त्री, अमांडा एस्केल, पामेला मिश्किन, जैक क्लार्क, ग्रेचेन क्रुएगर, इल्या सुत्स्केवर द्वारा। 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. diff --git a/README_ja.md b/README_ja.md index 970b8a194a5a..0a56df93c138 100644 --- a/README_ja.md +++ b/README_ja.md @@ -322,7 +322,6 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne から) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot から公開された研究論文: [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research から) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting から公開された研究論文: [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys から) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou から公開された研究論文: [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) -1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI から) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. から公開された研究論文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) 1. 
**[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI から) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. から公開された研究論文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI から) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever から公開された研究論文: [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen から) Timo Lüddecke and Alexander Ecker から公開された研究論文: [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) diff --git a/README_ko.md b/README_ko.md index 42c99ad44ae0..ee2f4852af89 100644 --- a/README_ko.md +++ b/README_ko.md @@ -237,7 +237,6 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne 에서) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 의 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 논문과 함께 발표했습니다. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research 에서) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 의 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 논문과 함께 발표했습니다. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys 에서) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 의 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 논문과 함께 발표했습니다. -1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI 에서 제공)은 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.의 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687)논문과 함께 발표했습니다. 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI 에서 제공)은 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.의 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687)논문과 함께 발표했습니다. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI 에서) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 의 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 논문과 함께 발표했습니다. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen 에서) Timo Lüddecke and Alexander Ecker 의 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 논문과 함께 발표했습니다. 
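Illustrative sketch, not part of the patches: the README entries being deduplicated above all point at the same zero-shot audio-text model, and the pattern below mirrors the doctests added later in this series. The checkpoint id `laion/clap-htsat-unfused` is the one the series settles on in a later commit, so treat it as an assumption if you are on an earlier revision; the one-second sine wave simply stands in for a real recording.

```python
import numpy as np
import torch

from transformers import ClapModel, ClapProcessor

model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
processor = ClapProcessor.from_pretrained("laion/clap-htsat-unfused")

# One second of a 440 Hz tone at the 48 kHz rate the feature extractor is documented to expect.
sampling_rate = 48_000
audio = np.sin(2 * np.pi * 440 * np.arange(sampling_rate) / sampling_rate).astype(np.float32)
candidate_texts = ["Sound of a dog", "Sound of a vacuum cleaner"]

inputs = processor(text=candidate_texts, audios=audio, return_tensors="pt", padding=True)
with torch.no_grad():
    outputs = model(**inputs)

# Audio-text similarity scores, softmaxed over the candidate captions.
probs = outputs.logits_per_audio.softmax(dim=-1)
print(probs)
```

The `logits_per_audio` tensor used here is the same one the doctests introduced later in this series softmax to obtain label probabilities.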
diff --git a/README_zh-hans.md b/README_zh-hans.md index 03fb67b7af75..23543027caba 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -261,7 +261,6 @@ conda install -c huggingface transformers 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (来自 Inria/Facebook/Sorbonne) 伴随论文 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 由 Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 发布。 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (来自 OFA-Sys) 伴随论文 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 由 An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 发布。 -1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (来自 LAION-AI) 伴随论文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) 由 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov 发布。 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (来自 LAION-AI) 伴随论文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) 由 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov 发布。 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (来自 University of Göttingen) 伴随论文 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 由 Timo Lüddecke and Alexander Ecker 发布。 diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 19c3dfea350a..7b18da09a1df 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -261,7 +261,7 @@ Flax), PyTorch, and/or TensorFlow. | CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | | Chinese-CLIP | ❌ | ❌ | ✅ | ❌ | ❌ | -| Clap | ❌ | ❌ | ✅ | ❌ | ❌ | +| CLAP | ❌ | ❌ | ✅ | ❌ | ❌ | | CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | | CLIPSeg | ❌ | ❌ | ✅ | ❌ | ❌ | | CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ | From 75171e35df0a7f43d723bf5036f6e5d9c3bee673 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 14 Feb 2023 10:22:35 +0000 Subject: [PATCH 190/197] fix --- README_zh-hant.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README_zh-hant.md b/README_zh-hant.md index 4aeee801b9be..860e6e008bbf 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -273,7 +273,6 @@ conda install -c huggingface transformers 1. 
**[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. -1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. From 2fba86bf0428d2c5906d740204cddd554e495059 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 14 Feb 2023 10:28:22 +0000 Subject: [PATCH 191/197] update naming --- README.md | 1 - README_es.md | 1 - README_hd.md | 1 - README_ja.md | 1 - README_ko.md | 1 - README_zh-hans.md | 1 - README_zh-hant.md | 1 - docs/source/en/index.mdx | 2 +- src/transformers/models/auto/configuration_auto.py | 2 +- .../clap/convert_clap_original_pytorch_to_hf.py | 2 +- .../models/clap/feature_extraction_clap.py | 2 +- src/transformers/models/clap/modeling_clap.py | 14 +++++++------- tests/models/clap/test_modeling_clap.py | 8 ++++---- 13 files changed, 15 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index bacf158a0cf5..7244b380f680 100644 --- a/README.md +++ b/README.md @@ -296,7 +296,6 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. 
**[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. -1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from ) released with the paper []() by . 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. diff --git a/README_es.md b/README_es.md index 0eb5ca9b3fbb..e046ffb4a073 100644 --- a/README_es.md +++ b/README_es.md @@ -288,7 +288,6 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. -1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. 
**[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. diff --git a/README_hd.md b/README_hd.md index 94195eaf0d6e..8959088672b1 100644 --- a/README_hd.md +++ b/README_hd.md @@ -260,7 +260,6 @@ conda install -c huggingface transformers 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (इनरिया/फेसबुक/सोरबोन से) साथ में कागज [CamemBERT: एक टेस्टी फ्रेंच लैंग्वेज मॉडल](https:// arxiv.org/abs/1911.03894) लुई मार्टिन*, बेंजामिन मुलर*, पेड्रो जेवियर ऑर्टिज़ सुआरेज़*, योआन ड्यूपॉन्ट, लॉरेंट रोमरी, एरिक विलेमोन्टे डे ला क्लर्जरी, जैमे सेडाह और बेनोइट सगोट द्वारा। 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google रिसर्च से) साथ में दिया गया पेपर [कैनाइन: प्री-ट्रेनिंग ए एफिशिएंट टोकनाइजेशन-फ्री एनकोडर फॉर लैंग्वेज रिप्रेजेंटेशन]( https://arxiv.org/abs/2103.06874) जोनाथन एच क्लार्क, डैन गैरेट, यूलिया टर्क, जॉन विएटिंग द्वारा। 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. -1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI से) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. द्वाराअनुसंधान पत्र [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) के साथ जारी किया गया 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI से) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. द्वाराअनुसंधान पत्र [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) के साथ जारी किया गया 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI से) साथ वाला पेपर [लर्निंग ट्रांसफरेबल विजुअल मॉडल फ्रॉम नेचुरल लैंग्वेज सुपरविजन](https://arxiv.org /abs/2103.00020) एलेक रैडफोर्ड, जोंग वूक किम, क्रिस हैलासी, आदित्य रमेश, गेब्रियल गोह, संध्या अग्रवाल, गिरीश शास्त्री, अमांडा एस्केल, पामेला मिश्किन, जैक क्लार्क, ग्रेचेन क्रुएगर, इल्या सुत्स्केवर द्वारा। 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 
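For orientation while reading the modeling changes in this series: CLAP is trained with a CLIP-style symmetric contrastive objective, and the `contrastive_loss` helper, the `logit_scale_a` / `logit_scale_t` parameters, and the `logit_scale_init_value=(1 / 0.07)` default all appear in the diffs elsewhere in this PR. The self-contained sketch below is a simplification (one shared scale instead of the model's separate audio and text scales) meant only to illustrate the shape of that loss; it is not code taken from the patch.

```python
import torch
import torch.nn.functional as F


# Mirrors the `contrastive_loss` helper added in modeling_clap.py: the i-th audio clip
# is the positive match for the i-th caption, so the targets are the diagonal indices.
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    labels = torch.arange(len(logits), device=logits.device)
    return F.cross_entropy(logits, labels)


batch_size, projection_dim = 4, 512
text_embeds = F.normalize(torch.randn(batch_size, projection_dim), dim=-1)
audio_embeds = F.normalize(torch.randn(batch_size, projection_dim), dim=-1)

# 1 / 0.07 is roughly 14.29; log(1 / 0.07) is roughly 2.659, the value quoted in the ClapConfig docstring.
logit_scale = 1 / 0.07
logits_per_text = logit_scale * text_embeds @ audio_embeds.t()

# Symmetric loss: classify captions against audios and audios against captions.
loss = (contrastive_loss(logits_per_text) + contrastive_loss(logits_per_text.t())) / 2.0
print(float(loss))
```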
diff --git a/README_ja.md b/README_ja.md index 970b8a194a5a..0a56df93c138 100644 --- a/README_ja.md +++ b/README_ja.md @@ -322,7 +322,6 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne から) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot から公開された研究論文: [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research から) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting から公開された研究論文: [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys から) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou から公開された研究論文: [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) -1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI から) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. から公開された研究論文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI から) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. から公開された研究論文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI から) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever から公開された研究論文: [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen から) Timo Lüddecke and Alexander Ecker から公開された研究論文: [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) diff --git a/README_ko.md b/README_ko.md index 42c99ad44ae0..ee2f4852af89 100644 --- a/README_ko.md +++ b/README_ko.md @@ -237,7 +237,6 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne 에서) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 의 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 논문과 함께 발표했습니다. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research 에서) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 의 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 논문과 함께 발표했습니다. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys 에서) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 의 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 논문과 함께 발표했습니다. -1. 
**[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI 에서 제공)은 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.의 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687)논문과 함께 발표했습니다. 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (LAION-AI 에서 제공)은 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.의 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687)논문과 함께 발표했습니다. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI 에서) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 의 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 논문과 함께 발표했습니다. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen 에서) Timo Lüddecke and Alexander Ecker 의 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index 03fb67b7af75..23543027caba 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -261,7 +261,6 @@ conda install -c huggingface transformers 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (来自 Inria/Facebook/Sorbonne) 伴随论文 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 由 Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 发布。 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (来自 OFA-Sys) 伴随论文 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 由 An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 发布。 -1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (来自 LAION-AI) 伴随论文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) 由 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov 发布。 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (来自 LAION-AI) 伴随论文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) 由 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov 发布。 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 1. 
**[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (来自 University of Göttingen) 伴随论文 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 由 Timo Lüddecke and Alexander Ecker 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 4aeee801b9be..860e6e008bbf 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -273,7 +273,6 @@ conda install -c huggingface transformers 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. -1. **[Clap](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 19c3dfea350a..7b18da09a1df 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -261,7 +261,7 @@ Flax), PyTorch, and/or TensorFlow. 
| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | | Chinese-CLIP | ❌ | ❌ | ✅ | ❌ | ❌ | -| Clap | ❌ | ❌ | ✅ | ❌ | ❌ | +| CLAP | ❌ | ❌ | ✅ | ❌ | ❌ | | CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | | CLIPSeg | ❌ | ❌ | ✅ | ❌ | ❌ | | CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ | diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 3fa1fec59470..64521eb47a75 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -381,7 +381,7 @@ ("camembert", "CamemBERT"), ("canine", "CANINE"), ("chinese_clip", "Chinese-CLIP"), - ("clap", "Clap"), + ("clap", "CLAP"), ("clip", "CLIP"), ("clipseg", "CLIPSeg"), ("codegen", "CodeGen"), diff --git a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py b/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py index 528c48aea2fc..b3b22635f7bd 100644 --- a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py +++ b/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py @@ -71,7 +71,7 @@ def rename_state_dict(state_dict): elif re.match(text_projection_pattern, key): projecton_layer = int(re.match(text_projection_pattern, key).group(1)) - # Because in Clap they use `nn.Sequential`... + # Because in CLAP they use `nn.Sequential`... transformers_projection_layer = 1 if projecton_layer == 0 else 2 key = key.replace(f"_projection.{projecton_layer}.", f"_projection.linear{transformers_projection_layer}.") diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index a43549318821..f224feb50e49 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -153,7 +153,7 @@ def to_dict(self) -> Dict[str, Any]: def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[np.array] = None) -> np.ndarray: """ - Compute the log-Mel spectrogram of the provided `waveform` using the `hanning` window. In Clap, two different + Compute the log-Mel spectrogram of the provided `waveform` using the `hanning` window. In CLAP, two different filter banks are used depending on the truncation pattern: - `self.mel_filters`: they correspond to the defaults parameters of `torchaduio` which can be obtained from calling `torchaudio.transforms.MelSpectrogram().mel_scale.fb`. These filters are used when `truncation` diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 016c8a302352..d41257ad9873 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -52,7 +52,7 @@ ] -# Adapted from: https://github.com/LAION-AI/Clap/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/utils.py#L191 +# Adapted from: https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/utils.py#L191 def interpolate(hidden_states, ratio): """ Interpolate data in time domain. This is used to compensate the resolution reduction in downsampling of a CNN. @@ -69,7 +69,7 @@ def interpolate(hidden_states, ratio): return upsampled -# Adapted from https://github.com/LAION-AI/Clap/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/htsat.py#L249 +# Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/htsat.py#L249 def window_partition(hidden_states, window_size): """ Returns the resized hidden states. 
The output shape should be `(batch_size * num_windows, window_size, window_size, @@ -90,7 +90,7 @@ def window_partition(hidden_states, window_size): return windows -# Adapted from https://github.com/LAION-AI/Clap/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/htsat.py#L263 +# Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/htsat.py#L263 def window_reverse(windows, window_size, height, width): """ Args: @@ -304,10 +304,10 @@ def forward(self, hidden_states): return output -# Adapted from https://github.com/LAION-AI/Clap/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/feature_fusion.py#L133 +# Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/feature_fusion.py#L133 class ClapAudioAFFBlock(nn.Module): r""" - AFF Block from Clap, since in Clap we are always in 2D mode, it is not needed to implement the 1D version. + AFF Block from CLAP, since in CLAP we are always in 2D mode, it is not needed to implement the 1D version. """ def __init__(self, config: ClapAudioConfig): @@ -2140,7 +2140,7 @@ def get_text_features( >>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt") >>> text_features = model.get_text_features(**inputs) ```""" - # Use Clap model's config for some fields (if specified) instead of those of audio & text components. + # Use CLAP model's config for some fields (if specified) instead of those of audio & text components. output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -2228,7 +2228,7 @@ def forward( >>> logits_per_audio = outputs.logits_per_audio # this is the audio-text similarity score >>> probs = logits_per_audio.softmax(dim=-1) # we can take the softmax to get the label probabilities ```""" - # Use Clap model's config for some fields (if specified) instead of those of audio & text components. + # Use CLAP model's config for some fields (if specified) instead of those of audio & text components. output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 20acefaf6fde..b711669dbd4b 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Clap model. """ +""" Testing suite for the PyTorch CLAP model. """ import inspect @@ -161,7 +161,7 @@ def prepare_config_and_inputs_for_common(self): @require_torch class ClapAudioModelTest(ModelTesterMixin, unittest.TestCase): """ - Here we also overwrite some of the tests of test_modeling_common.py, as Clap does not use input_ids, inputs_embeds, + Here we also overwrite some of the tests of test_modeling_common.py, as CLAP does not use input_ids, inputs_embeds, attention_mask and seq_length. 
""" @@ -516,7 +516,7 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_common_attributes(self): pass - # override as the `logit_scale` parameter initilization is different for Clap + # override as the `logit_scale` parameter initilization is different for CLAP def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -554,7 +554,7 @@ def _create_and_check_torchscript(self, config, inputs_dict): try: input_ids = inputs_dict["input_ids"] - input_features = inputs_dict["input_features"] # Clap needs input_features + input_features = inputs_dict["input_features"] # CLAP needs input_features traced_model = torch.jit.trace(model, (input_ids, input_features)) except RuntimeError: self.fail("Couldn't trace module.") From c221e1d34cefeadf0f980c91990affef1958de83 Mon Sep 17 00:00:00 2001 From: "arthur.zucker@gmail.com" Date: Thu, 16 Feb 2023 16:22:38 +0000 Subject: [PATCH 192/197] update --- .../models/clap/convert_clap_original_pytorch_to_hf.py | 2 +- tests/models/clap/test_processor_clap.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py b/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py index b3b22635f7bd..908fef5927af 100644 --- a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py +++ b/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py @@ -35,7 +35,7 @@ "bn0": "batch_norm", } -processor = AutoFeatureExtractor.from_pretrained("ybelkada/clap-htsat-unfused", truncation="rand_trunc") +processor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused", truncation="rand_trunc") def init_clap(checkpoint_path, enable_fusion=False): diff --git a/tests/models/clap/test_processor_clap.py b/tests/models/clap/test_processor_clap.py index 026a5f9189e3..49e9972ea02e 100644 --- a/tests/models/clap/test_processor_clap.py +++ b/tests/models/clap/test_processor_clap.py @@ -26,7 +26,7 @@ @require_sentencepiece class ClapProcessorTest(unittest.TestCase): def setUp(self): - self.checkpoint = "ybelkada/clap-htsat-unfused" + self.checkpoint = "laion/clap-htsat-unfused" self.tmpdirname = tempfile.mkdtemp() def get_tokenizer(self, **kwargs): From 25610ceb50a9e9293df661623c0396028a11ab3d Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 14 Feb 2023 10:32:50 +0000 Subject: [PATCH 193/197] update checkpoint path --- src/transformers/models/clap/modeling_clap.py | 22 +++++++++---------- tests/models/clap/test_modeling_clap.py | 4 ++-- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 63b5fefffc79..f7902069aa6e 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -47,7 +47,7 @@ CLAP_PRETRAINED_MODEL_ARCHIVE_LIST = [ "laion-ai/clap-htsat-fused", - "ybelkada/clap-htsat-unfused", + "laion/clap-htsat-unfused", # See all clap models at https://huggingface.co/models?filter=clap ] @@ -1842,8 +1842,8 @@ def forward( >>> dataset = load_dataset("ashraq/esc50") >>> audio_sample = dataset["train"]["audio"][0]["array"] - >>> model = ClapAudioModel.from_pretrained("ybelkada/clap-htsat-fused") - >>> processor = AutoProcessor.from_pretrained("ybelkada/clap-htsat-fused") + >>> model = ClapAudioModel.from_pretrained("laion/clap-htsat-fused") + >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-fused") >>> inputs = 
processor(audios=audio_sample, return_tensors="pt") @@ -2094,8 +2094,8 @@ def get_text_features( ```python >>> from transformers import AutoTokenizer, ClapModel - >>> model = ClapModel.from_pretrained("ybelkada/clap-htsat-unfused") - >>> tokenizer = AutoTokenizer.from_pretrained("ybelkada/clap-htsat-unfused") + >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused") + >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused") >>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt") >>> text_features = model.get_text_features(**inputs) @@ -2177,8 +2177,8 @@ def forward( >>> dataset = load_dataset("ashraq/esc50") >>> audio_sample = dataset["train"]["audio"][0]["array"] - >>> model = ClapModel.from_pretrained("ybelkada/clap-htsat-unfused") - >>> processor = AutoProcessor.from_pretrained("ybelkada/clap-htsat-unfused") + >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused") + >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-unfused") >>> input_text = ["Sound of a dog", "Sound of vaccum cleaner"] @@ -2289,8 +2289,8 @@ def forward( ```python >>> from transformers import AutoTokenizer, ClapTextModelWithProjection - >>> model = ClapTextModelWithProjection.from_pretrained("ybelkada/clap-htsat-unfused") - >>> tokenizer = AutoTokenizer.from_pretrained("ybelkada/clap-htsat-unfused") + >>> model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused") + >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused") >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") @@ -2365,8 +2365,8 @@ def forward( >>> from datasets import load_dataset >>> from transformers import ClapAudioModelWithProjection, ClapProcessor - >>> model = ClapAudioModelWithProjection.from_pretrained("ybelkada/clap-htsat-fused") - >>> processor = ClapProcessor.from_pretrained("ybelkada/clap-htsat-fused") + >>> model = ClapAudioModelWithProjection.from_pretrained("laion/clap-htsat-fused") + >>> processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused") >>> dataset = load_dataset("ashraq/esc50") >>> audio_sample = dataset["train"]["audio"][0]["array"] diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index b711669dbd4b..cc7a060dc4e2 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -628,7 +628,7 @@ def test_integration_unfused(self): librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") audio_sample = librispeech_dummy[-1] - model_id = "ybelkada/clap-htsat-unfused" + model_id = "laion/clap-htsat-unfused" model = ClapModel.from_pretrained(model_id).to(torch_device) processor = ClapProcessor.from_pretrained(model_id) @@ -655,7 +655,7 @@ def test_integration_fused(self): librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") audio_sample = librispeech_dummy[-1] - model_id = "ybelkada/clap-htsat-fused" + model_id = "laion/clap-htsat-fused" model = ClapModel.from_pretrained(model_id).to(torch_device) processor = ClapProcessor.from_pretrained(model_id) From 6856ff0938ce3ebdc0fac299a325a397b8f66f86 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Thu, 16 Feb 2023 16:13:44 +0100 Subject: [PATCH 194/197] Apply suggestions from code review --- src/transformers/models/clap/feature_extraction_clap.py | 8 ++++---- 1 file changed, 
4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index f224feb50e49..1367591fead5 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -44,15 +44,15 @@ class ClapFeatureExtractor(SequenceFeatureExtractor): feature_size (`int`, defaults to 64): The feature dimension of the extracted Mel spectrograms. This corresponds to the number of mel filters (`n_mels`). - sampling_rate (`int`, defaults to 16000): + sampling_rate (`int`, defaults to 48_000): The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). This only serves to warn users if the audio fed to the feature extractor does not have the same sampling rate. - hop_length (`int`, defaults to 160): + hop_length (`int`, defaults to 480): Length of the overlaping windows for the STFT used to obtain the Mel Spectrogram. The audio will be split in smaller `frames` with a step of `hop_length` between each frame. max_length_s (`int`, defaults to 10): The maximum input lenght of the model in seconds. This is used to pad the audio. - fft_window_size (`int`, defaults to 400): + fft_window_size (`int`, defaults to 1024): Size of the window (in samples) on which the Fourier transform is applied. This controls the frequency resolution of the spectrogram. 400 means that the fourrier transform is computed on windows of 400 samples. padding_value (`float`, *optional*, defaults to 0.0): @@ -73,7 +73,7 @@ class ClapFeatureExtractor(SequenceFeatureExtractor): If `config.fusion` is set to True, shorter audios also need to to return 4 mels, which will just be a copy of the original mel obtained from the padded audio. - `rand_trunc` will select a random crop of the mel spectrogram. - padding (`str`, *optional*): + padding (`str`, *optional*, defaults to `"repeatpad"`): Padding pattern for shorter audio inputs. Three patterns were originally implemented: - `repeatpad`: the audio is repeated, and then padded to fit the `max_length`. - `repeat`: the audio is repeated and then cut to fit the `max_length` From 1ce63636ccace978173eeb790ab1fdbdff2b8431 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 16 Feb 2023 13:47:09 +0000 Subject: [PATCH 195/197] Major refactoring --- .../models/clap/configuration_clap.py | 65 +++-- src/transformers/models/clap/modeling_clap.py | 232 ++++++------------ tests/models/clap/test_modeling_clap.py | 18 +- 3 files changed, 105 insertions(+), 210 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 9ee5d6855d5e..13d1f7b7e059 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -25,17 +25,17 @@ logger = logging.get_logger(__name__) CLAP_PRETRAINED_MODEL_ARCHIVE_LIST = { - "laion-ai/clap-htsat-fused": "https://huggingface.co/laion-ai/clap-htsat-fused/resolve/main/config.json", - "laion-ai/clap-htsat-unfused": "https://huggingface.co/laion-ai/clap-htsat-unfused/resolve/main/config.json", + "laion/clap-htsat-fused": "https://huggingface.co/laion/clap-htsat-fused/resolve/main/config.json", + "laion/clap-htsat-unfused": "https://huggingface.co/laion/clap-htsat-unfused/resolve/main/config.json", } class ClapTextConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`ClapTextModel`]. 
It is used to instantiate a - RoBERTa model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the RoBERTa - [roberta-base](https://huggingface.co/roberta-base) architecture. + This is the configuration class to store the configuration of a [`ClapTextModel`]. It is used to instantiate a CLAP + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the CLAP + [calp-hsat-fused](https://huggingface.co/laion/clap-hsat-fused) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -43,7 +43,7 @@ class ClapTextConfig(PretrainedConfig): Args: vocab_size (`int`, *optional*, defaults to 30522): - Vocabulary size of the RoBERTa model. Defines the number of different tokens that can be represented by the + Vocabulary size of the CLAP model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`ClapTextModel`]. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. @@ -82,13 +82,18 @@ class ClapTextConfig(PretrainedConfig): relevant if `config.is_decoder=True`. classifier_dropout (`float`, *optional*): The dropout ratio for the classification head. + projection_hidden_act (`str`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the projection layer. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + projection_dim (`int`, *optional*, defaults to 512) + Dimension of the projection head of the `ClapTextModelWithProjection`. 
Examples: ```python >>> from transformers import ClapTextConfig, ClapTextModel - >>> # Initializing a RoBERTa configuration + >>> # Initializing a CLAP text configuration >>> configuration = ClapTextConfig() >>> # Initializing a model (with random weights) from the configuration @@ -97,13 +102,12 @@ class ClapTextConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" - model_type = "roberta" + model_type = "clap_text_model" def __init__( self, vocab_size=50265, hidden_size=768, - fusion_hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, @@ -115,20 +119,20 @@ def __init__( initializer_range=0.02, initializer_factor=1.0, layer_norm_eps=1e-12, - projection_hidden_size=768, + projection_dim=512, pad_token_id=1, bos_token_id=0, eos_token_id=2, position_embedding_type="absolute", use_cache=True, classifier_dropout=None, + projection_hidden_act="relu", **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size - self.fusion_hidden_size = fusion_hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.hidden_act = hidden_act @@ -143,7 +147,8 @@ def __init__( self.position_embedding_type = position_embedding_type self.use_cache = use_cache self.classifier_dropout = classifier_dropout - self.projection_hidden_size = projection_hidden_size + self.projection_hidden_act = projection_hidden_act + self.projection_dim = projection_dim @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": @@ -167,7 +172,7 @@ class ClapAudioConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`ClapAudioModel`]. It is used to instantiate a CLAP audio encoder according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the audio encoder of the CLAP - [laion-ai/base](https://huggingface.co/laion-ai/base) architecture. + [laion/clap-htsat-fused](https://huggingface.co/laion/clap-htsat-fused) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -190,10 +195,11 @@ class ClapAudioConfig(PretrainedConfig): Patch stride for the audio spectrogram num_classes (`int`, *optional*, defaults to 527): Number of classes used for the head training - hidden_size (`int`, *optional*, defaults to 96): - Hidden size of the audio model - projection_hidden_size (`int`, *optional*, defaults to 768): - Hidden size of the projection layer + hidden_size (`int`, *optional*, defaults to 768): + Hidden size of the output of the audio encoder. Correspond to the dimension of the penultimate layer's + output,which is sent to the projection MLP layer. + projection_dim (`int`, *optional*, defaults to 512): + Hidden size of the projection layer. 
depths (`list`, *optional*, defaults to `[2, 2, 6, 2]`): Depths used for the Swin Layers of the audio model num_attention_heads (`list`, *optional*, defaults to `[4, 8, 16, 32]`): @@ -210,7 +216,7 @@ class ClapAudioConfig(PretrainedConfig): flatten_patch_embeds (`bool`, *optional*, defaults to `True`): Whether or not to flatten the patch embeddings patch_embeds_hidden_size (`int`, *optional*, defaults to 96): - Hidden size of the patch embeddings + Hidden size of the patch embeddings. It is used as the number of output channels. enable_patch_layer_norm (`bool`, *optional*, defaults to `True`): Whether or not to enable layer normalization for the patch embeddings drop_path_rate (`float`, *optional*, defaults to 0.0): @@ -239,10 +245,10 @@ class ClapAudioConfig(PretrainedConfig): ```python >>> from transformers import ClapAudioConfig, ClapAudioModel - >>> # Initializing a ClapAudioConfig with laion-ai/base style configuration + >>> # Initializing a ClapAudioConfig with laion/clap-htsat-fused style configuration >>> configuration = ClapAudioConfig() - >>> # Initializing a ClapAudioModel (with random weights) from the laion-ai/base style configuration + >>> # Initializing a ClapAudioModel (with random weights) from the laion/clap-htsat-fused style configuration >>> model = ClapAudioModel(configuration) >>> # Accessing the model configuration @@ -260,8 +266,8 @@ def __init__( patch_size=4, patch_stride=[4, 4], num_classes=527, - hidden_size=96, - projection_hidden_size=768, + hidden_size=768, + projection_dim=512, depths=[2, 2, 6, 2], num_attention_heads=[4, 8, 16, 32], enable_fusion=False, @@ -298,7 +304,7 @@ def __init__( self.fusion_type = fusion_type self.hidden_act = hidden_act self.hidden_dropout_prob = hidden_dropout_prob - self.projection_hidden_size = projection_hidden_size + self.projection_dim = projection_dim self.flatten_patch_embeds = flatten_patch_embeds self.patch_embeds_hidden_size = patch_embeds_hidden_size self.enable_patch_layer_norm = enable_patch_layer_norm @@ -334,7 +340,7 @@ class ClapConfig(PretrainedConfig): [`ClapConfig`] is the configuration class to store the configuration of a [`ClapModel`]. It is used to instantiate a CLAP model according to the specified arguments, defining the text model and audio model configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the CLAP - [laion-ai/base](https://huggingface.co/laion-ai/base) architecture. + [laion/clap-htsat-fused](https://huggingface.co/laion/clap-htsat-fused) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -348,10 +354,6 @@ class ClapConfig(PretrainedConfig): Dimentionality of text and audio projection layers. logit_scale_init_value (`float`, *optional*, defaults to 2.6592): The inital value of the *logit_scale* paramter. Default is used as per the original CLAP implementation. - fusion_num_hidden_layers (`int`, *optional*, defaults to 2): - Number of hidden layers in the fusion layer. - projection_dim (`int`, *optional*, defaults to 512): - Dimentionality of text and audio projection layers. projection_hidden_act (`str`, *optional*, defaults to `"relu"`): Activation function for the projection layers. 
initializer_factor (`float`, *optional*, defaults to 1.0): @@ -391,7 +393,6 @@ def __init__( text_config=None, audio_config=None, logit_scale_init_value=(1 / 0.07), - fusion_num_hidden_layers=2, projection_dim=512, projection_hidden_act="relu", initializer_factor=1.0, @@ -409,10 +410,6 @@ def __init__( self.text_config = ClapTextConfig(**text_config) self.audio_config = ClapAudioConfig(**audio_config) - - self.text_config.fusion_num_hidden_layers = fusion_num_hidden_layers - self.audio_config.fusion_num_hidden_layers = fusion_num_hidden_layers - self.text_config.projection_dim = projection_dim self.audio_config.projection_dim = projection_dim diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index f7902069aa6e..8664402299ec 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -43,10 +43,10 @@ logger = logging.get_logger(__name__) -_CHECKPOINT_FOR_DOC = "laion-ai/clap-htsat-fused" +_CHECKPOINT_FOR_DOC = "laion/clap-htsat-fused" CLAP_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "laion-ai/clap-htsat-fused", + "laion/clap-htsat-fused", "laion/clap-htsat-unfused", # See all clap models at https://huggingface.co/models?filter=clap ] @@ -128,7 +128,7 @@ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_l # contrastive loss function, adapted from -# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/Clip.html +# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html#CLIP-loss-function def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: labels = torch.arange(len(logits), device=logits.device) return nn.functional.cross_entropy(logits, labels) @@ -170,54 +170,9 @@ class ClapAudioModelOutput(ModelOutput): ClapAudio model output to mimic the output of the original implementation. Args: - framewise_output (`torch.FloatTensor` of shape `(batch_size, reshaped_hidden_size, num_classes)`): - Output hidden_states that are interpolated after applying sigmoid. These logits are used to compute the the - classification label in the original implementation. - clipwise_output (`torch.FloatTensor` of shape `(batch_size, hidden_size, num_classes)`): - Output hidden_states after applying sigmoid. These logits are used to compute the the classification label - in the original implementation. - fine_grained_embedding (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): - Pooled interpolated hidden_states. - embedding (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. 
-    """
-
-    framewise_output: torch.FloatTensor = None
-    clipwise_output: torch.FloatTensor = None
-    fine_grained_embedding: torch.FloatTensor = None
-    embedding: torch.FloatTensor = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-
-
-@dataclass
-class ClapAudioModelOutputWithProjection(ModelOutput):
-    """
-    ClapAudio model output to mimic the output of the original implementation.
-
-    Args:
-        audio_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
-            The audio embeddings obtained by applying the projection layer to the pooler_output.
-        framewise_output (`torch.FloatTensor` of shape `(batch_size, reshaped_hidden_size, num_classes)`):
-            Output hidden_states that are interpolated after applying sigmoid. These logits are used to compute the the
-            classification label in the original implementation.
-        clipwise_output (`torch.FloatTensor` of shape `(batch_size, hidden_size, num_classes)`):
-            Output hidden_states after applying sigmoid. These logits are used to compute the the classification label
-            in the original implementation.
-        fine_grained_embedding (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
-            Pooled interpolated hidden_states.
-        embedding (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
+        audio_embeds (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
+            The audio embeddings obtained by applying the projection layer to the pooler_output.
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
             Sequence of hidden-states at the output of the last layer of the model.
         attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
             Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
@@ -233,12 +188,9 @@ class ClapAudioModelOutputWithProjection(ModelOutput):
     """
 
     audio_embeds: Optional[torch.FloatTensor] = None
-    framewise_output: torch.FloatTensor = None
-    clipwise_output: torch.FloatTensor = None
-    fine_grained_embedding: torch.FloatTensor = None
-    embedding: torch.FloatTensor = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    last_hidden_state: torch.FloatTensor = None
     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
 
 
 @dataclass
@@ -307,7 +259,8 @@ def forward(self, hidden_states):
 # Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/feature_fusion.py#L133
 class ClapAudioAFFBlock(nn.Module):
     r"""
-    AFF Block from CLAP, since in CLAP we are always in 2D mode, it is not needed to implement the 1D version.
+    Attentional Feature Fusion (AFF) block from CLAP. Since CLAP always operates in 2D mode, the 1D version does not
+    need to be implemented.
""" def __init__(self, config: ClapAudioConfig): @@ -877,7 +830,7 @@ def __init__(self, config): self.spec_size = config.spec_size self.freq_ratio = self.spec_size // config.num_mel_bins - self.num_features = int(config.hidden_size * 2 ** (self.num_layers - 1)) + self.num_features = int(config.patch_embeds_hidden_size * 2 ** (self.num_layers - 1)) self.freq_ratio = config.spec_size // config.num_mel_bins drop_path_rate = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))] @@ -888,7 +841,7 @@ def __init__(self, config): [ ClapAudioStage( config=config, - dim=int(config.hidden_size * 2**i_layer), + dim=int(config.patch_embeds_hidden_size * 2**i_layer), input_resolution=self.input_resolutions[i_layer], depth=config.depths[i_layer], num_heads=config.num_attention_heads[i_layer], @@ -904,19 +857,8 @@ def __init__(self, config): self.batch_norm = nn.BatchNorm2d(config.num_mel_bins) self.norm = nn.LayerNorm(self.num_features) self.depths = config.depths - self.avgpool = nn.AdaptiveAvgPool1d(1) - division_factor = (2 ** (len(config.depths) - 1)) * self.patch_embed.patch_stride[0] * self.freq_ratio - kernel_size = config.spec_size // division_factor - self.tscam_conv = nn.Conv2d( - in_channels=self.num_features, - out_channels=config.num_classes, - kernel_size=(kernel_size, 3), - padding=(0, 1), - ) - self.head = nn.Linear(config.num_classes, config.num_classes) - def reshape_mel2img(self, normalized_input_features): """ The input is 4 normalized log mel spectrograms. It is reshape to the common shape of images. Each channel @@ -1042,60 +984,46 @@ def custom_forward(*inputs): if output_attentions: all_self_attentions += layer_outputs[3:] - hidden_states = self.norm(hidden_states) + last_hidden_state = self.norm(hidden_states) - batch_size, _, n_channels = hidden_states.shape + batch_size, _, n_channels = last_hidden_state.shape freq_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[0] temporal_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[1] - hidden_states = ( - hidden_states.permute(0, 2, 1).contiguous().reshape(batch_size, n_channels, freq_shape, temporal_shape) + last_hidden_state = ( + last_hidden_state.permute(0, 2, 1).contiguous().reshape(batch_size, n_channels, freq_shape, temporal_shape) ) - batch_size, n_channels, n_frequencies, n_temp = hidden_states.shape + batch_size, n_channels, n_frequencies, n_temp = last_hidden_state.shape # group 2D CNN c_freq_bin = n_frequencies // self.freq_ratio - hidden_states = hidden_states.reshape(batch_size, n_channels, n_frequencies // c_freq_bin, c_freq_bin, n_temp) - hidden_states = ( - hidden_states.permute(0, 1, 3, 2, 4).contiguous().reshape(batch_size, n_channels, c_freq_bin, -1) + last_hidden_state = last_hidden_state.reshape( + batch_size, n_channels, n_frequencies // c_freq_bin, c_freq_bin, n_temp ) - # get latent_output - fine_grained_latent_output = torch.mean(hidden_states, dim=2) - fine_grained_latent_output = interpolate( - fine_grained_latent_output.permute(0, 2, 1).contiguous(), 8 * self.patch_stride[1] + last_hidden_state = ( + last_hidden_state.permute(0, 1, 3, 2, 4).contiguous().reshape(batch_size, n_channels, c_freq_bin, -1) ) - - latent_output = self.avgpool(torch.flatten(hidden_states, 2)) + latent_output = self.avgpool(torch.flatten(last_hidden_state, 2)) latent_output = torch.flatten(latent_output, 1) - hidden_states = self.tscam_conv(hidden_states) - hidden_states = torch.flatten(hidden_states, 2) # B, C, T - - framewise_output = interpolate( - 
torch.sigmoid(hidden_states).permute(0, 2, 1).contiguous(), 8 * self.patch_stride[1] - ) - - hidden_states = self.avgpool(hidden_states) - hidden_states = torch.flatten(hidden_states, 1) - if not return_dict: - return ( - framewise_output, - torch.sigmoid(hidden_states), - fine_grained_latent_output, - latent_output, - all_self_attentions, - all_reshaped_hidden_states, + return tuple( + v + for v in [ + last_hidden_state, + latent_output, + all_reshaped_hidden_states, + all_self_attentions, + ] + if v is not None ) - return ClapAudioModelOutput( - framewise_output=framewise_output, - clipwise_output=torch.sigmoid(hidden_states), - fine_grained_embedding=fine_grained_latent_output, - embedding=latent_output, - attentions=all_self_attentions, + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=latent_output, hidden_states=all_reshaped_hidden_states, + attentions=all_self_attentions, ) @@ -1202,28 +1130,11 @@ def custom_forward(*inputs): """ -class ClapFusionBlock(nn.Module): - def __init__(self, config: ClapTextConfig): - super().__init__() - self.config = config - hidden_size = config.projection_dim - self.activation = ACT2FN[config.hidden_act] - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - self.linear = nn.Linear(hidden_size, hidden_size) - - def forward(self, hidden_states): - hidden_states = self.linear(hidden_states) - hidden_states = self.activation(hidden_states) - hidden_states = self.dropout(hidden_states) - return hidden_states - - class ClapProjectionLayer(nn.Module): - def __init__(self, config: ClapAudioConfig): + def __init__(self, config: Union[ClapAudioConfig, ClapTextConfig]): super().__init__() self.config = config - hidden_size = config.projection_hidden_size + hidden_size = config.hidden_size projection_dim = config.projection_dim self.linear1 = nn.Linear(hidden_size, projection_dim) @@ -1237,20 +1148,7 @@ def forward(self, hidden_states): return hidden_states -class ClapFusionLayer(nn.Module): - def __init__(self, config: ClapTextConfig): - super().__init__() - self.config = config - - self.layers = nn.ModuleList([ClapFusionBlock(config) for _ in range(config.fusion_num_hidden_layers)]) - - def forward(self, hidden_states): - for layer in self.layers: - hidden_states = layer(hidden_states) - return hidden_states - - -# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->ClapText +# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->ClapText, persistent=False->persistent=True class ClapTextEmbeddings(nn.Module): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. 
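With the fusion blocks removed, both modalities are funneled through the same `ClapProjectionLayer`, which maps a pooled encoder output (`config.hidden_size`) into the shared contrastive space (`config.projection_dim`). A standalone sketch of that shape flow follows; the second linear layer and the `ReLU` standing in for `ACT2FN[config.projection_hidden_act]` are assumptions based on the surrounding context rather than lines shown in the hunk above:

```python
import torch
from torch import nn


class ToyProjectionLayer(nn.Module):
    """Maps a pooled encoder output (hidden_size) to the shared embedding space (projection_dim)."""

    def __init__(self, hidden_size: int = 768, projection_dim: int = 512):
        super().__init__()
        self.linear1 = nn.Linear(hidden_size, projection_dim)
        self.activation = nn.ReLU()  # assumed: projection_hidden_act="relu"
        self.linear2 = nn.Linear(projection_dim, projection_dim)

    def forward(self, pooled_output: torch.Tensor) -> torch.Tensor:
        hidden_states = self.linear1(pooled_output)
        hidden_states = self.activation(hidden_states)
        return self.linear2(hidden_states)


pooled = torch.randn(2, 768)  # e.g. a batch of pooler_output vectors from either encoder
print(ToyProjectionLayer()(pooled).shape)  # torch.Size([2, 512])
```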
@@ -1271,7 +1169,7 @@ def __init__(self, config):
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
         self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
         self.register_buffer(
-            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=True
         )
 
         # End copy
@@ -1848,7 +1746,7 @@ def forward(
         >>> inputs = processor(audios=audio_sample, return_tensors="pt")
 
         >>> outputs = model(**inputs)
-        >>> last_hidden_state = outputs.embedding
+        >>> last_hidden_state = outputs.last_hidden_state
         ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -2037,6 +1935,7 @@ def forward(
 @add_start_docstrings(CLAP_START_DOCSTRING)
 class ClapModel(ClapPreTrainedModel):
     config_class = ClapConfig
+    _keys_to_ignore_on_load_missing = [r"position_ids"]
 
     def __init__(self, config: ClapConfig):
         super().__init__(config)
@@ -2060,15 +1959,11 @@ def __init__(self, config: ClapConfig):
         self.logit_scale_t = nn.Parameter(torch.ones([]) * np.log(config.logit_scale_init_value))
 
         self.projection_dim = config.projection_dim
-        self.text_hidden_size = text_config.hidden_size
-        self.audio_hidden_size = audio_config.hidden_size
 
         self.text_model = ClapTextModel(text_config)
-        self.text_transform = ClapFusionLayer(text_config)
         self.text_projection = ClapProjectionLayer(text_config)
-        self.audio_model = ClapAudioModel(config=audio_config)
-        self.audio_transform = ClapFusionLayer(audio_config)
+        self.audio_model = ClapAudioModel(audio_config)
         self.audio_projection = ClapProjectionLayer(audio_config)
 
         # Initialize weights and apply final processing
@@ -2116,7 +2011,7 @@ def get_text_features(
             return_dict=return_dict,
         )
 
-        pooled_output = text_outputs[1]
+        pooled_output = text_outputs[1] if return_dict is not None else text_outputs.pooler_output
 
         text_features = self.text_projection(pooled_output)
         text_features = F.normalize(text_features, dim=-1)
@@ -2132,7 +2027,23 @@ def get_audio_features(
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
     ) -> torch.FloatTensor:
-        r""" """
+        r"""
+        Returns:
+            audio_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The audio embeddings obtained by
+            applying the projection layer to the pooled output of [`ClapAudioModel`].
+ + Examples: + + ```python + >>> from transformers import AutoFeatureExtractor, ClapModel + >>> import torch + + >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused") + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused") + >>> random_audio = torch.rand((16_000)) + >>> inputs = feature_extractor(random_audio, return_tensors="pt") + >>> audio_features = model.get_audio_features(**inputs) + ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -2145,7 +2056,7 @@ def get_audio_features( return_dict=return_dict, ) - pooled_output = audio_outputs[-1] if not return_dict else audio_outputs.embedding + pooled_output = audio_outputs[1] if not return_dict else audio_outputs.pooler_output audio_features = self.audio_projection(pooled_output) audio_features = F.normalize(audio_features, dim=-1) @@ -2211,10 +2122,10 @@ def forward( return_dict=return_dict, ) - audio_embeds = audio_outputs[-3] if not return_dict else audio_outputs.embedding + audio_embeds = audio_outputs[1] if not return_dict else audio_outputs.pooler_output audio_embeds = self.audio_projection(audio_embeds) - text_embeds = text_outputs[1] + text_embeds = text_outputs[1] if not return_dict else text_outputs.pooler_output text_embeds = self.text_projection(text_embeds) # normalized features @@ -2308,7 +2219,7 @@ def forward( return_dict=return_dict, ) - pooled_output = text_outputs[1] + pooled_output = text_outputs[1] if not return_dict else text_outputs.pooler_output text_embeds = self.text_projection(pooled_output) @@ -2336,9 +2247,7 @@ class ClapAudioModelWithProjection(ClapPreTrainedModel): def __init__(self, config: ClapAudioConfig): super().__init__(config) - self.audio_model = ClapAudioModel(config) - self.audio_projection = ClapProjectionLayer(config) # Initialize weights and apply final processing self.post_init() @@ -2389,20 +2298,17 @@ def forward( return_dict=return_dict, ) - pooled_output = audio_outputs[-3] if not return_dict else audio_outputs.embedding + pooled_output = audio_outputs[1] if not return_dict else audio_outputs.pooler_output audio_embeds = self.audio_projection(pooled_output) if not return_dict: - outputs = (audio_embeds, *audio_outputs) - return outputs + outputs = (audio_embeds, audio_outputs[0]) + audio_outputs[2:] + return tuple(output for output in outputs if output is not None) - return ClapAudioModelOutputWithProjection( + return ClapAudioModelOutput( audio_embeds=audio_embeds, - framewise_output=audio_outputs.framewise_output, - clipwise_output=audio_outputs.clipwise_output, - fine_grained_embedding=audio_outputs.fine_grained_embedding, - embedding=audio_outputs.embedding, + last_hidden_state=audio_outputs.last_hidden_state, attentions=audio_outputs.attentions, hidden_states=audio_outputs.hidden_states, ) diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index cc7a060dc4e2..d34611a22237 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -66,9 +66,8 @@ def __init__( freq_ratio=2, num_channels=3, is_training=True, - hidden_size=32, + hidden_size=256, patch_embeds_hidden_size=32, - projection_hidden_size=256, projection_dim=32, num_hidden_layers=4, num_heads=[2, 2, 2, 2], @@ -91,7 +90,6 @@ def __init__( self.num_hidden_layers = num_hidden_layers self.num_heads = num_heads 
self.num_attention_heads = num_heads[0] - self.projection_hidden_size = projection_hidden_size self.seq_length = seq_length self.spec_size = spec_size self.freq_ratio = freq_ratio @@ -128,7 +126,6 @@ def get_config(self): spec_size=self.spec_size, freq_ratio=self.freq_ratio, patch_embeds_hidden_size=self.patch_embeds_hidden_size, - projection_hidden_size=self.projection_hidden_size, ) def create_and_check_model(self, config, input_features): @@ -137,11 +134,7 @@ def create_and_check_model(self, config, input_features): model.eval() with torch.no_grad(): result = model(input_features) - # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) - embedding_shape = self.hidden_size * self.window_size * self.freq_ratio - self.parent.assertEqual( - result.fine_grained_embedding.shape, (self.batch_size, embedding_shape, embedding_shape) - ) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) def create_and_check_model_with_projection(self, config, input_features): model = ClapAudioModelWithProjection(config=config) @@ -149,7 +142,7 @@ def create_and_check_model_with_projection(self, config, input_features): model.eval() with torch.no_grad(): result = model(input_features) - self.parent.assertEqual(result.audio_embeds.shape, (self.batch_size, self.hidden_size)) + self.parent.assertEqual(result.audio_embeds.shape, (self.batch_size, self.projection_dim)) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() @@ -200,7 +193,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): with torch.no_grad(): outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + hidden_states = outputs.hidden_states expected_num_layers = getattr( self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 @@ -209,7 +202,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): self.assertListEqual( list(hidden_states[0].shape[-2:]), - [self.model_tester.hidden_size, self.model_tester.hidden_size], + [self.model_tester.patch_embeds_hidden_size, self.model_tester.patch_embeds_hidden_size], ) config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -341,7 +334,6 @@ def get_config(self): return ClapTextConfig( vocab_size=self.vocab_size, hidden_size=self.hidden_size, - projection_hidden_size=self.hidden_size, projection_dim=self.projection_dim, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, From a8dc9a49fd1c618587af6ec67dbead8dc22ef307 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Wed, 15 Feb 2023 13:15:43 +0100 Subject: [PATCH 196/197] Update src/transformers/models/clap/configuration_clap.py --- src/transformers/models/clap/configuration_clap.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 13d1f7b7e059..9de345eaa0be 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -26,7 +26,11 @@ CLAP_PRETRAINED_MODEL_ARCHIVE_LIST = { "laion/clap-htsat-fused": "https://huggingface.co/laion/clap-htsat-fused/resolve/main/config.json", +<<<<<<< HEAD "laion/clap-htsat-unfused": "https://huggingface.co/laion/clap-htsat-unfused/resolve/main/config.json", +======= + 
"laion-ai/clap-htsat-unfused": "https://huggingface.co/laion-ai/clap-htsat-unfused/resolve/main/config.json", +>>>>>>> b1e75d9e3... Update src/transformers/models/clap/configuration_clap.py } From 9b5b252cc59f6edf41ed2c6400c49e8e2f130eb6 Mon Sep 17 00:00:00 2001 From: "arthur.zucker@gmail.com" Date: Thu, 16 Feb 2023 16:26:16 +0000 Subject: [PATCH 197/197] merge --- src/transformers/models/clap/configuration_clap.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 9de345eaa0be..13d1f7b7e059 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -26,11 +26,7 @@ CLAP_PRETRAINED_MODEL_ARCHIVE_LIST = { "laion/clap-htsat-fused": "https://huggingface.co/laion/clap-htsat-fused/resolve/main/config.json", -<<<<<<< HEAD "laion/clap-htsat-unfused": "https://huggingface.co/laion/clap-htsat-unfused/resolve/main/config.json", -======= - "laion-ai/clap-htsat-unfused": "https://huggingface.co/laion-ai/clap-htsat-unfused/resolve/main/config.json", ->>>>>>> b1e75d9e3... Update src/transformers/models/clap/configuration_clap.py }