127 commits
7b502bb
add working vocos
Manalelaidouni Jul 14, 2025
30a17e7
update vocos
Manalelaidouni Jul 16, 2025
33a715e
refactor vocos head
Manalelaidouni Jul 19, 2025
f6026d9
fix docstring
Manalelaidouni Jul 19, 2025
7fa04e0
nit
Manalelaidouni Jul 19, 2025
910c500
Merge branch 'huggingface:main' into add-vocos-model
Manalelaidouni Jul 22, 2025
09cf23f
fix output mismatch
Manalelaidouni Jul 22, 2025
d909e64
update checkpoint conversions
Manalelaidouni Jul 24, 2025
6709a97
add working vocos
Manalelaidouni Jul 14, 2025
01913f5
update vocos
Manalelaidouni Jul 16, 2025
c42d8b9
refactor vocos head
Manalelaidouni Jul 19, 2025
b987b13
fix docstring
Manalelaidouni Jul 19, 2025
ea73b18
nit
Manalelaidouni Jul 19, 2025
1fab4f5
fix output mismatch
Manalelaidouni Jul 22, 2025
e81574a
update checkpoint conversions
Manalelaidouni Jul 24, 2025
324b7c7
Merge branch 'main' into add-vocos-model
Manalelaidouni Aug 11, 2025
3f469f7
Merge branch 'add-vocos-model' of https://github.com/Manalelaidouni/t…
Manalelaidouni Aug 11, 2025
35d7545
fix adaptive layer norm
Manalelaidouni Aug 12, 2025
5559fd1
fix conflict
Manalelaidouni Aug 21, 2025
229464a
fix auto
Manalelaidouni Aug 22, 2025
3f123cb
nit
Manalelaidouni Aug 22, 2025
b0fc62d
Merge branch 'huggingface:main' into add-vocos-model
Manalelaidouni Sep 8, 2025
c15786d
add VocosProcessor and refactor
Manalelaidouni Sep 11, 2025
8373fd7
nit
Manalelaidouni Sep 11, 2025
a869548
clean up
Manalelaidouni Sep 11, 2025
e21abe1
Update docs.
ebezzam Sep 12, 2025
643d41d
add bark example to docs
Manalelaidouni Sep 15, 2025
30f2a66
make fixture file shorter
Manalelaidouni Sep 15, 2025
208cc5c
Update docs.
ebezzam Sep 15, 2025
191f2a7
Fix tensor device for tests.
ebezzam Sep 15, 2025
688e915
Nits
ebezzam Sep 15, 2025
948a4c0
recreate fixtures
Manalelaidouni Sep 19, 2025
2f7db93
add torch backend + batching support
Manalelaidouni Sep 19, 2025
93c3fde
add tests for both numpy and torch backends
Manalelaidouni Sep 19, 2025
e93807a
add batch integration test for mel and encodec
Manalelaidouni Sep 19, 2025
e6e0820
make tests pass
Manalelaidouni Sep 19, 2025
33bddda
update fixtures and tests
Manalelaidouni Sep 20, 2025
d9b2017
Merge branch 'main' into add-vocos-model
Manalelaidouni Sep 20, 2025
5ad61b9
add torchaudio backend + edit batching tests
Manalelaidouni Sep 27, 2025
a8bbd0a
nits
Manalelaidouni Sep 27, 2025
e3c5ae1
update test and fixtures
Manalelaidouni Sep 28, 2025
ad3fdc1
Merge remote-tracking branch 'upstream/main' into add-vocos-model
Manalelaidouni Sep 28, 2025
0b2305a
cleanup
Manalelaidouni Sep 29, 2025
ced9a22
update feature extractor
Manalelaidouni Sep 29, 2025
f5e6463
Merge remote-tracking branch 'upstream/main' into add-vocos-model
Manalelaidouni Oct 3, 2025
cf4c993
Fix small typo.
ebezzam Oct 3, 2025
7e784ee
Small fixes for passing integration tests.
ebezzam Oct 3, 2025
22e7488
More Transformers compatible version, with naming and more amenable t…
ebezzam Oct 3, 2025
c79686b
refactor processor
Manalelaidouni Oct 5, 2025
6746a14
update feature extractor torchaudio + spectogram_batch
Manalelaidouni Oct 5, 2025
388235a
nits
Manalelaidouni Oct 5, 2025
f7e1ce1
edit skipped tests reason
Manalelaidouni Oct 5, 2025
cd001a1
Merge branch 'main' into add-vocos-model
Manalelaidouni Oct 5, 2025
a576daf
Nits.
ebezzam Oct 6, 2025
5d72339
Updated expected outputs.
ebezzam Oct 6, 2025
98c10d2
Add slow decorator.
ebezzam Oct 6, 2025
86d5ee3
Fix import.
ebezzam Oct 6, 2025
8fca95f
Format.
ebezzam Oct 6, 2025
704828d
Move slow decorators to methods.
ebezzam Oct 6, 2025
b77ac67
Simplify feature extraction and more intuitive names.
ebezzam Oct 6, 2025
2dde484
Merge branch 'main' into add-vocos-model
Manalelaidouni Oct 6, 2025
07b1b34
make original vs hf feature extractor match on gpu
Manalelaidouni Oct 6, 2025
62f1cc0
Merge branch 'add-vocos-model' of github.com:Manalelaidouni/transform…
ebezzam Oct 7, 2025
65cb11f
Simplify to just torch support.
ebezzam Oct 7, 2025
e1a1537
Standardize model inputs.
ebezzam Oct 7, 2025
88a6d89
Update docs
ebezzam Oct 7, 2025
6d2a5c1
Merge branch 'main' into add-vocos-model
ebezzam Oct 7, 2025
6c61575
Merge branch 'main' into add-vocos-model
Manalelaidouni Oct 8, 2025
51117cb
clean up
Manalelaidouni Oct 8, 2025
e6e486f
Merge branch 'main' into add-vocos-model
Manalelaidouni Oct 9, 2025
6c1cac7
Add gpu decorator.
ebezzam Oct 9, 2025
57502fc
undo warning
Manalelaidouni Oct 9, 2025
e3db2fb
nits
Manalelaidouni Oct 9, 2025
70e8f7e
Add pad_to_multiple_of and use corresponding hop_length.
ebezzam Oct 9, 2025
d0c7306
Merge branch 'main' into add-vocos-model
Manalelaidouni Oct 10, 2025
68ef6e1
pad only batch
Manalelaidouni Oct 10, 2025
0a03739
reproduce fixtures
Manalelaidouni Oct 10, 2025
8007e8a
minor correction
Manalelaidouni Oct 10, 2025
e8917f9
Address comments.
ebezzam Oct 10, 2025
4afdef3
Reintroduce slow/gpu decorators.
ebezzam Oct 10, 2025
532b33d
change to old fixtures
Manalelaidouni Oct 10, 2025
7c94471
New istft utils and nits.
ebezzam Oct 10, 2025
84bbfc2
Update docs/source/en/model_doc/vocos.md
ebezzam Oct 21, 2025
037a67e
Update src/transformers/models/vocos/configuration_vocos.py
ebezzam Oct 21, 2025
6ae6d20
Flatten backbone and nits.
ebezzam Oct 21, 2025
b5e9fa5
Update mel conversion for flattening.
ebezzam Oct 21, 2025
34e5848
Cleaner mel vocos model.
ebezzam Oct 21, 2025
ec04346
Base simpler Vocos with encodec.
ebezzam Oct 21, 2025
e9c4635
Update convert for encodec variant and integration tests.
ebezzam Oct 21, 2025
90aace1
Nits
ebezzam Oct 21, 2025
d2a51c8
Nits before modular.
ebezzam Oct 21, 2025
db96400
From modular.
ebezzam Oct 21, 2025
5f13298
Revert to input_features.
ebezzam Oct 21, 2025
4a5c2f1
Processor only for encodec variant.
ebezzam Oct 21, 2025
82d1d46
Make style
ebezzam Oct 21, 2025
b342d4f
Add codebook weights to Encodec model.
ebezzam Oct 22, 2025
5490a72
Update modular and nits.
ebezzam Oct 22, 2025
f1a6459
Make style
ebezzam Oct 22, 2025
5300652
Merge upstream/main into add-vocos-model
Manalelaidouni Dec 6, 2025
6dec301
Merge branch 'add-vocos-model' of https://github.com/Manalelaidouni/t…
Manalelaidouni Dec 7, 2025
20fe380
update feature extractor
Manalelaidouni Jan 11, 2026
becbec8
update vocos modeling
Manalelaidouni Jan 11, 2026
5ecd7ec
update vocos encodec modeling
Manalelaidouni Jan 11, 2026
8766aec
correct config
Manalelaidouni Jan 11, 2026
72b7f7a
update fixtures
Manalelaidouni Jan 11, 2026
0b6cf12
update and add pad test to feature extractor
Manalelaidouni Jan 11, 2026
e30a043
update model tests
Manalelaidouni Jan 11, 2026
17f7a61
add processor tests
Manalelaidouni Jan 11, 2026
decf1d3
update processor
Manalelaidouni Jan 11, 2026
4dbd064
nits
Manalelaidouni Jan 11, 2026
d764c62
update docs
Manalelaidouni Jan 11, 2026
4174445
allow EncodecModel as audio_tokenizer
Manalelaidouni Jan 11, 2026
c5012cb
Merge remote-tracking branch 'upstream/main' into add-vocos-model
Manalelaidouni Jan 12, 2026
ea16ce0
correct weight initialization
Manalelaidouni Jan 13, 2026
56c4a0c
fix modeling + feature extractor tests
Manalelaidouni Jan 13, 2026
4c410ba
update modular
Manalelaidouni Jan 14, 2026
5a86436
ruff styling
Manalelaidouni Jan 14, 2026
bd47278
update modular 2
Manalelaidouni Jan 15, 2026
b7bac40
nits
Manalelaidouni Jan 15, 2026
d55b0f1
update auto mapping + tests
Manalelaidouni Jan 20, 2026
0150a11
add codebook_weights buffer to initialization
Manalelaidouni Jan 20, 2026
c504d3b
allow unused config attribute
Manalelaidouni Jan 20, 2026
4cc6166
Merge branch 'main' into add-vocos-model
Manalelaidouni Jan 22, 2026
4b0aec4
skip training tests
Manalelaidouni Jan 23, 2026
39af5a1
update docs
Manalelaidouni Jan 23, 2026
3e8df70
add test decorators
Manalelaidouni Jan 23, 2026
373a5d2
Merge branch 'main' into add-vocos-model
Manalelaidouni Jan 26, 2026
4 changes: 4 additions & 0 deletions docs/source/en/_toctree.yml
@@ -987,6 +987,10 @@
title: UnivNet
- local: model_doc/vits
title: VITS
- local: model_doc/vocos
title: Vocos
- local: model_doc/vocos_encodec
title: VocosEncodec
- local: model_doc/wav2vec2
title: Wav2Vec2
- local: model_doc/wav2vec2-bert
113 changes: 113 additions & 0 deletions docs/source/en/model_doc/vocos.md
@@ -0,0 +1,113 @@
<!--Copyright 2026 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->
*This model was released on 2023-06-01 and added to Hugging Face Transformers on 2026-01-23.*

# Vocos

<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>

## Overview

The Vocos model was proposed in [**Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis**](https://huggingface.co/papers/2306.00814) by Hubert Siuzdak.

Vocos is a GAN-based neural vocoder designed for high-quality audio synthesis in text-to-speech (TTS) pipelines and related tasks. Traditional time-domain vocoders rely on transposed convolutions for upsampling, which degrades temporal resolution across layers and introduces aliasing artifacts into the synthesized speech.
Instead, Vocos represents audio signals in the time-frequency domain: it is trained to predict the complex Short-Time Fourier Transform (STFT) coefficients, magnitude and phase, and uses the computationally efficient inverse STFT (ISTFT) for upsampling, which maintains the same temporal resolution throughout the network and converts directly to speech waveforms.

Vocos matches the audio quality of time-domain vocoders while achieving roughly 30× faster inference on CPU, and it outperforms HiFi-GAN in both ViSQOL and PESQ scores.
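To make this concrete, below is a minimal, self-contained sketch of the ISTFT-based upsampling idea, not the actual `VocosModel` implementation; the backbone output, linear head, and STFT parameters (`n_fft=1024`, `hop_length=256`) are illustrative assumptions:

```python
import torch

batch_size, num_frames, hidden_size = 1, 550, 512
n_fft, hop_length = 1024, 256
freq_bins = n_fft // 2 + 1

# stand-in for the backbone output: one hidden vector per spectrogram frame
hidden_states = torch.randn(batch_size, num_frames, hidden_size)

# linear head predicting a log-magnitude and a phase for every frequency bin
head = torch.nn.Linear(hidden_size, 2 * freq_bins)
log_magnitude, phase = head(hidden_states).chunk(2, dim=-1)

# combine magnitude and phase into complex STFT coefficients
spectrogram = torch.exp(log_magnitude).clamp(max=1e2) * torch.exp(1j * phase)

# the ISTFT is the only upsampling step; no transposed convolutions are involved
waveform = torch.istft(
    spectrogram.transpose(1, 2),  # (batch_size, freq_bins, num_frames)
    n_fft=n_fft,
    hop_length=hop_length,
    win_length=n_fft,
    window=torch.hann_window(n_fft),
    center=True,
)

print(waveform.shape)  # torch.Size([1, 140544]) == (num_frames - 1) * hop_length
```

Every layer before the ISTFT operates at the spectrogram frame rate, which is what preserves temporal resolution throughout the network.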



The abstract of the paper states the following:

*Recent advancements in neural vocoding are predominantly driven by Generative Adversarial Networks (GANs) operating in the time-domain. While effective, this approach neglects the inductive bias offered by time-frequency representations, resulting in redundant and computationally-intensive upsampling operations. Fourier-based time-frequency representation is an appealing alternative, aligning more accurately with human auditory perception, and benefitting from well-established fast algorithms for its computation. Nevertheless, direct reconstruction of complex-valued spectrograms has been historically problematic, primarily due to phase recovery issues. This study seeks to close this gap by presenting Vocos, a new model that directly generates Fourier spectral coefficients. Vocos not only matches the state-of-the-art in audio quality, as demonstrated in our evaluations, but it also substantially improves computational efficiency, achieving an order of magnitude increase in speed compared to prevailing time-domain neural vocoding approaches.*


Vocos is available in two variants:

- `VocosModel`: the mel-spectrogram based vocoder documented on this page.

- `VocosEncodecModel`: the EnCodec-based vocoder, documented [here](https://huggingface.co/docs/transformers/model_doc/vocos_encodec).



You can find audio demos on this [project page](https://gemelo-ai.github.io/vocos/). The original implementation can be found [here](https://github.com/gemelo-ai/vocos) and the original checkpoint is available [here](https://huggingface.co/charactr/vocos-mel-24khz).

This model was contributed by [Manal El Aidouni](https://huggingface.co/Manel) and [Eric Bezzam](https://huggingface.co/bezzam).

## Usage


You can extract mel-spectrogram features from an audio waveform with `VocosFeatureExtractor` and feed them into `VocosModel` to generate high-quality audio. You can also use `VocosModel` as a standalone vocoder component within a larger audio generation pipeline (for example, the [YuE](https://github.com/multimodal-art-projection/YuE) model).


```python
from datasets import load_dataset, Audio
from transformers import VocosFeatureExtractor, VocosModel
from scipy.io.wavfile import write as write_wav

# load model and feature extractor
model_id = "hf-audio/vocos-mel-24khz"
feature_extractor = VocosFeatureExtractor.from_pretrained(model_id)
model = VocosModel.from_pretrained(model_id, device_map="auto")
sampling_rate = feature_extractor.sampling_rate

# load audio sample
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.cast_column("audio", Audio(sampling_rate=sampling_rate))
audio = ds[0]["audio"]["array"]

inputs = feature_extractor(audio=audio, sampling_rate=sampling_rate, return_tensors="pt").to(model.device)

print(inputs.input_features.shape) # (batch_size, num_mel_bins, num_frames) [1, 100, 550]

outputs = model(**inputs)

audio = outputs.audio

print(audio.shape) # (batch_size, time) [1, 140544]

# save audio to file
write_wav("vocos.wav", sampling_rate, audio[0].detach().cpu().numpy())
```

When processing multiple audio files in a batch, you can remove padding from the reconstructed audio using the `attention_mask` returned in the output:


```python
# audio1 and audio2 are 1D waveforms sampled at the feature extractor's sampling rate
inputs = feature_extractor(audio=[audio1, audio2], sampling_rate=sampling_rate, return_tensors="pt").to(model.device)

outputs = model(**inputs)

reconstructed_audio, attention_mask = outputs.audio, outputs.attention_mask

unpadded_audios = [
    reconstructed_audio[i][attention_mask[i].bool()].detach().cpu().numpy()
    for i in range(reconstructed_audio.shape[0])
]
```

## VocosConfig

[[autodoc]] VocosConfig

## VocosFeatureExtractor

[[autodoc]] VocosFeatureExtractor

## VocosModel

[[autodoc]] VocosModel
- forward
182 changes: 182 additions & 0 deletions docs/source/en/model_doc/vocos_encodec.md
@@ -0,0 +1,182 @@
<!--Copyright 2026 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->
*This model was released on 2023-06-01 and added to Hugging Face Transformers on 2026-01-23.*

# VocosEncodec

<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>

## Overview

The VocosEncodec model is the EnCodec variant of the Vocos model that was proposed in [**Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis**](https://huggingface.co/papers/2306.00814) by Hubert Siuzdak.

Vocos is a GAN-based neural vocoder designed for high-quality audio synthesis in text-to-speech (TTS) pipelines and related tasks. Traditional time-domain vocoders rely on transposed convolutions for upsampling, which degrades temporal resolution across layers and introduces aliasing artifacts into the synthesized speech.
Instead, Vocos represents audio signals in the time-frequency domain: it is trained to predict the complex Short-Time Fourier Transform (STFT) coefficients, magnitude and phase, and uses the computationally efficient inverse STFT (ISTFT) for upsampling, which maintains the same temporal resolution throughout the network and converts directly to speech waveforms.

Vocos matches the audio quality of time-domain vocoders while achieving roughly 30× faster inference on CPU, and it outperforms HiFi-GAN in both ViSQOL and PESQ scores.



The abstract of the paper states the following:

*Recent advancements in neural vocoding are predominantly driven by Generative Adversarial Networks (GANs) operating in the time-domain. While effective, this approach neglects the inductive bias offered by time-frequency representations, resulting in redundant and computationally-intensive upsampling operations. Fourier-based time-frequency representation is an appealing alternative, aligning more accurately with human auditory perception, and benefitting from well-established fast algorithms for its computation. Nevertheless, direct reconstruction of complex-valued spectrograms has been historically problematic, primarily due to phase recovery issues. This study seeks to close this gap by presenting Vocos, a new model that directly generates Fourier spectral coefficients. Vocos not only matches the state-of-the-art in audio quality, as demonstrated in our evaluations, but it also substantially improves computational efficiency, achieving an order of magnitude increase in speed compared to prevailing time-domain neural vocoding approaches.*


Vocos is available in two variants:

- `VocosModel`: the mel-spectrogram based vocoder, documented [here](https://huggingface.co/docs/transformers/model_doc/vocos).

- `VocosEncodecModel`: the EnCodec-based vocoder documented on this page.


You can find audio demos on this [project page](https://gemelo-ai.github.io/vocos/). The original code can be found [here](https://github.com/gemelo-ai/vocos) and the original checkpoint is available [here](https://huggingface.co/charactr/vocos-encodec-24khz).

This model was contributed by [Manal El Aidouni](https://huggingface.co/Manel) and [Eric Bezzam](https://huggingface.co/bezzam).




## Usage

Recent work has increasingly adopted learned neural audio codec features. Vocos supports [EnCodec](https://huggingface.co/docs/transformers/main/en/model_doc/encodec)-based reconstruction for high-quality audio generation through `VocosEncodecProcessor`: the EnCodec neural audio codec encodes the input audio into discrete tokens using Residual Vector Quantization (RVQ), and these codes are then converted into embeddings that serve as input to `VocosEncodecModel`.

A target `bandwidth` value is required by `VocosEncodecProcessor`. The supported bandwidths are [1.5, 3, 6, 12] kbps, which correspond to [2, 4, 8, 16] quantizers/codebooks used by EnCodec's RVQ, respectively.
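As a quick sanity check, this mapping follows from EnCodec's 24 kHz configuration of 75 frames per second with 10-bit codebooks, i.e. 0.75 kbps per codebook; the helper below is an illustrative sketch, not part of the Transformers API:

```python
# Illustrative sketch: assumes EnCodec 24 kHz (75 frames/s, 10 bits per code)
def num_quantizers(bandwidth_kbps: float, frame_rate: int = 75, bits_per_code: int = 10) -> int:
    kbps_per_codebook = frame_rate * bits_per_code / 1000  # 0.75 kbps per codebook
    return int(bandwidth_kbps / kbps_per_codebook)

print([num_quantizers(bw) for bw in (1.5, 3.0, 6.0, 12.0)])  # [2, 4, 8, 16]
```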

```python
from datasets import load_dataset, Audio
from transformers import VocosEncodecModel, VocosEncodecProcessor
from scipy.io.wavfile import write as write_wav

bandwidth = 6.0

# load model and processor
model_id = "hf-audio/vocos-encodec-24khz"
processor = VocosEncodecProcessor.from_pretrained(model_id)
model = VocosEncodecModel.from_pretrained(model_id, device_map="auto")
sampling_rate = processor.feature_extractor.sampling_rate

# load audio sample
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.cast_column("audio", Audio(sampling_rate=sampling_rate))
audio = ds[0]["audio"]["array"]

inputs = processor(audio=audio, bandwidth=bandwidth, sampling_rate=sampling_rate, return_tensors="pt").to(model.device)

print(inputs.input_features.shape) # (batch_size, codebook_dim, num_frames) [1, 128, 440]

outputs = model(**inputs)

audio = outputs.audio

print(audio.shape) # (batch_size, time) [1, 140800]

# save audio to file
write_wav("vocos_encodec.wav", sampling_rate, audio[0].detach().cpu().numpy())
```

### Reconstructing audio from quantized RVQ codes

The EnCodec variant can also process precomputed RVQ codes directly. You can provide quantized audio codes to `VocosEncodecProcessor`, which converts them into embeddings for `VocosEncodecModel`.

```python
import torch

from transformers import VocosEncodecModel, VocosEncodecProcessor

model = VocosEncodecModel.from_pretrained("hf-audio/vocos-encodec-24khz")
processor = VocosEncodecProcessor.from_pretrained("hf-audio/vocos-encodec-24khz")

# 8 codebooks, 200 frames (8 codebooks correspond to the 6 kbps bandwidth)
audio_codes = torch.randint(low=0, high=1024, size=(8, 200))
inputs = processor(codes=audio_codes, bandwidth=6.0)
audio = model(**inputs).audio
```

### Reconstructing audio from Bark tokens

Bark is a text-to-speech model that generates discrete EnCodec RVQ codes from input text, then uses EnCodec's decoder to convert those codes into an audio waveform. Vocos is often paired with Bark in place of EnCodec's decoder to improve audio quality.

Below is an example using the Transformers implementation of [Bark](./bark) to generate quantized codes from text, then decoding them with `VocosEncodecProcessor` and `VocosEncodecModel`:

```python
from transformers import VocosEncodecModel, VocosEncodecProcessor, BarkProcessor, BarkModel
from transformers.models.bark.generation_configuration_bark import BarkSemanticGenerationConfig, BarkCoarseGenerationConfig, BarkFineGenerationConfig
from scipy.io.wavfile import write as write_wav

# load the Bark model and processor
bark_id = "suno/bark-small"
bark_processor = BarkProcessor.from_pretrained(bark_id)
bark = BarkModel.from_pretrained(bark_id, device_map="auto")

text_prompt = "We've been messing around with this new model called Vocos."
bark_inputs = bark_processor(text_prompt, return_tensors="pt").to(bark.device)

# building generation configs for each stage
semantic_generation_config = BarkSemanticGenerationConfig(**bark.generation_config.semantic_config)
coarse_generation_config = BarkCoarseGenerationConfig(**bark.generation_config.coarse_acoustics_config)
fine_generation_config = BarkFineGenerationConfig(**bark.generation_config.fine_acoustics_config)

# generating the RVQ codes
semantic_tokens = bark.semantic.generate(
**bark_inputs,
semantic_generation_config=semantic_generation_config,
)

coarse_tokens = bark.coarse_acoustics.generate(
semantic_tokens,
semantic_generation_config=semantic_generation_config,
coarse_generation_config=coarse_generation_config,
codebook_size=bark.generation_config.codebook_size,
)

fine_tokens = bark.fine_acoustics.generate(
coarse_tokens,
semantic_generation_config=semantic_generation_config,
coarse_generation_config=coarse_generation_config,
fine_generation_config=fine_generation_config,
codebook_size=bark.generation_config.codebook_size,
)

codes = fine_tokens.squeeze(0)  # codes of shape (num_codebooks, num_frames); 8 codebooks here

# Reconstruct audio with Vocos from codes
vocos_id = "hf-audio/vocos-encodec-24khz"
processor = VocosEncodecProcessor.from_pretrained(vocos_id)
model = VocosEncodecModel.from_pretrained(vocos_id, device_map="auto")
sampling_rate = processor.feature_extractor.sampling_rate

# generate audio
inputs = processor(codes=codes.to("cpu"), bandwidth=6.0).to(model.device)
audio = model(**inputs).audio

# save audio to file
write_wav("vocos_bark.wav", sampling_rate, audio[0].detach().cpu().numpy())
```

## VocosEncodecConfig

[[autodoc]] VocosEncodecConfig


## VocosEncodecProcessor

[[autodoc]] VocosEncodecProcessor
- __call__

## VocosEncodecModel

[[autodoc]] VocosEncodecModel
- forward
4 changes: 4 additions & 0 deletions src/transformers/audio_utils.py
@@ -33,12 +33,16 @@
is_librosa_available,
is_numpy_array,
is_soundfile_available,
is_torch_available,
is_torch_tensor,
is_torchcodec_available,
requires_backends,
)


if is_torch_available():
import torch

if TYPE_CHECKING:
import torch

2 changes: 2 additions & 0 deletions src/transformers/models/__init__.py
@@ -407,6 +407,8 @@
from .vits import *
from .vivit import *
from .vjepa2 import *
from .vocos import *
from .vocos_encodec import *
from .voxtral import *
from .wav2vec2 import *
from .wav2vec2_bert import *
4 changes: 4 additions & 0 deletions src/transformers/models/auto/configuration_auto.py
@@ -457,6 +457,8 @@
("vits", "VitsConfig"),
("vivit", "VivitConfig"),
("vjepa2", "VJEPA2Config"),
("vocos", "VocosConfig"),
("vocos_encodec", "VocosEncodecConfig"),
("voxtral", "VoxtralConfig"),
("voxtral_encoder", "VoxtralEncoderConfig"),
("wav2vec2", "Wav2Vec2Config"),
@@ -937,6 +939,8 @@
("vits", "VITS"),
("vivit", "ViViT"),
("vjepa2", "VJEPA2Model"),
("vocos", "Vocos"),
("vocos_encodec", "VocosEncodec"),
("voxtral", "Voxtral"),
("voxtral_encoder", "Voxtral Encoder"),
("wav2vec2", "Wav2Vec2"),
1 change: 1 addition & 0 deletions src/transformers/models/auto/feature_extraction_auto.py
@@ -75,6 +75,7 @@
("unispeech", "Wav2Vec2FeatureExtractor"),
("unispeech-sat", "Wav2Vec2FeatureExtractor"),
("univnet", "UnivNetFeatureExtractor"),
("vocos", "VocosFeatureExtractor"),
("voxtral", "WhisperFeatureExtractor"),
("wav2vec2", "Wav2Vec2FeatureExtractor"),
("wav2vec2-bert", "Wav2Vec2FeatureExtractor"),
2 changes: 2 additions & 0 deletions src/transformers/models/auto/modeling_auto.py
@@ -436,6 +436,8 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
("vits", "VitsModel"),
("vivit", "VivitModel"),
("vjepa2", "VJEPA2Model"),
("vocos", "VocosModel"),
("vocos_encodec", "VocosEncodecModel"),
("voxtral", "VoxtralForConditionalGeneration"),
("voxtral_encoder", "VoxtralEncoder"),
("wav2vec2", "Wav2Vec2Model"),
1 change: 1 addition & 0 deletions src/transformers/models/auto/processing_auto.py
@@ -156,6 +156,7 @@
("vilt", "ViltProcessor"),
("vipllava", "LlavaProcessor"),
("vision-text-dual-encoder", "VisionTextDualEncoderProcessor"),
("vocos", "VocosEncodecProcessor"),
("voxtral", "VoxtralProcessor"),
("wav2vec2", "Wav2Vec2Processor"),
("wav2vec2-bert", "Wav2Vec2Processor"),