Add xcodec2 model #37868
base: main
Changes from 53 commits
**Docs: new model page for X-Codec2** (`@@ -0,0 +1,84 @@`)

<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->
*This model was released on 2025-02-06 and added to Hugging Face Transformers on 2025-04-29.*

# X-Codec2

<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>

## Overview

The X-Codec2 model was proposed in [Llasa: Scaling Train-Time and Inference-Time Compute for Llama-based Speech Synthesis](https://huggingface.co/papers/2502.04128).

X-Codec2 is a neural audio codec designed to improve speech synthesis and general audio generation for large language model (LLM) pipelines. It extends the original X-Codec by refining how semantic and acoustic information is integrated and tokenized, enabling efficient and high-fidelity audio representation.
Its architecture is based on [X-Codec](./xcodec) with several major differences (a short quantization sketch follows the list):

- **Unified Semantic-Acoustic Tokenization**: X-Codec2 fuses outputs from a semantic encoder (e.g., Wav2Vec2-BERT) and an acoustic encoder into a single embedding, capturing both high-level meaning (e.g., text content, emotion) and low-level audio details (e.g., timbre).
- **Single-Stage Vector Quantization (VQ)**: Unlike the multi-layer residual VQ in most approaches (e.g., [X-Codec](./xcodec), [DAC](./dac), [EnCodec](./encodec)), X-Codec2 uses a single-layer Finite Scalar Quantization (FSQ) for stability and compatibility with causal, autoregressive LLMs.
- **Semantic Supervision During Training**: It adds a semantic reconstruction loss, ensuring that the discrete tokens preserve meaningful linguistic and emotional information, which is crucial for TTS tasks.
- **Transformer-Friendly Design**: The 1D token structure of X-Codec2 naturally aligns with the autoregressive modeling in LLMs like LLaMA, improving training efficiency and downstream compatibility.
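To make the single-stage quantization concrete, here is a minimal, self-contained sketch of finite scalar quantization. This is not the PR's implementation; the function name, level counts, and shapes are illustrative only. Each latent dimension is squashed into a small fixed range and rounded to a grid, and the per-dimension grid indices combine (mixed-radix) into a single token id per frame, which is what makes the token stream 1D and LLM-friendly:

```python
import torch

def fsq(z: torch.Tensor, levels: tuple[int, ...] = (7, 5, 5, 5)):
    """Illustrative finite scalar quantization: round each latent dim to a fixed grid."""
    half = (torch.tensor(levels, dtype=z.dtype) - 1) / 2        # grid half-width per dim
    bounded = torch.tanh(z) * half                              # squash into (-half, half)
    quantized = bounded + (bounded.round() - bounded).detach()  # straight-through rounding
    indices = (quantized + half).round().long()                 # per-dim index in [0, levels-1]
    radix = torch.cumprod(torch.tensor((1,) + levels[:-1]), dim=0)
    token_ids = (indices * radix).sum(dim=-1)                   # one integer token per frame
    return quantized, token_ids

z = torch.randn(2, 10, 4)   # (batch, frames, latent dims)
q, ids = fsq(z)
print(ids.shape)            # torch.Size([2, 10]): a single 1D token stream per clip
```

Odd level counts keep the rounding grid symmetric here; the codebook size is the product of the levels (7 * 5 * 5 * 5 = 875 in this toy configuration).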
## Usage example

Here is a quick example of how to encode and decode audio using this model:

```python
>>> import torch
>>> from datasets import Audio, load_dataset
>>> from transformers import AutoFeatureExtractor, Xcodec2Model

>>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"

>>> # load model and feature extractor
>>> model_id = "hf-audio/xcodec2"
>>> model = Xcodec2Model.from_pretrained(model_id).to(torch_device).eval()
>>> feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)

>>> # load data
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
>>> audio = dataset[0]["audio"]["array"]

>>> # prepare data
>>> inputs = feature_extractor(raw_audio=audio, sampling_rate=feature_extractor.sampling_rate, return_tensors="pt").to(torch_device)

>>> # encode and decode
>>> audio_codes = model.encode(inputs["input_values"]).audio_codes
>>> audio_values = model.decode(audio_codes).audio_values

>>> # or the equivalent with a forward pass
>>> model_output = model(inputs["input_values"])
>>> audio_codes = model_output.audio_codes
>>> audio_values = model_output.audio_values
```
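The decoded waveform can then be written to disk. A short follow-up sketch (not part of the PR; it assumes the `soundfile` package is installed and that `audio_values` holds a single clip in the batch):

```python
>>> import soundfile as sf
>>> # move to CPU, drop the batch/channel dims, and save at the codec's sampling rate
>>> sf.write("reconstruction.wav", audio_values[0].squeeze().detach().cpu().numpy(), feature_extractor.sampling_rate)
```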
This model was contributed by [Steven Zheng](https://huggingface.co/Steveeeeeeen) and [Eric Bezzam](https://huggingface.co/bezzam).
The original code can be found [here](https://github.com/zhenye234/X-Codec-2.0).

## Xcodec2Config

[[autodoc]] Xcodec2Config

## Xcodec2Model

[[autodoc]] Xcodec2Model
    - decode
    - encode
    - forward
**Feature extractor: drop the unused `zero_mean_unit_var_norm` helper and fix the fbank docstring** (`@@ -89,36 +89,12 @@ def __init__(`)

```diff
         super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
 
-    @staticmethod
-    # Copied from transformers.models.wav2vec2.feature_extraction_wav2vec2.Wav2Vec2FeatureExtractor.zero_mean_unit_var_norm
-    def zero_mean_unit_var_norm(
-        input_values: list[np.ndarray], attention_mask: list[np.ndarray], padding_value: float = 0.0
-    ) -> list[np.ndarray]:
-        """
-        Every array in the list is normalized to have zero mean and unit variance
-        """
-        if attention_mask is not None:
-            attention_mask = np.array(attention_mask, np.int32)
-            normed_input_values = []
-
-            for vector, length in zip(input_values, attention_mask.sum(-1)):
-                normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
-                if length < normed_slice.shape[0]:
-                    normed_slice[length:] = padding_value
-
-                normed_input_values.append(normed_slice)
-        else:
-            normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]
-
-        return normed_input_values
-
     def _extract_fbank_features(
         self,
         waveform: np.ndarray,
     ) -> np.ndarray:
         """
-        Get mel-filter bank features using TorchAudio. Note that TorchAudio requires 16-bit signed integers as inputs
-        and hence the waveform should not be normalized before feature extraction.
+        Get mel-filter bank features using Numpy method to mimic Kaldi.
         """
         # by default, it extracts the left channel if stereo
         if len(waveform.shape) == 2:
```

> **Contributor:** Update docstring since it wasn't using TorchAudio!
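To see what the removed helper did, here is a standalone demonstration with a hypothetical two-clip batch (real lengths 4 and 2, both padded to length 4): each clip is normalized with statistics from its unpadded portion only, and the padded tail is reset to the padding value.

```python
import numpy as np

input_values = [np.array([1.0, 2.0, 3.0, 4.0]), np.array([5.0, 7.0, 0.0, 0.0])]
attention_mask = np.array([[1, 1, 1, 1], [1, 1, 0, 0]], np.int32)

normed = []
for vector, length in zip(input_values, attention_mask.sum(-1)):
    # statistics over the real (unpadded) samples only
    slice_ = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
    slice_[length:] = 0.0  # padded tail reset to padding_value
    normed.append(slice_)

print(normed[1])  # ~[-1., 1., 0., 0.]: unit variance over the real part, zeros in the tail
```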
**Modeling: fix the `forward` docstring example** (`@@ -539,7 +539,7 @@ def forward(`)

```diff
         >>> inputs = feature_extractor(raw_audio=audio_sample, return_tensors="pt")
 
-        >>> outputs = model(**inputs)
+        >>> outputs = model(inputs["input_values"])
         >>> audio_codes = outputs.audio_codes
         >>> audio_values = outputs.audio_values
         ```
 
```

Review conversation on this change:

> **Contributor:** @eustlb DAC, Xcodec, and Xcodec2 don't support […]
>
> **Contributor:** why is […]
>
> **Contributor:** For the codecs we currently have: […] Is the model inherently incompatible with a padding mask approach, or is it just not implemented in the original codebase?
>
> **Contributor:** Padding has been added to input, see how it's used here.
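For context on the padding-mask discussion above, here is a small illustration (not from the PR; it only reuses the public API shown in the docs) of why a mask matters: without one, the codec tokenizes zero padding just like real audio, so a padded clip simply yields extra tokens.

```python
import numpy as np
import torch
from transformers import AutoFeatureExtractor, Xcodec2Model

model_id = "hf-audio/xcodec2"
model = Xcodec2Model.from_pretrained(model_id).eval()
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)

sr = feature_extractor.sampling_rate
audio = np.random.randn(sr).astype(np.float32)                   # 1 s clip
padded = np.concatenate([audio, np.zeros(sr // 2, np.float32)])  # + 0.5 s of silence padding

with torch.no_grad():
    codes = model.encode(feature_extractor(raw_audio=audio, sampling_rate=sr, return_tensors="pt")["input_values"]).audio_codes
    codes_padded = model.encode(feature_extractor(raw_audio=padded, sampling_rate=sr, return_tensors="pt")["input_values"]).audio_codes

# The padded clip produces extra tokens for the silence; a padding/attention mask
# would be the mechanism for telling the model to ignore those frames.
print(codes.shape, codes_padded.shape)
```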
**New module `__init__` with lazy imports** (`@@ -0,0 +1,28 @@`)

```python
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure


if TYPE_CHECKING:
    from .configuration_xcodec2 import *
    from .feature_extraction_xcodec2 import *
    from .modeling_xcodec2 import *
else:
    import sys

    _file = globals()["__file__"]
    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
```
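This is the standard lazy-module pattern in Transformers: type checkers see the real imports under `TYPE_CHECKING`, while at runtime the module is swapped for a `_LazyModule` that resolves submodules on first attribute access, keeping `import transformers` fast. A quick sketch of the effect (assuming the PR also re-exports the new classes at the top level, as is usual for new models):

```python
# Nothing xcodec2-specific is loaded at this point.
import transformers

# First attribute access triggers lazy resolution of the xcodec2 submodules.
model_cls = transformers.Xcodec2Model
config = transformers.Xcodec2Config()
```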