6 changes: 5 additions & 1 deletion mteb/models/ast_model.py
@@ -93,13 +93,17 @@ def get_audio_embeddings(
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> torch.Tensor:
         processed_audio = self._process_audio(audio)
         all_embeddings = []
 
         with torch.no_grad():
-            for i in tqdm(range(0, len(processed_audio), batch_size)):
+            for i in tqdm(
+                range(0, len(processed_audio), batch_size),
+                disable=not show_progress_bar,
+            ):
                 batch = processed_audio[i : i + batch_size]
 
                 # AST processes raw waveforms directly through its feature extractor
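The change above is the whole pattern: tqdm already ships a `disable` flag that turns the bar into a no-op iterator, so each model only needs to thread `show_progress_bar` through to it. A minimal sketch of the pattern in isolation (the `encode_batches` helper and the dummy data are illustrative, not part of the PR):

import torch
from tqdm import tqdm

def encode_batches(
    items: list[torch.Tensor],
    batch_size: int = 4,
    show_progress_bar: bool = True,
) -> list[torch.Tensor]:
    """Batch an input list; the progress bar renders only when requested."""
    outputs: list[torch.Tensor] = []
    with torch.no_grad():
        for i in tqdm(
            range(0, len(items), batch_size),
            disable=not show_progress_bar,  # tqdm's built-in off switch
        ):
            batch = items[i : i + batch_size]
            # Stand-in for a real model forward pass.
            outputs.extend(t.mean(dim=-1, keepdim=True) for t in batch)
    return outputs

# No bar is drawn when callers (e.g. evaluation harnesses) opt out:
embeddings = encode_batches(
    [torch.randn(16_000) for _ in range(10)], show_progress_bar=False
)

The same edit repeats nearly verbatim in every file below, so only per-file specifics are noted from here on.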
7 changes: 6 additions & 1 deletion mteb/models/clap_models.py
@@ -109,13 +109,16 @@ def get_audio_embeddings(
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> np.ndarray:
         all_features = []
         processed_audio = self._process_audio(audio)
 
         for i in tqdm(
-            range(0, len(processed_audio), batch_size), desc="Processing audio batches"
+            range(0, len(processed_audio), batch_size),
+            desc="Processing audio batches",
+            disable=not show_progress_bar,
         ):
             batch = processed_audio[i : i + batch_size]
             batch_arrays = [tensor.numpy() for tensor in batch]
@@ -125,6 +128,8 @@ def get_audio_embeddings(
                 sampling_rate=self.sampling_rate,
                 return_tensors="pt",
                 padding=True,
+                truncation=True,
+                max_length=30 * self.sampling_rate,  # 30 seconds max
             )
             inputs = {k: v.to(self.device) for k, v in inputs.items()}
 
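Here the 30-second cap rides on the Hugging Face feature extractor instead of manual slicing: `truncation=True` plus a `max_length` expressed in samples clips each waveform before padding. A sketch of the behaviour using `Wav2Vec2FeatureExtractor` (the checkpoint name and 16 kHz rate are stand-ins; the PR assumes CLAP's processor accepts the same keywords):

import numpy as np
from transformers import Wav2Vec2FeatureExtractor

sampling_rate = 16_000
extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")

long_clip = np.random.randn(45 * sampling_rate).astype(np.float32)   # 45 s
short_clip = np.random.randn(5 * sampling_rate).astype(np.float32)   # 5 s

inputs = extractor(
    [long_clip, short_clip],
    sampling_rate=sampling_rate,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=30 * sampling_rate,  # cap every clip at 30 s of samples
)
# The long clip is cut to 30 s; the short one is padded up to that length.
print(inputs["input_values"].shape)  # torch.Size([2, 480000])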
15 changes: 13 additions & 2 deletions mteb/models/cnn14_model.py
@@ -92,7 +92,14 @@ def _handle_batch(
     def _convert_audio(self, audio: AudioData) -> torch.Tensor:
         if isinstance(audio, np.ndarray):
             audio = torch.from_numpy(audio)
-        return audio.squeeze()
+        audio = audio.squeeze()
+
+        # Apply audio truncation (30 seconds max)
+        max_length = 30 * self.sampling_rate  # 30 seconds
+        if audio.shape[-1] > max_length:
+            audio = audio[..., :max_length]
+
+        return audio
 
     def _load_audio_file(self, path: str) -> torch.Tensor:
         waveform, sample_rate = torchaudio.load(path)
@@ -113,13 +120,17 @@ def get_audio_embeddings(
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> torch.Tensor:
         processed_audio = self._process_audio(audio)
         all_embeddings = []
 
         with torch.no_grad():
-            for i in tqdm(range(0, len(processed_audio), batch_size)):
+            for i in tqdm(
+                range(0, len(processed_audio), batch_size),
+                disable=not show_progress_bar,
+            ):
                 batch = processed_audio[i : i + batch_size]
 
                 # Convert batch to tensors and move to device
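CNN14 has no Hugging Face processor in front of it, so the cap is applied to the raw tensor. Slicing the last dimension keeps the guard shape-agnostic: mono `(num_samples,)` and channel-first `(channels, num_samples)` tensors both work, and short clips pass through unchanged. A standalone sketch (the 32 kHz rate matches CNN14's usual configuration but is an assumption here):

import torch

def truncate_waveform(
    audio: torch.Tensor, sampling_rate: int = 32_000, max_seconds: int = 30
) -> torch.Tensor:
    """Clip a waveform to at most max_seconds, whatever its channel layout."""
    max_length = max_seconds * sampling_rate
    if audio.shape[-1] > max_length:
        audio = audio[..., :max_length]  # Ellipsis leaves leading dims intact
    return audio

mono = torch.randn(40 * 32_000)        # 40 s, mono
stereo = torch.randn(2, 40 * 32_000)   # 40 s, stereo
assert truncate_waveform(mono).shape == (30 * 32_000,)
assert truncate_waveform(stereo).shape == (2, 30 * 32_000)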
8 changes: 7 additions & 1 deletion mteb/models/data2vec_models.py
@@ -103,13 +103,17 @@ def get_audio_embeddings(
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> torch.Tensor:
         processed_audio = self._process_audio(audio)
         all_embeddings = []
 
         with torch.no_grad():
-            for i in tqdm(range(0, len(processed_audio), batch_size)):
+            for i in tqdm(
+                range(0, len(processed_audio), batch_size),
+                disable=not show_progress_bar,
+            ):
                 batch = processed_audio[i : i + batch_size]
 
                 # Pre-process audio
@@ -125,6 +129,8 @@ def get_audio_embeddings(
                     sampling_rate=self.sampling_rate,
                     return_tensors="pt",
                     padding="longest",
+                    truncation=True,
+                    max_length=30 * self.sampling_rate,  # 30 seconds max
                     return_attention_mask=True,
                 ).to(self.device)
 
8 changes: 7 additions & 1 deletion mteb/models/encodec_model.py
@@ -106,13 +106,17 @@ def get_audio_embeddings(
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> torch.Tensor:
         processed_audio = self._process_audio(audio)
         all_embeddings = []
 
         with torch.no_grad():
-            for i in tqdm(range(0, len(processed_audio), batch_size)):
+            for i in tqdm(
+                range(0, len(processed_audio), batch_size),
+                disable=not show_progress_bar,
+            ):
                 batch = processed_audio[i : i + batch_size]
 
                 # Process audio through EnCodec's processor
@@ -121,6 +125,8 @@ def get_audio_embeddings(
                     sampling_rate=self.sampling_rate,
                     return_tensors="pt",
                     padding=True,
+                    truncation=True,
+                    max_length=30 * self.sampling_rate,  # 30 seconds max
                 ).to(self.device)
 
                 # Get the latent representations directly from the encoder
8 changes: 7 additions & 1 deletion mteb/models/hubert_models.py
@@ -103,13 +103,17 @@ def get_audio_embeddings(
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> torch.Tensor:
         processed_audio = self._process_audio(audio)
         all_embeddings = []
 
         with torch.no_grad():
-            for i in tqdm(range(0, len(processed_audio), batch_size)):
+            for i in tqdm(
+                range(0, len(processed_audio), batch_size),
+                disable=not show_progress_bar,
+            ):
                 batch = processed_audio[i : i + batch_size]
 
                 # Pre-process like Wav2Vec2
@@ -125,6 +129,8 @@ def get_audio_embeddings(
                     sampling_rate=self.sampling_rate,
                     return_tensors="pt",
                     padding="longest",
+                    truncation=True,
+                    max_length=30 * self.sampling_rate,  # 30 seconds max
                     return_attention_mask=True,
                 ).to(self.device)
 
8 changes: 7 additions & 1 deletion mteb/models/mctct_model.py
@@ -167,13 +167,17 @@ def get_audio_embeddings(
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> torch.Tensor:
         processed_audio = self._process_audio(audio)
         all_embeddings = []
 
         with torch.no_grad():
-            for i in tqdm(range(0, len(processed_audio), batch_size)):
+            for i in tqdm(
+                range(0, len(processed_audio), batch_size),
+                disable=not show_progress_bar,
+            ):
                 batch = processed_audio[i : i + batch_size]
 
                 # Process each audio in the batch
@@ -182,6 +186,8 @@ def get_audio_embeddings(
                     sampling_rate=self.sampling_rate,
                     return_tensors="pt",
                     padding=True,
+                    truncation=True,
+                    max_length=30 * self.sampling_rate,  # 30 seconds max
                 ).to(self.device)
 
                 # Get embeddings from the model
8 changes: 7 additions & 1 deletion mteb/models/mms_models.py
@@ -121,13 +121,17 @@ def get_audio_embeddings(
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> torch.Tensor:
         processed_audio = self._process_audio(audio)
         all_embeddings = []
 
         with torch.no_grad():
-            for i in tqdm(range(0, len(processed_audio), batch_size)):
+            for i in tqdm(
+                range(0, len(processed_audio), batch_size),
+                disable=not show_progress_bar,
+            ):
                 batch = processed_audio[i : i + batch_size]
 
                 batch_tensor = self._pad_audio_batch(batch)
@@ -142,6 +146,8 @@ def get_audio_embeddings(
                     sampling_rate=self.sampling_rate,
                     return_tensors="pt",
                     padding="longest",
+                    truncation=True,
+                    max_length=30 * self.sampling_rate,  # 30 seconds max
                     return_attention_mask=True,
                 ).to(self.device)
 
25 changes: 23 additions & 2 deletions mteb/models/msclap_models.py
@@ -93,6 +93,11 @@ def _handle_batch(
                 )
                 audio_array = resampler(audio_array)
 
+                # Apply audio truncation (30 seconds max)
+                max_length = 30 * self.sampling_rate  # 30 seconds
+                if audio_array.shape[-1] > max_length:
+                    audio_array = audio_array[..., :max_length]
+
                 # Only squeeze here, don't call _convert_audio again
                 waveforms.append(audio_array.squeeze())
             elif "path" in item:
@@ -107,14 +112,27 @@ def _handle_batch(
     def _convert_audio(self, audio: AudioData) -> torch.Tensor:
         if isinstance(audio, np.ndarray):
             audio = torch.from_numpy(audio)
-        return audio.squeeze().float()  # Ensure float32
+        audio = audio.squeeze().float()  # Ensure float32
+
+        # Apply audio truncation (30 seconds max)
+        max_length = 30 * self.sampling_rate  # 30 seconds
+        if audio.shape[-1] > max_length:
+            audio = audio[..., :max_length]
+
+        return audio
 
     def _load_audio_file(self, path: str) -> torch.Tensor:
         waveform, sample_rate = torchaudio.load(path)
         waveform = waveform.float()  # Ensure float32
         if sample_rate != self.sampling_rate:
             resampler = torchaudio.transforms.Resample(sample_rate, self.sampling_rate)
             waveform = resampler(waveform)
+
+        # Apply audio truncation (30 seconds max)
+        max_length = 30 * self.sampling_rate  # 30 seconds
+        if waveform.shape[-1] > max_length:
+            waveform = waveform[..., :max_length]
+
         return waveform.squeeze()
 
     def get_audio_embeddings(
@@ -124,13 +142,16 @@ def get_audio_embeddings(
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> np.ndarray:
         all_features = []
         processed_audio = self._process_audio(audio)
 
         for i in tqdm(
-            range(0, len(processed_audio), batch_size), desc="Processing audio batches"
+            range(0, len(processed_audio), batch_size),
+            desc="Processing audio batches",
+            disable=not show_progress_bar,
         ):
             batch = processed_audio[i : i + batch_size]
 
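Note the ordering inside `_load_audio_file`: resample first, truncate second, so the `30 * self.sampling_rate` budget is always measured in target-rate samples (truncating before resampling would cut a different duration whenever the source rate differs). A sketch of that pipeline (the path argument and 44.1 kHz target are placeholders, not MSCLAP's confirmed rate):

import torch
import torchaudio

def load_clip(
    path: str, target_rate: int = 44_100, max_seconds: int = 30
) -> torch.Tensor:
    waveform, source_rate = torchaudio.load(path)
    waveform = waveform.float()  # float32 for downstream models
    if source_rate != target_rate:
        # Resample first so the sample budget below is in target-rate units.
        waveform = torchaudio.transforms.Resample(source_rate, target_rate)(waveform)
    max_length = max_seconds * target_rate
    if waveform.shape[-1] > max_length:
        waveform = waveform[..., :max_length]
    return waveform.squeeze()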
15 changes: 13 additions & 2 deletions mteb/models/muq_mulan_model.py
@@ -90,7 +90,14 @@ def _convert_audio(self, audio: AudioData) -> torch.Tensor:
         """Convert audio data to torch tensor."""
         if isinstance(audio, np.ndarray):
             audio = torch.from_numpy(audio)
-        return audio.squeeze().float()  # Ensure float32
+        audio = audio.squeeze().float()  # Ensure float32
+
+        # Apply audio truncation (30 seconds max)
+        max_length = 30 * self.target_sampling_rate  # 30 seconds
+        if audio.shape[-1] > max_length:
+            audio = audio[..., :max_length]
+
+        return audio
 
     def _load_audio_file(self, path: str) -> torch.Tensor:
         """Load audio file and resample to target sampling rate."""
@@ -109,17 +116,21 @@ def get_audio_embeddings(
         self,
         audio: AudioBatch,
         *,
+        show_progress_bar: bool = True,
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
         **kwargs: Any,
     ) -> np.ndarray:
         """Get audio embeddings using MuQ-MuLan."""
         all_features = []
+
         processed_audio = self._process_audio(audio)
 
         for i in tqdm(
-            range(0, len(processed_audio), batch_size), desc="Processing audio batches"
+            range(0, len(processed_audio), batch_size),
+            desc="Processing audio batches",
+            disable=not show_progress_bar,
         ):
             batch = processed_audio[i : i + batch_size]
             batch_features = self._process_audio_batch(batch)
24 changes: 21 additions & 3 deletions mteb/models/qwen2_models.py
@@ -8,6 +8,7 @@
 import torch
 import torchaudio
 from torch.utils.data import DataLoader
+from tqdm import tqdm
 from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
 
 from mteb.encoder_interface import AudioBatch, AudioData, PromptType
@@ -81,7 +82,14 @@ def _convert_audio_from_numpy(self, audio: AudioData) -> torch.Tensor:
            audio = torch.from_numpy(audio)
        if audio.ndim == 2:
            audio = audio.mean(dim=0)
-       return audio.squeeze()
+       audio = audio.squeeze()
+
+       # Apply audio truncation (30 seconds max)
+       max_length = 30 * self.sampling_rate  # 30 seconds
+       if audio.shape[-1] > max_length:
+           audio = audio[..., :max_length]
+
+       return audio
 
    def _load_audio_file(self, path: str) -> torch.Tensor:
        waveform, sr = torchaudio.load(path)
@@ -90,7 +98,14 @@ def _load_audio_file(self, path: str) -> torch.Tensor:
        if sr != self.sampling_rate:
            resampler = torchaudio.transforms.Resample(sr, self.sampling_rate)
            waveform = resampler(waveform)
-       return waveform.squeeze()
+       waveform = waveform.squeeze()
+
+       # Apply audio truncation (30 seconds max)
+       max_length = 30 * self.sampling_rate  # 30 seconds
+       if waveform.shape[-1] > max_length:
+           waveform = waveform[..., :max_length]
+
+       return waveform
 
    def _pad_audio_batch(self, batch: list[torch.Tensor]) -> torch.Tensor:
        max_len = max(w.shape[0] for w in batch)
@@ -104,13 +119,17 @@ def get_audio_embeddings(
        task_name: str | None = None,
        prompt_type: PromptType | None = None,
        batch_size: int = 4,
+       show_progress_bar: bool = True,
        **kwargs: Any,
    ) -> torch.Tensor:
        processed = self._process_audio(audio)
        embeddings_list: list[torch.Tensor] = []
 
        with torch.no_grad():
-           for i in range(0, len(processed), batch_size):
+           for i in tqdm(
+               range(0, len(processed), batch_size), disable=not show_progress_bar
+           ):
                batch = processed[i : i + batch_size]
 
                audio_list = [w.numpy() for w in batch]
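Because every clip is now truncated before batching, the visible `_pad_audio_batch` helper only ever pads up to the 30-second bound. A sketch of the right-padding idiom its one visible line suggests (the body shown is a guess extrapolated from `max_len = max(w.shape[0] for w in batch)`, not the PR's actual implementation):

import torch
import torch.nn.functional as F

def pad_audio_batch(batch: list[torch.Tensor]) -> torch.Tensor:
    """Zero-pad 1-D waveforms on the right and stack into (batch, max_len)."""
    max_len = max(w.shape[0] for w in batch)
    padded = [F.pad(w, (0, max_len - w.shape[0])) for w in batch]
    return torch.stack(padded)

batch = [torch.randn(16_000), torch.randn(48_000)]
print(pad_audio_batch(batch).shape)  # torch.Size([2, 48000])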