diff --git a/mteb/models/ast_model.py b/mteb/models/ast_model.py
index 48b56585a5..966877cc91 100644
--- a/mteb/models/ast_model.py
+++ b/mteb/models/ast_model.py
@@ -93,13 +93,17 @@ def get_audio_embeddings(
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> torch.Tensor:
         processed_audio = self._process_audio(audio)
         all_embeddings = []

         with torch.no_grad():
-            for i in tqdm(range(0, len(processed_audio), batch_size)):
+            for i in tqdm(
+                range(0, len(processed_audio), batch_size),
+                disable=not show_progress_bar,
+            ):
                 batch = processed_audio[i : i + batch_size]

                 # AST processes raw waveforms directly through its feature extractor
diff --git a/mteb/models/clap_models.py b/mteb/models/clap_models.py
index 6f03c451a6..a0f29d2ff3 100644
--- a/mteb/models/clap_models.py
+++ b/mteb/models/clap_models.py
@@ -109,13 +109,16 @@ def get_audio_embeddings(
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> np.ndarray:
         all_features = []
         processed_audio = self._process_audio(audio)

         for i in tqdm(
-            range(0, len(processed_audio), batch_size), desc="Processing audio batches"
+            range(0, len(processed_audio), batch_size),
+            desc="Processing audio batches",
+            disable=not show_progress_bar,
         ):
             batch = processed_audio[i : i + batch_size]
             batch_arrays = [tensor.numpy() for tensor in batch]
@@ -125,6 +128,8 @@ def get_audio_embeddings(
                 sampling_rate=self.sampling_rate,
                 return_tensors="pt",
                 padding=True,
+                truncation=True,
+                max_length=30 * self.sampling_rate,  # 30 seconds max
             )

             inputs = {k: v.to(self.device) for k, v in inputs.items()}
diff --git a/mteb/models/cnn14_model.py b/mteb/models/cnn14_model.py
index 4b6035e7ca..1ac11e2225 100644
--- a/mteb/models/cnn14_model.py
+++ b/mteb/models/cnn14_model.py
@@ -92,7 +92,14 @@ def _handle_batch(
     def _convert_audio(self, audio: AudioData) -> torch.Tensor:
         if isinstance(audio, np.ndarray):
             audio = torch.from_numpy(audio)
-        return audio.squeeze()
+        audio = audio.squeeze()
+
+        # Apply audio truncation (30 seconds max)
+        max_length = 30 * self.sampling_rate  # 30 seconds
+        if audio.shape[-1] > max_length:
+            audio = audio[..., :max_length]
+
+        return audio

     def _load_audio_file(self, path: str) -> torch.Tensor:
         waveform, sample_rate = torchaudio.load(path)
@@ -113,13 +120,17 @@ def get_audio_embeddings(
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> torch.Tensor:
         processed_audio = self._process_audio(audio)
         all_embeddings = []

         with torch.no_grad():
-            for i in tqdm(range(0, len(processed_audio), batch_size)):
+            for i in tqdm(
+                range(0, len(processed_audio), batch_size),
+                disable=not show_progress_bar,
+            ):
                 batch = processed_audio[i : i + batch_size]

                 # Convert batch to tensors and move to device
diff --git a/mteb/models/data2vec_models.py b/mteb/models/data2vec_models.py
index de2f72e112..6c5403d197 100644
--- a/mteb/models/data2vec_models.py
+++ b/mteb/models/data2vec_models.py
@@ -103,13 +103,17 @@ def get_audio_embeddings(
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> torch.Tensor:
         processed_audio = self._process_audio(audio)
         all_embeddings = []

         with torch.no_grad():
-            for i in tqdm(range(0, len(processed_audio), batch_size)):
+            for i in tqdm(
+                range(0, len(processed_audio), batch_size),
+                disable=not show_progress_bar,
+            ):
                 batch = processed_audio[i : i + batch_size]

                 # Pre-process audio
@@ -125,6 +129,8 @@ def get_audio_embeddings(
                     sampling_rate=self.sampling_rate,
                     return_tensors="pt",
                     padding="longest",
+                    truncation=True,
+                    max_length=30 * self.sampling_rate,  # 30 seconds max
                     return_attention_mask=True,
                 ).to(self.device)

diff --git a/mteb/models/encodec_model.py b/mteb/models/encodec_model.py
index 28dd2c896c..8f0c4d43da 100644
--- a/mteb/models/encodec_model.py
+++ b/mteb/models/encodec_model.py
@@ -106,13 +106,17 @@ def get_audio_embeddings(
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> torch.Tensor:
         processed_audio = self._process_audio(audio)
         all_embeddings = []

         with torch.no_grad():
-            for i in tqdm(range(0, len(processed_audio), batch_size)):
+            for i in tqdm(
+                range(0, len(processed_audio), batch_size),
+                disable=not show_progress_bar,
+            ):
                 batch = processed_audio[i : i + batch_size]

                 # Process audio through EnCodec's processor
@@ -121,6 +125,8 @@ def get_audio_embeddings(
                     sampling_rate=self.sampling_rate,
                     return_tensors="pt",
                     padding=True,
+                    truncation=True,
+                    max_length=30 * self.sampling_rate,  # 30 seconds max
                 ).to(self.device)

                 # Get the latent representations directly from the encoder
diff --git a/mteb/models/hubert_models.py b/mteb/models/hubert_models.py
index 6c1965e940..cdc5e6bb88 100644
--- a/mteb/models/hubert_models.py
+++ b/mteb/models/hubert_models.py
@@ -103,13 +103,17 @@ def get_audio_embeddings(
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> torch.Tensor:
         processed_audio = self._process_audio(audio)
         all_embeddings = []

         with torch.no_grad():
-            for i in tqdm(range(0, len(processed_audio), batch_size)):
+            for i in tqdm(
+                range(0, len(processed_audio), batch_size),
+                disable=not show_progress_bar,
+            ):
                 batch = processed_audio[i : i + batch_size]

                 # Pre-process like Wav2Vec2
@@ -125,6 +129,8 @@ def get_audio_embeddings(
                     sampling_rate=self.sampling_rate,
                     return_tensors="pt",
                     padding="longest",
+                    truncation=True,
+                    max_length=30 * self.sampling_rate,  # 30 seconds max
                     return_attention_mask=True,
                 ).to(self.device)

diff --git a/mteb/models/mctct_model.py b/mteb/models/mctct_model.py
index ec9273fa6d..ac16b16c61 100644
--- a/mteb/models/mctct_model.py
+++ b/mteb/models/mctct_model.py
@@ -167,13 +167,17 @@ def get_audio_embeddings(
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> torch.Tensor:
         processed_audio = self._process_audio(audio)
         all_embeddings = []

         with torch.no_grad():
-            for i in tqdm(range(0, len(processed_audio), batch_size)):
+            for i in tqdm(
+                range(0, len(processed_audio), batch_size),
+                disable=not show_progress_bar,
+            ):
                 batch = processed_audio[i : i + batch_size]

                 # Process each audio in the batch
@@ -182,6 +186,8 @@ def get_audio_embeddings(
                     sampling_rate=self.sampling_rate,
                     return_tensors="pt",
                     padding=True,
+                    truncation=True,
+                    max_length=30 * self.sampling_rate,  # 30 seconds max
                 ).to(self.device)

                 # Get embeddings from the model
diff --git a/mteb/models/mms_models.py b/mteb/models/mms_models.py
index 760c5725a9..e32be1b065 100644
--- a/mteb/models/mms_models.py
+++ b/mteb/models/mms_models.py
@@ -121,13 +121,17 @@ def get_audio_embeddings(
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> torch.Tensor:
         processed_audio = self._process_audio(audio)
         all_embeddings = []

         with torch.no_grad():
-            for i in tqdm(range(0, len(processed_audio), batch_size)):
+            for i in tqdm(
+                range(0, len(processed_audio), batch_size),
+                disable=not show_progress_bar,
+            ):
                 batch = processed_audio[i : i + batch_size]
                 batch_tensor = self._pad_audio_batch(batch)

@@ -142,6 +146,8 @@ def get_audio_embeddings(
                     sampling_rate=self.sampling_rate,
                     return_tensors="pt",
                     padding="longest",
+                    truncation=True,
+                    max_length=30 * self.sampling_rate,  # 30 seconds max
                     return_attention_mask=True,
                 ).to(self.device)

diff --git a/mteb/models/msclap_models.py b/mteb/models/msclap_models.py
index 761ddfa98d..f9aedc4186 100644
--- a/mteb/models/msclap_models.py
+++ b/mteb/models/msclap_models.py
@@ -93,6 +93,11 @@ def _handle_batch(
                     )
                     audio_array = resampler(audio_array)

+                # Apply audio truncation (30 seconds max)
+                max_length = 30 * self.sampling_rate  # 30 seconds
+                if audio_array.shape[-1] > max_length:
+                    audio_array = audio_array[..., :max_length]
+
                 # Only squeeze here, don't call _convert_audio again
                 waveforms.append(audio_array.squeeze())
             elif "path" in item:
@@ -107,7 +112,14 @@ def _handle_batch(
     def _convert_audio(self, audio: AudioData) -> torch.Tensor:
         if isinstance(audio, np.ndarray):
             audio = torch.from_numpy(audio)
-        return audio.squeeze().float()  # Ensure float32
+        audio = audio.squeeze().float()  # Ensure float32
+
+        # Apply audio truncation (30 seconds max)
+        max_length = 30 * self.sampling_rate  # 30 seconds
+        if audio.shape[-1] > max_length:
+            audio = audio[..., :max_length]
+
+        return audio

     def _load_audio_file(self, path: str) -> torch.Tensor:
         waveform, sample_rate = torchaudio.load(path)
@@ -115,6 +127,12 @@ def _load_audio_file(self, path: str) -> torch.Tensor:

         if sample_rate != self.sampling_rate:
             resampler = torchaudio.transforms.Resample(sample_rate, self.sampling_rate)
             waveform = resampler(waveform)
+
+        # Apply audio truncation (30 seconds max)
+        max_length = 30 * self.sampling_rate  # 30 seconds
+        if waveform.shape[-1] > max_length:
+            waveform = waveform[..., :max_length]
+
         return waveform.squeeze()
@@ -124,13 +142,16 @@ def get_audio_embeddings(
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> np.ndarray:
         all_features = []
         processed_audio = self._process_audio(audio)

         for i in tqdm(
-            range(0, len(processed_audio), batch_size), desc="Processing audio batches"
+            range(0, len(processed_audio), batch_size),
+            desc="Processing audio batches",
+            disable=not show_progress_bar,
         ):
             batch = processed_audio[i : i + batch_size]

diff --git a/mteb/models/muq_mulan_model.py b/mteb/models/muq_mulan_model.py
index ae61b968da..d11a965e98 100644
--- a/mteb/models/muq_mulan_model.py
+++ b/mteb/models/muq_mulan_model.py
@@ -90,7 +90,14 @@ def _convert_audio(self, audio: AudioData) -> torch.Tensor:
         """Convert audio data to torch tensor."""
         if isinstance(audio, np.ndarray):
             audio = torch.from_numpy(audio)
-        return audio.squeeze().float()  # Ensure float32
+        audio = audio.squeeze().float()  # Ensure float32
+
+        # Apply audio truncation (30 seconds max)
+        max_length = 30 * self.target_sampling_rate  # 30 seconds
+        if audio.shape[-1] > max_length:
+            audio = audio[..., :max_length]
+
+        return audio

     def _load_audio_file(self, path: str) -> torch.Tensor:
         """Load audio file and resample to target sampling rate."""
@@ -109,6 +116,7 @@ def get_audio_embeddings(
         self,
         audio: AudioBatch,
         *,
+        show_progress_bar: bool = True,
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
@@ -116,10 +124,13 @@
     ) -> np.ndarray:
         """Get audio embeddings using MuQ-MuLan."""
         all_features = []
+
         processed_audio = self._process_audio(audio)

         for i in tqdm(
-            range(0, len(processed_audio), batch_size), desc="Processing audio batches"
+            range(0, len(processed_audio), batch_size),
+            desc="Processing audio batches",
+            disable=not show_progress_bar,
         ):
             batch = processed_audio[i : i + batch_size]
             batch_features = self._process_audio_batch(batch)
diff --git a/mteb/models/qwen2_models.py b/mteb/models/qwen2_models.py
index fbe660df3c..e176d4f31f 100644
--- a/mteb/models/qwen2_models.py
+++ b/mteb/models/qwen2_models.py
@@ -8,6 +8,7 @@
 import torch
 import torchaudio
 from torch.utils.data import DataLoader
+from tqdm import tqdm
 from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration

 from mteb.encoder_interface import AudioBatch, AudioData, PromptType
@@ -81,7 +82,14 @@ def _convert_audio_from_numpy(self, audio: AudioData) -> torch.Tensor:
             audio = torch.from_numpy(audio)
         if audio.ndim == 2:
             audio = audio.mean(dim=0)
-        return audio.squeeze()
+        audio = audio.squeeze()
+
+        # Apply audio truncation (30 seconds max)
+        max_length = 30 * self.sampling_rate  # 30 seconds
+        if audio.shape[-1] > max_length:
+            audio = audio[..., :max_length]
+
+        return audio

     def _load_audio_file(self, path: str) -> torch.Tensor:
         waveform, sr = torchaudio.load(path)
@@ -90,7 +98,14 @@ def _load_audio_file(self, path: str) -> torch.Tensor:
         if sr != self.sampling_rate:
             resampler = torchaudio.transforms.Resample(sr, self.sampling_rate)
             waveform = resampler(waveform)
-        return waveform.squeeze()
+        waveform = waveform.squeeze()
+
+        # Apply audio truncation (30 seconds max)
+        max_length = 30 * self.sampling_rate  # 30 seconds
+        if waveform.shape[-1] > max_length:
+            waveform = waveform[..., :max_length]
+
+        return waveform

     def _pad_audio_batch(self, batch: list[torch.Tensor]) -> torch.Tensor:
         max_len = max(w.shape[0] for w in batch)
@@ -104,13 +119,16 @@ def get_audio_embeddings(
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> torch.Tensor:
         processed = self._process_audio(audio)
         embeddings_list: list[torch.Tensor] = []

         with torch.no_grad():
-            for i in range(0, len(processed), batch_size):
+            for i in tqdm(
+                range(0, len(processed), batch_size), disable=not show_progress_bar
+            ):
                 batch = processed[i : i + batch_size]

                 audio_list = [w.numpy() for w in batch]
diff --git a/mteb/models/seamlessm4t_models.py b/mteb/models/seamlessm4t_models.py
index ce2cca3305..1fb6b3d0dd 100644
--- a/mteb/models/seamlessm4t_models.py
+++ b/mteb/models/seamlessm4t_models.py
@@ -101,13 +101,17 @@ def get_audio_embeddings(
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> torch.Tensor:
         processed_audio = self._process_audio(audio)
         all_embeddings = []

         with torch.no_grad():
-            for i in tqdm(range(0, len(processed_audio), batch_size)):
+            for i in tqdm(
+                range(0, len(processed_audio), batch_size),
+                disable=not show_progress_bar,
+            ):
                 batch = processed_audio[i : i + batch_size]
                 batch_tensor = self._pad_audio_batch(batch)

@@ -117,6 +121,8 @@ def get_audio_embeddings(
                     sampling_rate=self.sampling_rate,
                     return_tensors="pt",
                     padding=True,
+                    truncation=True,
+                    max_length=30 * self.sampling_rate,  # 30 seconds max
                 ).to(self.device)

                 # Get encodings through the encoder
diff --git a/mteb/models/speecht5_models.py b/mteb/models/speecht5_models.py
index 0693377f23..c0d438744f 100644
--- a/mteb/models/speecht5_models.py
+++ b/mteb/models/speecht5_models.py
@@ -119,13 +119,17 @@ def get_audio_embeddings(
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
         hidden_layer: float = 1.0,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> torch.Tensor:
         processed_audio = self._process_audio(audio)
         all_embeddings = []

         with torch.no_grad():
-            for i in tqdm(range(0, len(processed_audio), batch_size)):
+            for i in tqdm(
+                range(0, len(processed_audio), batch_size),
+                disable=not show_progress_bar,
+            ):
                 batch = processed_audio[i : i + batch_size]
                 batch_tensor = self._pad_audio_batch(batch)

@@ -140,6 +144,8 @@ def get_audio_embeddings(
                     sampling_rate=self.sampling_rate,
                     return_tensors="pt",
                     padding="longest",
+                    truncation=True,
+                    max_length=30 * self.sampling_rate,  # 30 seconds max
                     return_attention_mask=True,
                 ).to(self.device)

diff --git a/mteb/models/unispeech_models.py b/mteb/models/unispeech_models.py
index 7bcde20f44..955997b17b 100644
--- a/mteb/models/unispeech_models.py
+++ b/mteb/models/unispeech_models.py
@@ -103,13 +103,17 @@ def get_audio_embeddings(
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> torch.Tensor:
         processed_audio = self._process_audio(audio)
         all_embeddings = []

         with torch.no_grad():
-            for i in tqdm(range(0, len(processed_audio), batch_size)):
+            for i in tqdm(
+                range(0, len(processed_audio), batch_size),
+                disable=not show_progress_bar,
+            ):
                 batch = processed_audio[i : i + batch_size]

                 # Pre-process like Wav2Vec2
@@ -125,6 +129,8 @@ def get_audio_embeddings(
                     sampling_rate=self.sampling_rate,
                     return_tensors="pt",
                     padding="longest",
+                    truncation=True,
+                    max_length=30 * self.sampling_rate,  # 30 seconds max
                     return_attention_mask=True,
                 ).to(self.device)

diff --git a/mteb/models/vggish_models.py b/mteb/models/vggish_models.py
index 528a42b04d..e132a0c691 100644
--- a/mteb/models/vggish_models.py
+++ b/mteb/models/vggish_models.py
@@ -60,6 +60,11 @@ def _normalize_audio(self, audio):
         if audio.ndim > 1:
             audio = audio.mean(dim=0)

+        # Apply audio truncation (30 seconds max)
+        max_length = 30 * self.sampling_rate  # 30 seconds
+        if audio.shape[-1] > max_length:
+            audio = audio[..., :max_length]
+
         # Normalize to [-1.0, 1.0]
         if audio.abs().max() > 1.0:
             audio = audio / audio.abs().max()
@@ -156,7 +161,14 @@ def _prepare_input_tensor(self, audio_data):
         return input_tensor

     def get_audio_embeddings(
-        self, audio, *, task_name=None, prompt_type=None, batch_size=4, **kwargs
+        self,
+        audio,
+        *,
+        task_name=None,
+        prompt_type=None,
+        batch_size=4,
+        show_progress_bar=True,
+        **kwargs,
     ):
         """Generate embeddings for audio inputs."""
         processed_audio = self._process_audio(audio)
@@ -164,7 +176,9 @@ def get_audio_embeddings(

         with torch.no_grad():
             for i in tqdm(
-                range(0, len(processed_audio), batch_size), desc="Processing audio"
+                range(0, len(processed_audio), batch_size),
+                desc="Processing audio",
+                disable=not show_progress_bar,
             ):
                 batch = processed_audio[i : i + batch_size]
                 batch_embeddings = []
diff --git a/mteb/models/wav2clip_model.py b/mteb/models/wav2clip_model.py
index 0555cebb1d..b02885b475 100644
--- a/mteb/models/wav2clip_model.py
+++ b/mteb/models/wav2clip_model.py
@@ -63,6 +63,12 @@ def _handle_batch(
                         item["sampling_rate"], self.sampling_rate
                     )
                     tensor = resampler(tensor)
+
+                # Apply audio truncation (30 seconds max)
+                max_length = 30 * self.sampling_rate  # 30 seconds
+                if tensor.shape[-1] > max_length:
+                    tensor = tensor[..., :max_length]
+
                 waveforms.append(tensor)

             # dict with path
@@ -99,6 +105,7 @@ def get_audio_embeddings(
         self,
         audio: AudioBatch,
         *,
+        show_progress_bar: bool = True,
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
@@ -108,7 +115,9 @@ def get_audio_embeddings(

         if isinstance(audio, DataLoader):
             # Process each DataLoader batch separately
-            for batch in tqdm(audio, desc="Processing audio batches"):
+            for batch in tqdm(
+                audio, desc="Processing audio batches", disable=not show_progress_bar
+            ):
                 wavs = self._handle_batch(batch)
                 batch_embeddings = self._process_audio_batch(wavs, batch_size)
                 all_embeddings.extend(batch_embeddings)
diff --git a/mteb/models/wav2vec2_models.py b/mteb/models/wav2vec2_models.py
index ea7c17d45c..229e315065 100644
--- a/mteb/models/wav2vec2_models.py
+++ b/mteb/models/wav2vec2_models.py
@@ -159,13 +159,17 @@ def get_audio_embeddings(
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> torch.Tensor:
         processed_audio = self._process_audio(audio)
         all_embeddings = []

         with torch.no_grad():
-            for i in tqdm(range(0, len(processed_audio), batch_size)):
+            for i in tqdm(
+                range(0, len(processed_audio), batch_size),
+                disable=not show_progress_bar,
+            ):
                 batch = processed_audio[i : i + batch_size]

                 inputs = self.feature_extractor(
@@ -173,6 +177,8 @@ def get_audio_embeddings(
                     sampling_rate=self.sampling_rate,
                     return_tensors="pt",
                     padding=True,
+                    truncation=True,
+                    max_length=30 * self.sampling_rate,  # 30 seconds max
                     return_attention_mask=True,
                 ).to(self.device)

diff --git a/mteb/models/wavlm_models.py b/mteb/models/wavlm_models.py
index 2c2b117a6e..11fc5dbdaa 100644
--- a/mteb/models/wavlm_models.py
+++ b/mteb/models/wavlm_models.py
@@ -8,6 +8,7 @@
 import torch
 import torchaudio
 from torch.utils.data import DataLoader
+from tqdm import tqdm
 from transformers import Wav2Vec2FeatureExtractor, WavLMModel

 from mteb.encoder_interface import AudioBatch, AudioData, PromptType
@@ -108,13 +109,17 @@ def get_audio_embeddings(
         task_name: str | None = None,
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> torch.Tensor:
         processed_audio = self._process_audio(audio)
         all_embeddings = []

         with torch.no_grad():
-            for i in range(0, len(processed_audio), batch_size):
+            for i in tqdm(
+                range(0, len(processed_audio), batch_size),
+                disable=not show_progress_bar,
+            ):
                 batch = processed_audio[i : i + batch_size]
                 batch_tensor = self._pad_audio_batch(batch)

diff --git a/mteb/models/whisper_models.py b/mteb/models/whisper_models.py
index 64a740b817..b8e0d7079b 100644
--- a/mteb/models/whisper_models.py
+++ b/mteb/models/whisper_models.py
@@ -106,13 +106,17 @@ def get_audio_embeddings(
         prompt_type: PromptType | None = None,
         batch_size: int = 4,
         hidden_layer: float = 1.0,
+        show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> torch.Tensor:
         processed_audio = self._process_audio(audio)
         all_embeddings = []

         with torch.no_grad():
-            for i in tqdm(range(0, len(processed_audio), batch_size)):
+            for i in tqdm(
+                range(0, len(processed_audio), batch_size),
+                disable=not show_progress_bar,
+            ):
                 batch = processed_audio[i : i + batch_size]
                 batch_arrays = [tensor.numpy() for tensor in batch]

@@ -121,7 +125,8 @@ def get_audio_embeddings(
                     sampling_rate=self.sampling_rate,
                     return_tensors="pt",
                     padding="max_length",
-                    max_length=None,
+                    truncation=True,
+                    max_length=30 * self.sampling_rate,  # 30 seconds max
                     return_attention_mask=True,
                 ).to(self.device)

diff --git a/mteb/models/yamnet_models.py b/mteb/models/yamnet_models.py
index 7efd8aae18..36d7617682 100644
--- a/mteb/models/yamnet_models.py
+++ b/mteb/models/yamnet_models.py
@@ -60,6 +60,11 @@ def _normalize_audio(self, audio):
         if audio.ndim > 1:
             audio = audio.mean(dim=0)

+        # Apply audio truncation (30 seconds max)
+        max_length = 30 * self.sampling_rate  # 30 seconds
+        if audio.shape[-1] > max_length:
+            audio = audio[..., :max_length]
+
         # Normalize to [-1.0, 1.0]
         if audio.abs().max() > 1.0:
             audio = audio / audio.abs().max()
@@ -156,7 +161,14 @@ def _prepare_input_tensor(self, audio_data):
         return input_tensor

     def get_audio_embeddings(
-        self, audio, *, task_name=None, prompt_type=None, batch_size=4, **kwargs
+        self,
+        audio,
+        *,
+        task_name=None,
+        prompt_type=None,
+        batch_size=4,
+        show_progress_bar=True,
+        **kwargs,
     ):
         """Generate embeddings for audio inputs."""
         processed_audio = self._process_audio(audio)
@@ -164,7 +176,9 @@ def get_audio_embeddings(

         with torch.no_grad():
             for i in tqdm(
-                range(0, len(processed_audio), batch_size), desc="Processing audio"
+                range(0, len(processed_audio), batch_size),
+                desc="Processing audio",
+                disable=not show_progress_bar,
             ):
                 batch = processed_audio[i : i + batch_size]
                 batch_embeddings = []