diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py
index f3de341d8895..4822e9e9d7fa 100644
--- a/src/transformers/pipelines/automatic_speech_recognition.py
+++ b/src/transformers/pipelines/automatic_speech_recognition.py
@@ -440,6 +440,7 @@ def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None):
                     truncation=False,
                     padding="longest",
                     return_tensors="pt",
+                    return_attention_mask=True,
                 )
             else:
                 if self.type == "seq2seq_whisper" and stride is None:
@@ -448,13 +449,16 @@ def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None):
                         sampling_rate=self.feature_extractor.sampling_rate,
                         return_tensors="pt",
                         return_token_timestamps=True,
+                        return_attention_mask=True,
                     )
                     extra["num_frames"] = processed.pop("num_frames")
                 else:
                     processed = self.feature_extractor(
-                        inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt"
+                        inputs,
+                        sampling_rate=self.feature_extractor.sampling_rate,
+                        return_tensors="pt",
+                        return_attention_mask=True,
+                    )
-
             if self.torch_dtype is not None:
                 processed = processed.to(dtype=self.torch_dtype)
             if stride is not None:
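
For context, the effect of the `return_attention_mask=True` flag that this patch threads through the pipeline's feature-extractor calls can be seen on a feature extractor directly. The snippet below is a minimal sketch, not part of the patch: the checkpoint name and the dummy audio array are illustrative assumptions, and it simply shows that the returned batch now carries an `attention_mask` alongside `input_features`.

```python
# Minimal sketch, assuming a standard Whisper checkpoint; the checkpoint name
# and the dummy audio array are illustrative, not taken from the patch.
import numpy as np
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")

# Two seconds of silence at the extractor's sampling rate stands in for real speech.
audio = np.zeros(2 * feature_extractor.sampling_rate, dtype=np.float32)

processed = feature_extractor(
    audio,
    sampling_rate=feature_extractor.sampling_rate,
    return_tensors="pt",
    return_attention_mask=True,  # the flag added to the pipeline's preprocess step
)

# With the flag set, the batch exposes an attention_mask in addition to input_features.
print(sorted(processed.keys()))
```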