diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index cc45ed1b29fa..04bacbd312d7 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -479,8 +479,11 @@ def _convert_token_to_id(self, token):
         return self.encoder.get(token, self.encoder.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.decoder.get(index, self.decoder.get(self.unk_token_id))
+        """
+        Converts an index (integer) into a token (str) using the vocab. Whisper's base tokenizer always decodes OOV
+        tokens as "", thus we do not use the `unk_token` here.
+        """
+        return self.decoder.get(index, "")
 
     def _normalize(self, text):
         """