Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/source/en/index.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,7 @@ Flax), PyTorch, and/or TensorFlow.
| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ |
| Wav2Vec2-Conformer | ❌ | ❌ | ✅ | ❌ | ❌ |
| WavLM | ❌ | ❌ | ✅ | ❌ | ❌ |
| Whisper | ✅ | ❌ | ✅ | ✅ | ✅ |
| Whisper | ✅ | ✅ | ✅ | ✅ | ✅ |
| X-CLIP | ❌ | ❌ | ✅ | ❌ | ❌ |
| X-MOD | ❌ | ❌ | ✅ | ❌ | ❌ |
| XGLM | ✅ | ✅ | ✅ | ✅ | ✅ |
Expand Down
9 changes: 9 additions & 0 deletions docs/source/en/model_doc/whisper.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,15 @@ The original code can be found [here](https://github.com/openai/whisper).
- create_token_type_ids_from_sequences
- save_vocabulary

## WhisperTokenizerFast

[[autodoc]] WhisperTokenizerFast
- set_prefix_tokens
- build_inputs_with_special_tokens
- get_special_tokens_mask
- create_token_type_ids_from_sequences
- save_vocabulary

## WhisperFeatureExtractor

[[autodoc]] WhisperFeatureExtractor
Expand Down
2 changes: 2 additions & 0 deletions src/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -739,6 +739,7 @@
_import_structure["models.splinter"].append("SplinterTokenizerFast")
_import_structure["models.squeezebert"].append("SqueezeBertTokenizerFast")
_import_structure["models.t5"].append("T5TokenizerFast")
_import_structure["models.whisper"].append("WhisperTokenizerFast")
_import_structure["models.xglm"].append("XGLMTokenizerFast")
_import_structure["models.xlm_roberta"].append("XLMRobertaTokenizerFast")
_import_structure["models.xlnet"].append("XLNetTokenizerFast")
Expand Down Expand Up @@ -4278,6 +4279,7 @@
from .models.splinter import SplinterTokenizerFast
from .models.squeezebert import SqueezeBertTokenizerFast
from .models.t5 import T5TokenizerFast
from .models.whisper import WhisperTokenizerFast
from .models.xglm import XGLMTokenizerFast
from .models.xlm_roberta import XLMRobertaTokenizerFast
from .models.xlnet import XLNetTokenizerFast
Expand Down
39 changes: 38 additions & 1 deletion src/transformers/convert_slow_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def converted(self) -> Tokenizer:
bos = self.original_tokenizer.bos_token
bos_token_id = self.original_tokenizer.bos_token_id
tokenizer.post_processor = processors.TemplateProcessing(
single=f"{bos}:0 $A:0", # token_type_id is 2 for Funnel transformer
single=f"{bos}:0 $A:0",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure there's a reason why this is modified?

Copy link
Contributor Author

@jonatanklosko jonatanklosko Jan 23, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this comment is a leftover from copying the FunnelConverter: the template here doesn't use a `:2` token type id anywhere, whereas the Funnel template does:

single=f"{cls}:2 $A:0 {sep}:0", # token_type_id is 2 for Funnel transformer

(I just noticed that when adding the WhisperConverter based on the GPT2 one)

pair=f"{bos}:0 $A:0 $B:1",
special_tokens=[
(bos, bos_token_id),
Expand Down Expand Up @@ -891,6 +891,42 @@ def post_processor(self):
)


class WhisperConverter(Converter):
    """Builds a `Tokenizer` from the tokenizer stored in `self.original_tokenizer`."""

    def converted(self) -> Tokenizer:
        """
        Assemble the converted tokenizer.

        The result uses a BPE model fed with the original tokenizer's `encoder` (vocabulary) and the keys of its
        `bpe_ranks` (merges), a `ByteLevel` pre-tokenizer and decoder, and a `TemplateProcessing` post-processor
        that places the original tokenizer's prefix tokens before the sequence(s) and its EOS token after them.

        Returns:
            The configured `Tokenizer`.
        """
        slow = self.original_tokenizer

        bpe_model = BPE(
            vocab=slow.encoder,
            merges=list(slow.bpe_ranks.keys()),
            dropout=None,
            continuing_subword_prefix="",
            end_of_word_suffix="",
            fuse_unk=False,
        )
        fast = Tokenizer(bpe_model)
        fast.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=slow.add_prefix_space)
        fast.decoder = decoders.ByteLevel()

        # The prefix tokens are read as ids and mapped back to their string form for the template.
        prefix_ids = slow.prefix_tokens
        prefixes = slow.convert_ids_to_tokens(prefix_ids)
        eos = slow.eos_token
        eos_token_id = slow.eos_token_id

        # Every prefix token gets type id 0; in the pair template the second sequence and EOS get type id 1.
        prefix_template = " ".join(f"{token}:0" for token in prefixes)

        # Each special token named in the templates is listed with its id: EOS first, then the prefixes.
        special_tokens = [(eos, eos_token_id)]
        special_tokens.extend(zip(prefixes, prefix_ids))

        fast.post_processor = processors.TemplateProcessing(
            single=f"{prefix_template} $A:0 {eos}:0",
            pair=f"{prefix_template} $A:0 $B:1 {eos}:1",
            special_tokens=special_tokens,
        )

        return fast


class BigBirdConverter(SpmConverter):
def post_processor(self):
return processors.TemplateProcessing(
Expand Down Expand Up @@ -1127,6 +1163,7 @@ def converted(self) -> Tokenizer:
"RoFormerTokenizer": RoFormerConverter,
"SqueezeBertTokenizer": BertConverter,
"T5Tokenizer": T5Converter,
"WhisperTokenizer": WhisperConverter,
"XLMRobertaTokenizer": XLMRobertaConverter,
"XLNetTokenizer": XLNetConverter,
"SplinterTokenizer": SplinterConverter,
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/auto/tokenization_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@
("wav2vec2", ("Wav2Vec2CTCTokenizer", None)),
("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)),
("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)),
("whisper", ("WhisperTokenizer" if is_sentencepiece_available() else None, None)),
("whisper", ("WhisperTokenizer", "WhisperTokenizerFast" if is_tokenizers_available() else None)),
("xclip", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
(
"xglm",
Expand Down
16 changes: 16 additions & 0 deletions src/transformers/models/whisper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
_LazyModule,
is_flax_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)

Expand All @@ -29,6 +30,13 @@
"tokenization_whisper": ["WhisperTokenizer"],
}

try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_whisper_fast"] = ["WhisperTokenizerFast"]

try:
if not is_torch_available():
Expand Down Expand Up @@ -75,6 +83,14 @@
from .processing_whisper import WhisperProcessor
from .tokenization_whisper import WhisperTokenizer

try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_whisper_fast import WhisperTokenizerFast

try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
Expand Down
Loading