Skip to content

Commit

Permalink
Fix ByteLevel pretokenizer
Browse files Browse the repository at this point in the history
* Re-enable other whisper tests

* Fix `ByteLevel` pretokenizer

Only add a prefix space to the first word, and only when the option is enabled.
  • Loading branch information
xenova authored Sep 9, 2023
1 parent ad7e875 commit 5216fb4
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 18 deletions.
13 changes: 5 additions & 8 deletions scripts/supported_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,14 +439,11 @@
'openai/whisper-medium.en',
'openai/whisper-large',
'openai/whisper-large-v2',

# TODO: add these models
# https://github.com/huggingface/transformers/issues/26043
# 'NbAiLab/nb-whisper-tiny-beta',
# 'NbAiLab/nb-whisper-base-beta',
# 'NbAiLab/nb-whisper-small-beta',
# 'NbAiLab/nb-whisper-medium-beta',
# 'NbAiLab/nb-whisper-large-beta',
'NbAiLab/nb-whisper-tiny-beta',
'NbAiLab/nb-whisper-base-beta',
'NbAiLab/nb-whisper-small-beta',
'NbAiLab/nb-whisper-medium-beta',
'NbAiLab/nb-whisper-large-beta',
],
'xlm': [
'xlm-clm-ende-1024',
Expand Down
19 changes: 9 additions & 10 deletions src/tokenizers.js
Original file line number Diff line number Diff line change
Expand Up @@ -1229,19 +1229,18 @@ class ByteLevelPreTokenizer extends PreTokenizer {
* @returns {string[]} An array of tokens.
*/
pre_tokenize_text(text) {
// Add a leading space if the option is enabled
if (this.add_prefix_space && !text.startsWith(' ')) {
text = ' ' + text;
}

// Split on whitespace and punctuation
let tokens = this.use_regex ? (text.match(this.pattern) || []) : [text];

return tokens.map(token => {
if (this.add_prefix_space && !token.startsWith(' ')) {
token = ' ' + token;
}

// Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
token = Array.from(this.text_encoder.encode(token), byte => this.byte_encoder[byte]).join('');

return token;
});
// Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
return tokens.map(
token => Array.from(this.text_encoder.encode(token), byte => this.byte_encoder[byte]).join('')
);
}
}

Expand Down
6 changes: 6 additions & 0 deletions tests/generate_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,12 @@ def generate_tokenizer_tests():
# means the model does not use a tokenizer (e.g., vision models)
continue

try:
# Disable dropout, if the model allows it
tokenizer.backend_tokenizer.model.dropout = 0
except AttributeError:
pass

tokenizer_results = []

shared_texts = TOKENIZER_TEST_DATA["shared"]
Expand Down

0 comments on commit 5216fb4

Please sign in to comment.