Preprocess hf #10
Changes from all commits
```diff
@@ -447,7 +447,7 @@ void dispatch_scaled_softmax_forward(
     int batches,
     int attn_heads)
 {
-    TORCH_INTERNAL_ASSERT(key_seq_len >= 0 && key_seq_len <= 4096 );
+    TORCH_INTERNAL_ASSERT(key_seq_len >= 0 && key_seq_len <= 8192 );
     if (key_seq_len == 0) {
         return;
     } else {
```
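The cases in the dispatch switches below line up with powers of two (`case 12` is the last case under the old 4096 bound, and 2^13 = 8192), which is why the assert bound and the new `case 13` change together. As a quick sanity check of that mapping, here is a minimal host-side C++ sketch; `log2_ceil` is a hypothetical stand-in, since the actual switch expression is not visible in these hunks.

```cpp
// Hypothetical helper mirroring a ceil(log2(key_seq_len)) dispatch value.
// The real expression used by dispatch_scaled_softmax_forward is not shown
// in the diff above, so treat this as an assumption, not the PR's code.
#include <cassert>
#include <cstdio>

static int log2_ceil(int value) {
    int log2_value = 0;
    while ((1 << log2_value) < value) ++log2_value;
    return log2_value;
}

int main() {
    assert(log2_ceil(4096) == 12);  // largest case before this PR
    assert(log2_ceil(8192) == 13);  // the case added by this PR
    assert(log2_ceil(4097) == 13);  // anything in (4096, 8192] also selects case 13
    std::printf("4096 -> case %d, 8192 -> case %d\n",
                log2_ceil(4096), log2_ceil(8192));
    return 0;
}
```

If the dispatch does work this way, raising the assert without adding `case 13` would let sequence lengths in (4096, 8192] fall through to `default: break;` and silently do nothing, so the two edits belong together.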
```diff
@@ -523,6 +523,10 @@ void dispatch_scaled_softmax_forward(
             scaled_softmax_warp_forward<input_t, output_t, acc_t, 12>
                 <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
             break;
+        case 13: // 8192
+            scaled_softmax_warp_forward<input_t, output_t, acc_t, 13>
+                <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
+            break;
         default:
             break;
     }
```

Review comment on `case 13: // 8192`:

Collaborator: Interesting. Did you double-check that it works as intended? Things can get tricky when kernels grow too big (registers, shared memory, etc.). Don't know if it's relevant here.

Collaborator (Author): I admit that I did not double-check this. I relied on the code from: https://github.com/NVIDIA/Megatron-LM/pull/243/files
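The reviewer's concern about register and shared-memory pressure can be checked empirically with `cudaFuncGetAttributes`. The sketch below is illustrative only: `toy_warp_forward` is a hypothetical stand-in for the PR's `scaled_softmax_warp_forward`, and the per-thread buffer is just an assumption about how such per-warp kernels grow with `log2_elements`.

```cuda
// Illustrative sketch: query the compiled resource footprint of a templated
// kernel instantiation before trusting a new, larger case such as
// log2_elements = 13 (8192 elements). toy_warp_forward is NOT the PR's kernel.
#include <cstdio>
#include <cuda_runtime.h>

template <int log2_elements>
__global__ void toy_warp_forward(float* dst, const float* src, float scale, int n) {
    constexpr int elements = 1 << log2_elements;  // 8192 when log2_elements == 13
    float local[elements / 32];                   // hypothetical per-thread slice of one row
    const int lane = threadIdx.x & 31;
    for (int i = 0; i < elements / 32; ++i) {
        const int idx = lane + 32 * i;
        local[i] = (idx < n) ? src[idx] * scale : 0.f;
    }
    float sum = 0.f;
    for (int i = 0; i < elements / 32; ++i) sum += local[i];
    if (lane == 0 && n > 0) dst[0] = sum;         // placeholder write so the work is not elided
}

int main() {
    cudaFuncAttributes attr;
    cudaError_t err = cudaFuncGetAttributes(
        &attr, reinterpret_cast<const void*>(toy_warp_forward<13>));
    if (err != cudaSuccess) {
        std::printf("query failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    std::printf("regs/thread: %d, static smem: %zu B, local mem: %zu B, max threads/block: %d\n",
                attr.numRegs, attr.sharedSizeBytes, attr.localSizeBytes, attr.maxThreadsPerBlock);
    return 0;
}
```

Pointing the same query at the real `scaled_softmax_warp_forward<input_t, output_t, acc_t, 13>` instantiation inside the extension, or simply running the fused path with sequence lengths between 4097 and 8192 and comparing against a plain softmax, would answer the reviewer's question directly.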
```diff
@@ -541,7 +545,7 @@ void dispatch_scaled_masked_softmax_forward(
     int attn_heads,
     int pad_batches)
 {
-    TORCH_INTERNAL_ASSERT(key_seq_len >= 0 && key_seq_len <= 4096 );
+    TORCH_INTERNAL_ASSERT(key_seq_len >= 0 && key_seq_len <= 8192 );
     if (key_seq_len == 0) {
         return;
     } else {
```
```diff
@@ -617,6 +621,10 @@ void dispatch_scaled_masked_softmax_forward(
             scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 12>
                 <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches);
             break;
+        case 13: // 8192
+            scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 13>
+                <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches);
+            break;
         default:
             break;
     }
```
```diff
@@ -634,7 +642,7 @@ void dispatch_scaled_masked_softmax_backward(
     int batches,
     int attn_heads)
 {
-    TORCH_INTERNAL_ASSERT( key_seq_len >= 0 && key_seq_len <= 4096 );
+    TORCH_INTERNAL_ASSERT( key_seq_len >= 0 && key_seq_len <= 8192 );
     if (key_seq_len == 0) {
         return;
     } else {
```
```diff
@@ -709,6 +717,10 @@ void dispatch_scaled_masked_softmax_backward(
             scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 12>
                 <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, key_seq_len);
             break;
+        case 13: // 8192
+            scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 13>
+                <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, key_seq_len);
+            break;
         default:
             break;
     }
```
Review comment:

Why?

Reply: The local `GPT2Tokenizer` implementation has this `special_tokens` attribute, but the `PreTrainedTokenizerFast` from the transformers library does not. So the code here instead relies on the wrappers around these tokenizers:

Megatron-LM/megatron/tokenizer/tokenizer.py, line 330 in 7457e32
Megatron-LM/megatron/tokenizer/tokenizer.py, line 293 in 7457e32