Add SDPA support for T5 Style Models #30375
Closed
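For context on what the PR aims to enable: transformers selects the attention backend through the attn_implementation argument of from_pretrained, so T5-style SDPA support would be requested roughly as in the sketch below. This is a usage sketch only; the checkpoint and prompt are placeholders, and whether a given transformers release actually accepts "sdpa" for T5-style models depends on whether this support landed.

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Placeholder checkpoint; any T5-style model would do.
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained(
    "google-t5/t5-small",
    attn_implementation="sdpa",  # request the torch.nn.functional.scaled_dot_product_attention backend
)

inputs = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))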
Changes from all commits (14 commits, by abdulfatir):
76d7699  Initial commit
bf3fb94  Use contiguous for attn_mask
1cb1af6  Address comment about duplicated code
ff5142b  Fix
53d2fbb  Fix style
151751f  Fix stride issue
a54cea4  Add link to issue
e7499ac  Fix default attention_mask
43e8035  Fix copies
da98387  Update docs
4628c89  Use clone instead of copy_
8748841  Use higher opset version
0160651  Skip test
44107ac  fix style
@@ -420,6 +420,41 @@ def compute_bias(self, query_length, key_length, device=None):
         values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, query_length, key_length)
         return values

+    def _shape(self, states, batch_size):
+        """projection"""
+        return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
+
+    def _unshape(self, states, batch_size):
+        """reshape"""
+        return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim)
+
+    def _project(self, hidden_states, proj_layer, key_value_states, past_key_value, batch_size):
+        """projects hidden states correctly to key/query states"""
+        if key_value_states is None:
+            # self-attn
+            # (batch_size, n_heads, seq_length, dim_per_head)
+            hidden_states = self._shape(proj_layer(hidden_states), batch_size)
+        elif past_key_value is None:
+            # cross-attn
+            # (batch_size, n_heads, seq_length, dim_per_head)
+            hidden_states = self._shape(proj_layer(key_value_states), batch_size)
+
+        if past_key_value is not None:
+            if key_value_states is None:
+                # self-attn
+                # (batch_size, n_heads, key_length, dim_per_head)
+                hidden_states = torch.cat([past_key_value, hidden_states], dim=2)
+            elif past_key_value.shape[2] != key_value_states.shape[1]:
+                # checking that the `sequence_length` of the `past_key_value` is the same as
+                # the provided `key_value_states` to support prefix tuning
+                # cross-attn
+                # (batch_size, n_heads, seq_length, dim_per_head)
+                hidden_states = self._shape(proj_layer(key_value_states), batch_size)
+            else:
+                # cross-attn
+                hidden_states = past_key_value
+        return hidden_states
+
     def forward(
         self,
         hidden_states,

@@ -451,50 +486,25 @@ def forward(
         key_length = real_seq_length if key_value_states is None else key_value_states.shape[1]

-        def shape(states):
-            """projection"""
-            return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
-
-        def unshape(states):
-            """reshape"""
-            return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim)
-
-        def project(hidden_states, proj_layer, key_value_states, past_key_value):
-            """projects hidden states correctly to key/query states"""
-            if key_value_states is None:
-                # self-attn
-                # (batch_size, n_heads, seq_length, dim_per_head)
-                hidden_states = shape(proj_layer(hidden_states))
-            elif past_key_value is None:
-                # cross-attn
-                # (batch_size, n_heads, seq_length, dim_per_head)
-                hidden_states = shape(proj_layer(key_value_states))
-
-            if past_key_value is not None:
-                if key_value_states is None:
-                    # self-attn
-                    # (batch_size, n_heads, key_length, dim_per_head)
-                    hidden_states = torch.cat([past_key_value, hidden_states], dim=2)
-                elif past_key_value.shape[2] != key_value_states.shape[1]:
-                    # checking that the `sequence_length` of the `past_key_value` is the same as
-                    # the provided `key_value_states` to support prefix tuning
-                    # cross-attn
-                    # (batch_size, n_heads, seq_length, dim_per_head)
-                    hidden_states = shape(proj_layer(key_value_states))
-                else:
-                    # cross-attn
-                    hidden_states = past_key_value
-            return hidden_states
-
         # get query states
-        query_states = shape(self.q(hidden_states))  # (batch_size, n_heads, seq_length, dim_per_head)
+        query_states = self._shape(
+            self.q(hidden_states), batch_size
+        )  # (batch_size, n_heads, seq_length, dim_per_head)

         # get key/value states
-        key_states = project(
-            hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None
+        key_states = self._project(
+            hidden_states,
+            self.k,
+            key_value_states,
+            past_key_value[0] if past_key_value is not None else None,
+            batch_size,
         )
-        value_states = project(
-            hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None
+        value_states = self._project(
+            hidden_states,
+            self.v,
+            key_value_states,
+            past_key_value[1] if past_key_value is not None else None,
+            batch_size,
         )

         # compute scores

@@ -539,7 +549,9 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value):
         if layer_head_mask is not None:
             attn_weights = attn_weights * layer_head_mask

-        attn_output = unshape(torch.matmul(attn_weights, value_states))  # (batch_size, seq_length, dim)
+        attn_output = self._unshape(
+            torch.matmul(attn_weights, value_states), batch_size
+        )  # (batch_size, seq_length, dim)
Contributor comment on lines +552 to +554 (with a suggested change): Library convention is for comments to go on the line above to avoid line splitting.
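The rendered suggestion body did not survive in this view. A plausible form of the change, following that convention (illustrative, not necessarily the reviewer's exact suggestion), would replace lines +552 to +554 with:

        # (batch_size, seq_length, dim)
        attn_output = self._unshape(torch.matmul(attn_weights, value_states), batch_size)

so the trailing shape comment no longer forces the call to wrap across three lines.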
         attn_output = self.o(attn_output)

         present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None

@@ -1007,6 +1019,7 @@ def unshape(states):

 # Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->LongT5
 class LongT5LayerSelfAttention(nn.Module):
Contributor comment on class LongT5LayerSelfAttention: What's the reason for not adding support for LongT5?
+    # Ignore copy
     def __init__(self, config, has_relative_attention_bias=False):
         super().__init__()
         self.SelfAttention = LongT5Attention(config, has_relative_attention_bias=has_relative_attention_bias)

@@ -1104,6 +1117,7 @@ def forward(

 # Copied from transformers.models.t5.modeling_t5.T5LayerCrossAttention with T5->LongT5
 class LongT5LayerCrossAttention(nn.Module):
+    # Ignore copy
     def __init__(self, config):
         super().__init__()
         self.EncDecAttention = LongT5Attention(config, has_relative_attention_bias=False)
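Note that the hunks above only show the shared refactor (hoisting the local shape/unshape/project closures into the _shape/_unshape/_project methods) plus the # Ignore copy markers for LongT5; the SDPA attention class itself is not visible in this view. As a rough sketch of the idea rather than the PR's actual implementation: T5-style attention computes unscaled QK^T scores and adds a learned relative position bias (with any padding or causal mask already folded into that bias), which maps onto torch.nn.functional.scaled_dot_product_attention by passing the bias as attn_mask and disabling the default 1/sqrt(head_dim) scaling. The helper name and shapes below are assumptions for illustration.

import torch
import torch.nn.functional as F


def t5_style_sdpa(query_states, key_states, value_states, position_bias, dropout_p=0.0):
    # query/key/value states: (batch_size, n_heads, seq_length, head_dim),
    # e.g. as produced by _shape/_project above
    # position_bias: (1 or batch_size, n_heads, q_length, k_length) relative-position bias,
    # with any attention mask already added into it, as T5 does
    # T5 does not scale the scores by 1/sqrt(head_dim), hence scale=1.0
    # (the scale keyword requires PyTorch >= 2.1)
    return F.scaled_dot_product_attention(
        query_states,
        key_states,
        value_states,
        attn_mask=position_bias,
        dropout_p=dropout_p,
        scale=1.0,
    )


# Smoke test with random tensors (shapes are illustrative)
q = torch.randn(2, 8, 5, 64)
k = torch.randn(2, 8, 7, 64)
v = torch.randn(2, 8, 7, 64)
bias = torch.zeros(1, 8, 5, 7)
print(t5_style_sdpa(q, k, v, bias).shape)  # torch.Size([2, 8, 5, 64])

The "Use contiguous for attn_mask" and "Fix stride issue" commits in the list above suggest the bias/mask tensor handed to SDPA needed a specific memory layout, which is consistent with passing the position bias as attn_mask in this way.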