Commit
Merge pull request #15 from Sanster/main
Add QWen model + benchmark results
Showing 12 changed files with 41,100 additions and 8 deletions.
@@ -46,3 +46,4 @@
     MptModel,
     MptPreTrainedModel,
 )
+from .qwen import qwen_pos_shift_attention_forward
@@ -0,0 +1 @@
+from .pos_shift import qwen_pos_shift_attention_forward
@@ -0,0 +1,108 @@
from typing import List, Optional, Tuple

import torch

__all__ = ["qwen_pos_shift_attention_forward"]


def _rotate_half(x):
    from einops import rearrange

    x = rearrange(x, "... (j d) -> ... j d", j=2)
    x1, x2 = x.unbind(dim=-2)
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(t, freqs):
    cos, sin = freqs
    rot_dim = freqs[0].shape[-1]
    t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:]
    t_ = t_.float()
    t_pass_ = t_pass_.float()
    t_ = (t_ * cos) + (_rotate_half(t_) * sin)
    return torch.cat((t_, t_pass_), dim=-1).type_as(t)


def qwen_pos_shift_attention_forward(
    self,
    hidden_states: Optional[Tuple[torch.FloatTensor]],
    rotary_pos_emb_list: Optional[List[torch.Tensor]] = None,
    registered_causal_mask: Optional[torch.Tensor] = None,
    layer_past: Optional[Tuple[torch.Tensor]] = None,
    attention_mask: Optional[torch.FloatTensor] = None,
    head_mask: Optional[torch.FloatTensor] = None,
    encoder_hidden_states: Optional[torch.Tensor] = None,
    encoder_attention_mask: Optional[torch.FloatTensor] = None,
    output_attentions: Optional[bool] = False,
    use_cache: Optional[bool] = False,
):
    mixed_x_layer = self.c_attn(hidden_states)

    query, key, value = mixed_x_layer.split(self.split_size, dim=2)

    query = self._split_heads(query, self.num_heads, self.head_dim)
    key = self._split_heads(key, self.num_heads, self.head_dim)
    value = self._split_heads(value, self.num_heads, self.head_dim)

    if rotary_pos_emb_list is not None:
        cur_len = query.shape[1]
        if len(rotary_pos_emb_list) == 1:
            rotary_pos_emb = rotary_pos_emb_list[0]
            rotary_pos_emb = [i[:, -cur_len:, :, :] for i in rotary_pos_emb]
            rotary_pos_emb = (rotary_pos_emb,) * 2
            q_pos_emb, k_pos_emb = rotary_pos_emb
            # Slice the pos emb for current inference
            query = apply_rotary_pos_emb(query, q_pos_emb)
            # key = apply_rotary_pos_emb(key, k_pos_emb)
        else:
            # TODO: modify batch infer
            query_list = []
            key_list = []
            for i, rotary_pos_emb in enumerate(rotary_pos_emb_list):
                rotary_pos_emb = [i[:, -cur_len:, :, :] for i in rotary_pos_emb]
                rotary_pos_emb = (rotary_pos_emb,) * 2
                q_pos_emb, k_pos_emb = rotary_pos_emb
                # Slice the pos emb for current inference
                query_list += [apply_rotary_pos_emb(query[i : i + 1, :, :], q_pos_emb)]
                key_list += [apply_rotary_pos_emb(key[i : i + 1, :, :], k_pos_emb)]
            query = torch.cat(query_list, dim=0)
            key = torch.cat(key_list, dim=0)

    if layer_past is not None:
        past_key, past_value = layer_past[0], layer_past[1]
        key = torch.cat((past_key, key), dim=1)
        value = torch.cat((past_value, value), dim=1)

    if use_cache:
        present = (key, value)
    else:
        present = None

    ### Shift pos ###
    # Re-apply RoPE to the full key cache using cache-relative positions
    # (0 .. kv_seq_len - 1) rather than absolute positions.
    kv_seq_len = key.size(1)
    key_shifted_position_ids = torch.arange(kv_seq_len, dtype=torch.long, device=key.device)
    key_rotary_pos_emb = [it[:, key_shifted_position_ids, :, :] for it in rotary_pos_emb_list[0]]
    key = apply_rotary_pos_emb(key, key_rotary_pos_emb)
    #######

    if self.use_logn_attn and not self.training:
        # QWen's LogN attention scaling for sequences beyond the training length
        seq_start = key.size(1) - query.size(1)
        seq_end = key.size(1)
        logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :]
        query = query * logn_tensor.expand_as(query)

    query = query.permute(0, 2, 1, 3)
    key = key.permute(0, 2, 1, 3)
    value = value.permute(0, 2, 1, 3)
    attn_output, attn_weight = self._attn(
        query, key, value, registered_causal_mask, attention_mask=None, head_mask=head_mask
    )
    context_layer = self._merge_heads(attn_output, self.num_heads, self.head_dim)

    attn_output = self.c_proj(context_layer)

    outputs = (attn_output, present)
    if output_attentions:
        outputs += (attn_weight,)

    return outputs
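For context, below is a minimal sketch of how a forward function like this is typically attached to a QWen model via monkey-patching. It is illustrative only and not part of this commit: the checkpoint name, the `transformer.h[*].attn` module path (taken from the upstream remote-code modeling_qwen.py), and having `qwen_pos_shift_attention_forward` from the file above in scope are all assumptions.

# Hypothetical usage sketch (not part of this commit).
# Assumes qwen_pos_shift_attention_forward from the pos_shift module above is in scope.
import types

import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen-7B",              # assumed remote-code checkpoint
    trust_remote_code=True,
    torch_dtype=torch.float16,
)

# Per the upstream modeling_qwen.py, model.transformer.h is the list of QWenBlock
# modules and each block keeps its attention in `.attn`; this layout may vary
# between checkpoint revisions.
for block in model.transformer.h:
    block.attn.forward = types.MethodType(qwen_pos_shift_attention_forward, block.attn)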