forked from rasbt/LLMs-from-scratch
Showing 19 changed files with 1,131 additions and 92 deletions.
@@ -0,0 +1,274 @@
# This file collects all the relevant code that we covered thus far
# throughout Chapters 2-4
# This file can be run as a standalone script

import tiktoken
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

#####################################
# Chapter 2
#####################################


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt)

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader(txt, batch_size=4, max_length=256, stride=128, shuffle=True):
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    return dataloader
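

# The helper below is not part of the original chapter code; it is a minimal usage
# sketch (never executed on import) showing the shapes create_dataloader produces.
# The sample string and the helper name are arbitrary placeholders.
def _demo_dataloader():
    sample_text = "In the heart of the city stood the old library, a relic from a bygone era. " * 20
    loader = create_dataloader(sample_text, batch_size=2, max_length=8, stride=4, shuffle=False)
    inputs, targets = next(iter(loader))
    # Each batch holds token IDs; targets are the inputs shifted one position to the right
    print(inputs.shape, targets.shape)  # expected: torch.Size([2, 8]) torch.Size([2, 8])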


#####################################
# Chapter 3
#####################################
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, block_size, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('mask', torch.triu(torch.ones(block_size, block_size), diagonal=1))

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x)  # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head
        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        # Unsqueeze the mask twice to match dimensions
        mask_unsqueezed = mask_bool.unsqueeze(0).unsqueeze(0)
        # Use the unsqueezed mask to fill attention scores
        attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)  # optional projection

        return context_vec
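

# Illustrative shape check (added for this write-up, not original chapter code):
# the module maps (b, num_tokens, d_in) to (b, num_tokens, d_out) while applying
# the causal mask per head. All numbers below are arbitrary.
def _demo_multihead_attention():
    torch.manual_seed(123)
    batch, num_tokens, d_in, d_out = 2, 6, 12, 12
    mha = MultiHeadAttention(d_in, d_out, block_size=8, dropout=0.0, num_heads=3)
    x = torch.randn(batch, num_tokens, d_in)
    context = mha(x)
    print(context.shape)  # expected: torch.Size([2, 6, 12])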


#####################################
# Chapter 4
#####################################
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift
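

# Quick sanity check (illustrative only, not from the chapter): with freshly
# initialized scale/shift, each normalized row should have roughly zero mean
# and unit variance.
def _demo_layernorm():
    x = torch.randn(2, 5)
    ln = LayerNorm(emb_dim=5)
    out = ln(x)
    print(out.mean(dim=-1))                 # expected: values close to 0
    print(out.var(dim=-1, unbiased=False))  # expected: values close to 1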


class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) *
            (x + 0.044715 * torch.pow(x, 3))
        ))
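

# For reference only (not part of the chapter code): recent PyTorch versions offer
# the same tanh approximation as nn.GELU(approximate="tanh"); the class above spells
# the formula out for clarity. The comparison below assumes such a PyTorch version.
def _demo_gelu():
    x = torch.linspace(-3, 3, 7)
    ours = GELU()(x)
    builtin = nn.GELU(approximate="tanh")(x)
    print(torch.allclose(ours, builtin, atol=1e-6))  # expected: True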


class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
            GELU(),
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
            nn.Dropout(cfg["drop_rate"])
        )

    def forward(self, x):
        return self.layers(x)


class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            block_size=cfg["ctx_len"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_resid = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.drop_resid(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed-forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_resid(x)
        x = x + shortcut  # Add the original input back

        return x
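

# Illustrative shape check (added for this write-up): a TransformerBlock preserves
# the (batch, num_tokens, emb_dim) shape, which is what allows the blocks to be
# stacked with nn.Sequential in GPTModel below. The small config is arbitrary.
def _demo_transformer_block():
    cfg = {"emb_dim": 12, "ctx_len": 8, "n_heads": 3, "drop_rate": 0.0, "qkv_bias": False}
    block = TransformerBlock(cfg)
    x = torch.randn(2, 6, 12)
    print(block(x).shape)  # expected: torch.Size([2, 6, 12])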


class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["ctx_len"], cfg["emb_dim"])

        # Stack of transformer blocks
        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        # Final layer normalization before the output head
        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits
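

# Illustrative helper (added for this write-up): counts trainable parameters.
# Note that this implementation keeps tok_emb and out_head as separate weights
# (no weight tying), so the total comes out higher than the nominal "124M".
def _count_parameters(model):
    return sum(p.numel() for p in model.parameters())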


def generate_text_simple(model, idx, max_new_tokens, context_size):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):

        # Crop current context if it exceeds the supported context size
        # E.g., if LLM supports only 5 tokens, and the context size is 10
        # then only the last 5 tokens are used as context
        idx_cond = idx[:, -context_size:]

        # Get the predictions
        with torch.no_grad():
            logits = model(idx_cond)

        # Focus only on the last time step
        # (batch, n_token, vocab_size) becomes (batch, vocab_size)
        logits = logits[:, -1, :]

        # Get the idx of the vocab entry with the highest logits value
        idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch, 1)

        # Append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch, n_tokens+1)

    return idx


if __name__ == "__main__":

    GPT_CONFIG_124M = {
        "vocab_size": 50257,  # Vocabulary size
        "ctx_len": 1024,      # Context length
        "emb_dim": 768,       # Embedding dimension
        "n_heads": 12,        # Number of attention heads
        "n_layers": 12,       # Number of layers
        "drop_rate": 0.1,     # Dropout rate
        "qkv_bias": False     # Query-Key-Value bias
    }

    torch.manual_seed(123)
    model = GPTModel(GPT_CONFIG_124M)
    model.eval()  # disable dropout

    start_context = "Hello, I am"

    tokenizer = tiktoken.get_encoding("gpt2")
    encoded = tokenizer.encode(start_context)
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)

    print(f"\n{50*'='}\n{22*' '}IN\n{50*'='}")
    print("\nInput text:", start_context)
    print("Encoded input text:", encoded)
    print("encoded_tensor.shape:", encoded_tensor.shape)

    out = generate_text_simple(
        model=model,
        idx=encoded_tensor,
        max_new_tokens=10,
        context_size=GPT_CONFIG_124M["ctx_len"]
    )
    decoded_text = tokenizer.decode(out.squeeze(0).tolist())

    print(f"\n\n{50*'='}\n{22*' '}OUT\n{50*'='}")
    print("\nOutput:", out)
    print("Output length:", len(out[0]))
    print("Output text:", decoded_text)