Make datasets and loaders compatible with multiprocessing (rasbt#118)
rasbt authored Apr 13, 2024
1 parent 9f3f231 commit dd51d4a
Showing 17 changed files with 140 additions and 116 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -14,7 +14,8 @@ ch05/01_main-chapter-code/model.pth
ch05/01_main-chapter-code/model_and_optimizer.pth
ch05/03_bonus_pretraining_on_gutenberg/model_checkpoints

# Preprocessing output folders
# Datasets
ch05/03_bonus_pretraining_on_gutenberg/gutenberg
ch05/03_bonus_pretraining_on_gutenberg/gutenberg_preprocessed

# Temporary OS-related files
22 changes: 13 additions & 9 deletions appendix-D/01_main-chapter-code/appendix-D.ipynb
@@ -47,7 +47,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"torch version: 2.2.1\n"
"torch version: 2.2.2\n"
]
}
],
@@ -130,7 +130,8 @@
" max_length=GPT_CONFIG_124M[\"context_length\"],\n",
" stride=GPT_CONFIG_124M[\"context_length\"],\n",
" drop_last=True,\n",
" shuffle=True\n",
" shuffle=True,\n",
" num_workers=0\n",
")\n",
"\n",
"val_loader = create_dataloader_v1(\n",
@@ -139,7 +140,8 @@
" max_length=GPT_CONFIG_124M[\"context_length\"],\n",
" stride=GPT_CONFIG_124M[\"context_length\"],\n",
" drop_last=False,\n",
" shuffle=False\n",
" shuffle=False,\n",
" num_workers=0\n",
")"
]
},
@@ -500,7 +502,7 @@
"\n",
"\n",
"def train_model(model, train_loader, val_loader, optimizer, device, n_epochs,\n",
" eval_freq, eval_iter, start_context, warmup_steps=10,\n",
" eval_freq, eval_iter, start_context, tokenizer, warmup_steps=10,\n",
" initial_lr=3e-05, min_lr=1e-6):\n",
"\n",
" train_losses, val_losses, track_tokens_seen, track_lrs = [], [], [], []\n",
@@ -562,8 +564,7 @@
"\n",
" # Generate and print a sample from the model to monitor progress\n",
" generate_and_print_sample(\n",
" model, train_loader.dataset.tokenizer,\n",
" device, start_context\n",
" model, tokenizer, device, start_context\n",
" )\n",
"\n",
" return train_losses, val_losses, track_tokens_seen, track_lrs"
@@ -625,18 +626,21 @@
}
],
"source": [
"import tiktoken\n",
"\n",
"torch.manual_seed(123)\n",
"model = GPTModel(GPT_CONFIG_124M)\n",
"model.to(device)\n",
"\n",
"peak_lr = 5e-4\n",
"optimizer = torch.optim.AdamW(model.parameters(), weight_decay=0.1)\n",
"tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
"\n",
"n_epochs = 15\n",
"train_losses, val_losses, tokens_seen, lrs = train_model(\n",
" model, train_loader, val_loader, optimizer, device, n_epochs=n_epochs,\n",
" eval_freq=5, eval_iter=1, start_context=\"Every effort moves you\",\n",
" warmup_steps=10, initial_lr=1e-5, min_lr=1e-5\n",
" tokenizer=tokenizer, warmup_steps=10, initial_lr=1e-5, min_lr=1e-5\n",
")"
]
},
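Note on the new tokenizer argument: GPTDatasetV1 no longer stores the tokenizer (see previous_chapters.py below), so the old lookup via the loader's dataset would fail. A minimal illustration, reusing the variable names from the cells above:

    # Illustrative only: the old access pattern no longer works, because the
    # dataset does not keep a tokenizer attribute anymore.
    tokenizer = train_loader.dataset.tokenizer   # AttributeError after this change

    # Instead, the tokenizer is created once in the notebook and passed to
    # train_model, which forwards it to generate_and_print_sample.
    import tiktoken
    tokenizer = tiktoken.get_encoding("gpt2")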
@@ -705,7 +709,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/jg/tpqyh1fd5js5wsr1d138k3n40000gn/T/ipykernel_34986/3589549395.py:5: UserWarning: The figure layout has changed to tight\n",
"/var/folders/jg/tpqyh1fd5js5wsr1d138k3n40000gn/T/ipykernel_9436/3589549395.py:5: UserWarning: The figure layout has changed to tight\n",
" plt.tight_layout(); plt.savefig(\"3.pdf\")\n"
]
},
@@ -755,7 +759,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
"version": "3.11.4"
}
},
"nbformat": 4,
7 changes: 3 additions & 4 deletions appendix-D/01_main-chapter-code/previous_chapters.py
@@ -20,12 +20,11 @@

class GPTDatasetV1(Dataset):
def __init__(self, txt, tokenizer, max_length, stride):
self.tokenizer = tokenizer
self.input_ids = []
self.target_ids = []

# Tokenize the entire text
token_ids = self.tokenizer.encode(txt)
token_ids = tokenizer.encode(txt)

# Use a sliding window to chunk the book into overlapping sequences of max_length
for i in range(0, len(token_ids) - max_length, stride):
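With num_workers > 0, PyTorch's DataLoader pickles the dataset and hands a copy to each worker process; dropping the tokenizer attribute reduces GPTDatasetV1 to plain lists of tensors, which copy across processes without issue. A small sanity check, purely illustrative (sample_text is a made-up string, not from the repo):

    import tiktoken

    tokenizer = tiktoken.get_encoding("gpt2")
    sample_text = "In the beginning was the token"   # hypothetical example text
    ds = GPTDatasetV1(sample_text, tokenizer, max_length=4, stride=4)

    x, y = ds[0]
    print(x.shape, y.shape)            # torch.Size([4]) torch.Size([4])
    print(hasattr(ds, "tokenizer"))    # False -- the tokenizer is no longer stored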
@@ -42,7 +41,7 @@ def __getitem__(self, idx):


def create_dataloader_v1(txt, batch_size=4, max_length=256,
stride=128, shuffle=True, drop_last=True):
stride=128, shuffle=True, drop_last=True, num_workers=0):
# Initialize the tokenizer
tokenizer = tiktoken.get_encoding("gpt2")

@@ -51,7 +50,7 @@ def create_dataloader_v1(txt, batch_size=4, max_length=256,

# Create dataloader
dataloader = DataLoader(
dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)
dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=0)

return dataloader

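Note: the DataLoader call above still hardcodes num_workers=0, so the new num_workers parameter of create_dataloader_v1 is accepted but has no effect yet. A hedged sketch (not part of this commit, assuming GPTDatasetV1 as defined above) of forwarding the argument so worker processes can actually be enabled:

    import tiktoken
    from torch.utils.data import DataLoader

    def create_dataloader_v1(txt, batch_size=4, max_length=256,
                             stride=128, shuffle=True, drop_last=True, num_workers=0):
        tokenizer = tiktoken.get_encoding("gpt2")   # tokenization happens once, up front
        dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            drop_last=drop_last,
            num_workers=num_workers,                # forwarded instead of pinned to 0
        )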
64 changes: 34 additions & 30 deletions ch02/01_main-chapter-code/ch02.ipynb
@@ -37,7 +37,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"torch version: 2.2.1\n",
"torch version: 2.2.2\n",
"tiktoken version: 0.5.1\n"
]
}
@@ -724,7 +724,7 @@
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[16], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m SimpleTokenizerV1(vocab)\n\u001b[1;32m 3\u001b[0m text \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mHello, do you like tea. Is this-- a test?\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 5\u001b[0m \u001b[43mtokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m\n",
"Cell \u001b[0;32mIn[12], line 9\u001b[0m, in \u001b[0;36mSimpleTokenizerV1.encode\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m 8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()]\n\u001b[0;32m----> 9\u001b[0m ids \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstr_to_int[s] \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m preprocessed]\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n",
"Cell \u001b[0;32mIn[12], line 9\u001b[0m, in \u001b[0;36mSimpleTokenizerV1.encode\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m 8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()]\n\u001b[0;32m----> 9\u001b[0m ids \u001b[38;5;241m=\u001b[39m \u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstr_to_int\u001b[49m\u001b[43m[\u001b[49m\u001b[43ms\u001b[49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43ms\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpreprocessed\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n",
"Cell \u001b[0;32mIn[12], line 9\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m 8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()]\n\u001b[0;32m----> 9\u001b[0m ids \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstr_to_int\u001b[49m\u001b[43m[\u001b[49m\u001b[43ms\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m preprocessed]\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n",
"\u001b[0;31mKeyError\u001b[0m: 'Hello'"
]
@@ -957,7 +957,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 24,
"id": "ede1d41f-934b-4bf4-8184-54394a257a94",
"metadata": {},
"outputs": [],
@@ -967,7 +967,7 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 25,
"id": "48967a77-7d17-42bf-9e92-fc619d63a59e",
"metadata": {},
"outputs": [
@@ -988,7 +988,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 26,
"id": "6ad3312f-a5f7-4efc-9d7d-8ea09d7b5128",
"metadata": {},
"outputs": [],
@@ -998,7 +998,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 27,
"id": "5ff2cd85-7cfb-4325-b390-219938589428",
"metadata": {},
"outputs": [
@@ -1020,7 +1020,7 @@
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 28,
"id": "d26a48bb-f82e-41a8-a955-a1c9cf9d50ab",
"metadata": {},
"outputs": [
@@ -1080,7 +1080,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 29,
"id": "848d5ade-fd1f-46c3-9e31-1426e315c71b",
"metadata": {},
"outputs": [
@@ -1111,7 +1111,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 30,
"id": "e84424a7-646d-45b6-99e3-80d15fb761f2",
"metadata": {},
"outputs": [],
@@ -1121,7 +1121,7 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": 31,
"id": "dfbff852-a92f-48c8-a46d-143a0f109f40",
"metadata": {},
"outputs": [
@@ -1154,7 +1154,7 @@
},
{
"cell_type": "code",
"execution_count": 36,
"execution_count": 32,
"id": "d97b031e-ed55-409d-95f2-aeb38c6fe366",
"metadata": {},
"outputs": [
@@ -1179,7 +1179,7 @@
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": 33,
"id": "f57bd746-dcbf-4433-8e24-ee213a8c34a1",
"metadata": {},
"outputs": [
@@ -1221,15 +1221,15 @@
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": 34,
"id": "e1770134-e7f3-4725-a679-e04c3be48cac",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PyTorch version: 2.1.0\n"
"PyTorch version: 2.2.2\n"
]
}
],
@@ -1258,7 +1258,7 @@
},
{
"cell_type": "code",
"execution_count": 39,
"execution_count": 35,
"id": "74b41073-4c9f-46e2-a1bd-d38e4122b375",
"metadata": {},
"outputs": [],
@@ -1268,12 +1268,11 @@
"\n",
"class GPTDatasetV1(Dataset):\n",
" def __init__(self, txt, tokenizer, max_length, stride):\n",
" self.tokenizer = tokenizer\n",
" self.input_ids = []\n",
" self.target_ids = []\n",
"\n",
" # Tokenize the entire text\n",
" token_ids = self.tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
" token_ids = tokenizer.encode(txt, allowed_special={\"<|endoftext|>\"})\n",
"\n",
" # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
" for i in range(0, len(token_ids) - max_length, stride):\n",
@@ -1291,12 +1290,12 @@
},
{
"cell_type": "code",
"execution_count": 40,
"execution_count": 36,
"id": "5eb30ebe-97b3-43c5-9ff1-a97d621b3c4e",
"metadata": {},
"outputs": [],
"source": [
"def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True):\n",
"def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):\n",
"\n",
" # Initialize the tokenizer\n",
" tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
@@ -1306,7 +1305,12 @@
"\n",
" # Create dataloader\n",
" dataloader = DataLoader(\n",
" dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)\n",
" dataset,\n",
" batch_size=batch_size,\n",
" shuffle=shuffle,\n",
" drop_last=drop_last,\n",
" num_workers=0\n",
" )\n",
"\n",
" return dataloader"
]
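For context, a hedged usage sketch: once the dataset no longer carries the tokenizer it can be pickled to DataLoader worker processes, so a caller could request num_workers > 0 (assuming the argument is forwarded to the DataLoader rather than pinned to 0 as above). On platforms that use the spawn start method (Windows, macOS), loader creation and iteration belong under a main guard. The sketch assumes raw_text holds the chapter's training text, as in the earlier cells:

    # Hedged usage sketch -- assumes num_workers is forwarded to the DataLoader.
    if __name__ == "__main__":
        dataloader = create_dataloader_v1(
            raw_text,
            batch_size=8,
            max_length=4,
            stride=4,
            shuffle=False,
            num_workers=2,     # two worker processes prefetch batches in parallel
        )
        inputs, targets = next(iter(dataloader))
        print(inputs.shape)    # torch.Size([8, 4])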
@@ -1321,7 +1325,7 @@
},
{
"cell_type": "code",
"execution_count": 41,
"execution_count": 37,
"id": "df31d96c-6bfd-4564-a956-6192242d7579",
"metadata": {},
"outputs": [],
@@ -1332,7 +1336,7 @@
},
{
"cell_type": "code",
"execution_count": 42,
"execution_count": 38,
"id": "9226d00c-ad9a-4949-a6e4-9afccfc7214f",
"metadata": {},
"outputs": [
@@ -1354,7 +1358,7 @@
},
{
"cell_type": "code",
"execution_count": 43,
"execution_count": 39,
"id": "10deb4bc-4de1-4d20-921e-4b1c7a0e1a6d",
"metadata": {},
"outputs": [
@@ -1398,7 +1402,7 @@
},
{
"cell_type": "code",
"execution_count": 44,
"execution_count": 40,
"id": "1916e7a6-f03d-4f09-91a6-d0bdbac5a58c",
"metadata": {},
"outputs": [
@@ -1473,7 +1477,7 @@
},
{
"cell_type": "code",
"execution_count": 46,
"execution_count": 41,
"id": "15a6304c-9474-4470-b85d-3991a49fa653",
"metadata": {},
"outputs": [],
@@ -1491,7 +1495,7 @@
},
{
"cell_type": "code",
"execution_count": 47,
"execution_count": 42,
"id": "93cb2cee-9aa6-4bb8-8977-c65661d16eda",
"metadata": {},
"outputs": [],
@@ -1513,7 +1517,7 @@
},
{
"cell_type": "code",
"execution_count": 49,
"execution_count": 43,
"id": "a686eb61-e737-4351-8f1c-222913d47468",
"metadata": {},
"outputs": [
@@ -1554,7 +1558,7 @@
},
{
"cell_type": "code",
"execution_count": 50,
"execution_count": 44,
"id": "e43600ba-f287-4746-8ddf-d0f71a9023ca",
"metadata": {},
"outputs": [
@@ -1581,7 +1585,7 @@
},
{
"cell_type": "code",
"execution_count": 51,
"execution_count": 45,
"id": "50280ead-0363-44c8-8c35-bb885d92c8b7",
"metadata": {},
"outputs": [
@@ -1874,7 +1878,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
"version": "3.11.4"
}
},
"nbformat": 4,