Skip to content

Commit

Permalink
Fix BPE bonus materials (#561)
Browse files (browse the repository at this point in the history)
* Fix BPE bonus materials

* fix bpe implementation

* update

* Add 'Hello, world. Is this-- a test?' test case

* update link to test file

* update path handling

* update path handling

* fix pytest paths
  • Loading branch information
rasbt authored Mar 8, 2025
1 parent 96ca2fc commit f63f04d
Show file tree
Hide file tree
Showing 5 changed files with 305 additions and 85 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/basic-tests-linux-uv.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,9 @@ jobs:
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
- name: Test Selected Bonus Materials
shell: bash
run: |
source .venv/bin/activate
pytest ch02/05_bpe-from-scratch/tests/tests.py
8 changes: 7 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@

# Configs and keys
ch05/07_gpt_to_llama/config.json
ch07/02_dataset-utilities/config.json
Expand Down Expand Up @@ -63,13 +64,16 @@ ch07/01_main-chapter-code/Smalltestmodel-sft-standalone.pth
ch07/01_main-chapter-code/gpt2/

# Datasets
the-verdict.txt

appendix-E/01_main-chapter-code/sms_spam_collection.zip
appendix-E/01_main-chapter-code/sms_spam_collection
appendix-E/01_main-chapter-code/train.csv
appendix-E/01_main-chapter-code/test.csv
appendix-E/01_main-chapter-code/validation.csv

ch02/01_main-chapter-code/number-data.txt
ch02/05_bpe-from-scratch/the-verdict.txt

ch05/03_bonus_pretraining_on_gutenberg/gutenberg
ch05/03_bonus_pretraining_on_gutenberg/gutenberg_preprocessed
Expand Down Expand Up @@ -107,7 +111,9 @@ ch02/05_bpe-from-scratch/bpe_merges.txt
ch02/05_bpe-from-scratch/encoder.json
ch02/05_bpe-from-scratch/vocab.bpe
ch02/05_bpe-from-scratch/vocab.json

encoder.json
vocab.bpe
vocab.json

# Other
ch0?/0?_user_interface/.chainlit/
Expand Down
36 changes: 22 additions & 14 deletions ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"tiktoken version: 0.7.0\n"
"tiktoken version: 0.9.0\n"
]
}
],
Expand Down Expand Up @@ -180,8 +180,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Fetching encoder.json: 1.04Mit [00:00, 4.13Mit/s] \n",
"Fetching vocab.bpe: 457kit [00:00, 2.56Mit/s] \n"
"Fetching encoder.json: 1.04Mit [00:00, 3.69Mit/s] \n",
"Fetching vocab.bpe: 457kit [00:00, 2.53Mit/s] \n"
]
}
],
Expand Down Expand Up @@ -256,10 +256,18 @@
"id": "e9077bf4-f91f-42ad-ab76-f3d89128510e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/sebastian/Developer/LLMs-from-scratch/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"data": {
"text/plain": [
"'4.48.0'"
"'4.49.0'"
]
},
"execution_count": 12,
Expand Down Expand Up @@ -423,7 +431,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"[1544, 18798, 11, 995, 13, 1148, 256, 5303, 82, 438, 257, 1332, 30]\n"
"[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]\n"
]
}
],
Expand Down Expand Up @@ -451,7 +459,7 @@
"metadata": {},
"outputs": [],
"source": [
"with open('../01_main-chapter-code/the-verdict.txt', 'r', encoding='utf-8') as f:\n",
"with open(\"../01_main-chapter-code/the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
" raw_text = f.read()"
]
},
Expand All @@ -473,7 +481,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"3.39 ms ± 21.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
"3.84 ms ± 9.83 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
Expand All @@ -499,7 +507,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"1.08 ms ± 5.99 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
"901 μs ± 6.27 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
]
}
],
Expand Down Expand Up @@ -532,7 +540,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"10.2 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
"11 ms ± 94.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
Expand All @@ -550,7 +558,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"10 ms ± 36.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
"10.8 ms ± 180 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
Expand All @@ -575,7 +583,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"3.79 ms ± 48.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
"3.66 ms ± 3.67 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
Expand All @@ -593,7 +601,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"3.83 ms ± 58.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
"3.77 ms ± 49.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
Expand All @@ -619,7 +627,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"1.59 ms ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
"9.37 ms ± 50.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
Expand All @@ -644,7 +652,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
"version": "3.10.16"
}
},
"nbformat": 4,
Expand Down
Loading

0 comments on commit f63f04d

Please sign in to comment.