Add two popular datasets for character level LM #254

Open · wants to merge 1 commit into master
38 changes: 38 additions & 0 deletions config/train_enwik8.py
@@ -0,0 +1,38 @@
# train a character-level model on enwik8

out_dir = "out-enwik8"
eval_interval = 1000
eval_iters = 200
log_interval = 100 # don't print too often

# only save when val improves
always_save_checkpoint = False

# wandb_log = True # override via command line if you like
# wandb_project = 'nanogpt'
# wandb_run_name = 'enwik8'

dataset = "enwik8"
gradient_accumulation_steps = 1
batch_size = 32
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 512
dropout = 0.2

learning_rate = 5e-4
max_iters = 100000
lr_decay_iters = max_iters # make equal to max_iters usually
min_lr = 5e-5 # learning_rate / 10 usually
beta2 = 0.99

warmup_iters = 200 # a short warmup; probably not strictly necessary

# on macbook also add
# device = 'cpu' # run on cpu only
compile = True # use PyTorch 2.0 to compile the model to be faster
# init_from = 'resume'
# eval_only = True
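enwik8 results are conventionally reported in bits per character (bpc), while train.py logs cross-entropy loss in nats. A minimal conversion sketch; the 0.93 loss value below is a made-up placeholder, not a measured result:

import math

def nats_to_bpc(loss_nats):
    # cross-entropy in nats divided by ln(2) gives bits per character
    return loss_nats / math.log(2)

val_loss = 0.93  # hypothetical validation loss printed by train.py
print(f"{nats_to_bpc(val_loss):.3f} bpc")  # ~1.342 bpc for this placeholder value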
38 changes: 38 additions & 0 deletions config/train_text8.py
@@ -0,0 +1,38 @@
# train a character-level model on text8

out_dir = "out-text8"
eval_interval = 1000
eval_iters = 200
log_interval = 100 # don't print too often

# only save when val improves
always_save_checkpoint = False

# wandb_log = True # override via command line if you like
# wandb_project = 'nanogpt'
# wandb_run_name = 'text8'

dataset = "text8"
gradient_accumulation_steps = 1
batch_size = 32
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 512
dropout = 0.2

learning_rate = 5e-4
max_iters = 100000
lr_decay_iters = max_iters # make equal to max_iters usually
min_lr = 5e-5 # learning_rate / 10 usually
beta2 = 0.99

warmup_iters = 200 # a short warmup; probably not strictly necessary

# on macbook also add
# device = 'cpu' # run on cpu only
compile = True # use PyTorch 2.0 to compile the model to be faster
# init_from = 'resume'
# eval_only = True
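Each optimization step with this config sees batch_size * block_size * gradient_accumulation_steps characters. A quick back-of-the-envelope check of what the settings imply; the 90M figure assumes the standard train split produced by prepare.py below:

batch_size, block_size, grad_accum = 32, 256, 1
max_iters = 100000

chars_per_iter = batch_size * block_size * grad_accum  # 8,192 characters per step
total_chars = chars_per_iter * max_iters               # 819,200,000 characters over the run
epochs = total_chars / 90_000_000                      # ~9.1 passes over a 90M-character train split
print(chars_per_iter, total_chars, round(epochs, 1))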
75 changes: 75 additions & 0 deletions data/enwik8/prepare.py
@@ -0,0 +1,75 @@
"""
Prepare the enwik8 dataset for character-level language modeling.
So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
Will save train.bin, val.bin, and test.bin containing the ids, and meta.pkl
containing the encoder and decoder and some other related info.
"""
import os
import pickle
import requests
import numpy as np

# download the enwik8 dataset
input_file_path = os.path.join(os.path.dirname(__file__), 'enwik8')
if not os.path.exists(input_file_path):
    data_url = 'http://mattmahoney.net/dc/enwik8.zip'
    r = requests.get(data_url)
    with open(os.path.join(os.path.dirname(__file__), 'enwik8.zip'), 'wb') as f:
        f.write(r.content)

    # unzip the enwik8 dataset
    import zipfile
    with zipfile.ZipFile(os.path.join(os.path.dirname(__file__), 'enwik8.zip'), 'r') as zip_ref:
        zip_ref.extractall(os.path.dirname(__file__))

# read as latin-1 so every byte maps to exactly one character (byte-level vocab)
with open(input_file_path, 'r', encoding='latin-1') as f:
    data = f.read()
print(f"length of dataset in characters: {len(data):,}")

# get all the unique characters that occur in this text
chars = sorted(list(set(data)))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
def encode(s):
    return [stoi[c] for c in s] # encoder: take a string, output a list of integers
def decode(l):
    return ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# create the train, validation, and test splits
n = len(data)
num_test_chars = 5000000
train_data = data[: -2 * num_test_chars]
val_data = data[-2 * num_test_chars: -num_test_chars]
test_data = data[-num_test_chars:]

# encode all splits to integers
train_ids = encode(train_data)
val_ids = encode(val_data)
test_ids = encode(test_data)

print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")
print(f"test has {len(test_ids):,} tokens")

# export to bin files
train_ids = np.array(train_ids, dtype=np.uint16)
val_ids = np.array(val_ids, dtype=np.uint16)
test_ids = np.array(test_ids, dtype=np.uint16)

train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))
test_ids.tofile(os.path.join(os.path.dirname(__file__), 'test.bin'))

# save the meta information as well, to help us encode/decode later
meta = {
    'vocab_size': vocab_size,
    'itos': itos,
    'stoi': stoi,
}
with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f:
    pickle.dump(meta, f)
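A minimal sketch of how the exported files can be read back afterwards; it mirrors the np.memmap pattern nanoGPT's train.py uses for its datasets, and assumes it is run from the repository root after prepare.py has finished:

import os, pickle
import numpy as np

data_dir = os.path.join('data', 'enwik8')
train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')

with open(os.path.join(data_dir, 'meta.pkl'), 'rb') as f:
    meta = pickle.load(f)
itos = meta['itos']

print(f"train split: {len(train_data):,} tokens, vocab size {meta['vocab_size']}")
# decode the first 200 ids back to text as a quick sanity check
print(''.join(itos[int(i)] for i in train_data[:200]))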
76 changes: 76 additions & 0 deletions data/text8/prepare.py
@@ -0,0 +1,76 @@
"""
Prepare the text8 dataset for character-level language modeling.
So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
Will save train.bin, val.bin, and test.bin containing the ids, and meta.pkl
containing the encoder and decoder and some other related info.
"""
import os
import pickle
import requests
import numpy as np

# download the text8 dataset
input_file_path = os.path.join(os.path.dirname(__file__), 'text8')
if not os.path.exists(input_file_path):
    data_url = 'http://mattmahoney.net/dc/text8.zip'
    r = requests.get(data_url)
    with open(os.path.join(os.path.dirname(__file__), 'text8.zip'), 'wb') as f:
        f.write(r.content)

    # unzip the text8 dataset
    import zipfile
    with zipfile.ZipFile(os.path.join(os.path.dirname(__file__), 'text8.zip'), 'r') as zip_ref:
        zip_ref.extractall(os.path.dirname(__file__))

with open(input_file_path, 'r') as f:
    data = f.read()
print(f"length of dataset in characters: {len(data):,}")

# get all the unique characters that occur in this text
chars = sorted(list(set(data)))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
def encode(s):
    return [stoi[c] for c in s] # encoder: take a string, output a list of integers
def decode(l):
    return ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# create the train, validation, and test splits
n = len(data)
num_test_chars = 5000000
train_data = data[: -2 * num_test_chars]
val_data = data[-2 * num_test_chars: -num_test_chars]
test_data = data[-num_test_chars:]

# encode all splits to integers
train_ids = encode(train_data)
val_ids = encode(val_data)
test_ids = encode(test_data)

print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")
print(f"test has {len(test_ids):,} tokens")

# export to bin files
train_ids = np.array(train_ids, dtype=np.uint16)
val_ids = np.array(val_ids, dtype=np.uint16)
test_ids = np.array(test_ids, dtype=np.uint16)

train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))
test_ids.tofile(os.path.join(os.path.dirname(__file__), 'test.bin'))

# save the meta information as well, to help us encode/decode later
meta = {
    'vocab_size': vocab_size,
    'itos': itos,
    'stoi': stoi,
}
with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f:
    pickle.dump(meta, f)
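Both enwik8 and text8 are exactly 100,000,000 characters, so the slicing above reproduces the 90M/5M/5M train/val/test split that is standard for these benchmarks. A small sanity check that could be run after either prepare.py finishes; paths assume the repository root:

import os
import numpy as np

data_dir = os.path.join('data', 'text8')
sizes = {split: len(np.memmap(os.path.join(data_dir, f'{split}.bin'), dtype=np.uint16, mode='r'))
         for split in ('train', 'val', 'test')}
print(sizes)  # expected: {'train': 90000000, 'val': 5000000, 'test': 5000000}
assert sizes['val'] == sizes['test'] == 5_000_000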