58 changes: 58 additions & 0 deletions scripts/gpt2-tf2/gpt2_1step.py
@@ -0,0 +1,58 @@
import sys

import numpy as np
import jsonlines as jsonl
from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel
import tensorflow as tf
from tensorflow.keras import metrics

BATCH_SIZE=1

def get_dataset(fil):
    data = []
    with jsonl.open(fil) as reader:
        for line in reader:
            data.append(line['text'])
    return data

if len(sys.argv) == 1:
    model_size = "Small"
    data_dir = '/dockerx/data/'
else:
    model_size = sys.argv[1]
    data_dir = sys.argv[2]

if model_size == "Small":
    model_name = "gpt2"
    train_file = data_dir+'small-117M.train.jsonl'
    test_file = data_dir+'small-117M.test.jsonl'
elif model_size == "Medium":
    model_name = "gpt2-medium"
    train_file = data_dir+'medium-345M.train.jsonl'
    test_file = data_dir+'medium-345M.test.jsonl'
elif model_size == "Large":
    model_name = "gpt2-large"
    train_file = data_dir+'large-762M.train.jsonl'
    test_file = data_dir+'large-762M.test.jsonl'
elif model_size == "XL":
    model_name = 'gpt2-xl'
    train_file = data_dir+'xl-1542M.train.jsonl'
    test_file = data_dir+'xl-1542M.test.jsonl'
print("Profiling model " + model_name)

tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
def tokenize(data):
    # Tokenize only the first example so a single optimizer step is profiled
    data = tokenizer(data[0], return_tensors='tf', padding=True, truncation=True)
    return tf.data.Dataset.from_tensor_slices((dict(data), data['input_ids']))

train_dataset = tokenize(get_dataset(train_file)).batch(BATCH_SIZE)
model = TFGPT2LMHeadModel.from_pretrained(model_name)
# Suppresses the past_key_values from being expressed in the progress bar
model.config.use_cache=False
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = metrics.SparseCategoricalAccuracy(name='Accuracy')
model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer])
model.fit(train_dataset, batch_size=1, epochs=1)
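
Review note: the loss=[loss, *[None] * model.config.n_layer] argument is reused by every training script in this PR. A minimal, self-contained sketch of the idea follows (the n_layer value and variable names are illustrative, not taken from the PR): the LM-head logits get a real loss, and each extra model output, one cached key/value entry per transformer block when caching is on, is paired with None so Keras skips it.

# Illustrative sketch only; values assumed rather than taken from the PR.
import tensorflow as tf

n_layer = 12  # GPT-2 "Small" has 12 blocks; the scripts read model.config.n_layer instead
lm_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# One real loss for the logits, None for every additional output so Keras
# does not try to compute a loss on cached key/value tensors.
per_output_losses = [lm_loss, *[None] * n_layer]
print(len(per_output_losses))  # 13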

5 changes: 5 additions & 0 deletions scripts/gpt2-tf2/gpt2_profile.py
@@ -0,0 +1,5 @@
import sys
import pandas as pd
profile_dir = sys.argv[1]
df = pd.read_csv(profile_dir+'results.stats.csv')
print('Total time for one step GPT2', sum(df["TotalDurationNs"])*1e-9, 's')
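
Review note: gpt2_profile.py only sums TotalDurationNs. If the rocprof stats file also exposes a per-kernel name column (headers vary across rocprof versions, so the sketch below checks before using them; this is an assumption, not part of the PR), the same CSV gives a quick hotspot listing.

# Hedged sketch: optional hotspot listing on top of results.stats.csv.
import sys
import pandas as pd

profile_dir = sys.argv[1]
df = pd.read_csv(profile_dir + 'results.stats.csv')
if {'Name', 'TotalDurationNs'}.issubset(df.columns):  # column names are an assumption
    top = df.sort_values('TotalDurationNs', ascending=False).head(10)
    print(top[['Name', 'TotalDurationNs']])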
7 changes: 3 additions & 4 deletions scripts/gpt2-tf2/gpt2_train.py
@@ -1,9 +1,9 @@
import sys
import numpy as np
+import jsonlines as jsonl
from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel
import tensorflow as tf
from tensorflow.keras import metrics
-import jsonlines as jsonl

BATCH_SIZE=1

@@ -69,8 +69,7 @@ def tokenize(data, truncate=False):
print("========================= Compiling Model ============================")
model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])
print("========================= Finetuning Model ==================================")
-model.fit(train_dataset, batch_size=64, epochs=num_epochs)#, testation_data=test_dataset)
+model.fit(train_dataset, batch_size=64, epochs=num_epochs)
print("========================= Evaluating Model ==================================")
info = model.evaluate(test_dataset, verbose=2)
#print("========================= Saving Model ======================================")
#model.save(model_name+'finetuned')

82 changes: 82 additions & 0 deletions scripts/gpt2-tf2/gpt2_train_distributed.py
@@ -0,0 +1,82 @@
import sys
import numpy as np
import jsonlines as jsonl
from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel
import tensorflow as tf
from tensorflow.keras import metrics


def get_dataset(fil):
    data = []
    with jsonl.open(fil) as reader:
        for line in reader:
            data.append(line['text'])
    return data

if len(sys.argv) == 1:
    model_size = "Small"
    data_dir = '/dockerx/data/tf-gpt-2/data/'
    num_epochs = 1
    num_gpus = len(tf.config.list_physical_devices(device_type='GPU'))
    truncate = True
else:
    model_size = sys.argv[1]
    data_dir = sys.argv[2]
    num_epochs = int(sys.argv[3])
    num_gpus = int(sys.argv[4])
    if int(sys.argv[5]) == 1:
        truncate = True
    else:
        truncate = False

if model_size == "Small":
    model_name = "gpt2"
    train_file = data_dir+'small-117M-k40.train.jsonl'
    valid_file = data_dir+'small-117M-k40.valid.jsonl'
elif model_size == "Medium":
    model_name = "gpt2-medium"
    train_file = data_dir+'medium-345M-k40.train.jsonl'
    valid_file = data_dir+'medium-345M-k40.valid.jsonl'
elif model_size == "Large":
    model_name = "gpt2-large"
    train_file = data_dir+'large-762M-k40.train.jsonl'
    valid_file = data_dir+'large-762M-k40.valid.jsonl'
elif model_size == "XL":
    model_name = 'gpt2-xl'
    train_file = data_dir+'xl-1542M-k40.train.jsonl'
    valid_file = data_dir+'xl-1542M-k40.valid.jsonl'
print("Finetuning model " + model_name)
print("With dataset "+train_file)

def tokenize(data, tokenizer, truncate=False):
    if truncate:
        data = tokenizer(data[:1000], return_tensors='tf', padding=True, truncation=True)
    else:
        data = tokenizer(data, return_tensors='tf', padding=True, truncation=True)
    return tf.data.Dataset.from_tensor_slices((dict(data), data['input_ids']))

print("============================ Creating Distributed Strategy ===========================")
devices = []
for i in range(num_gpus):
    devices.append("GPU:"+str(i))
strategy = tf.distribute.MirroredStrategy(devices=devices)
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
print("============================ Loading model from pretrained and compiling ===========================")
with strategy.scope():
    tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    print("========================= Loading dataset ========================")
    train_dataset = tokenize(get_dataset(train_file), tokenizer, truncate).batch(num_gpus)
    valid_dataset = tokenize(get_dataset(valid_file), tokenizer, truncate).batch(num_gpus)
    model = TFGPT2LMHeadModel.from_pretrained(model_name)
    # Disable past key values
    model.config.use_cache=False
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = metrics.SparseCategoricalAccuracy(name='Accuracy')
    model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])
print("========================= Finetuning Model ==================================")
model.fit(train_dataset, batch_size=64, epochs=num_epochs)
print("========================= Evaluating Model ==================================")
model.evaluate(valid_dataset)
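
Review note: the distributed script batches the tokenized dataset by num_gpus, so under MirroredStrategy each replica processes one sample per step, which lines up with the BATCH_SIZE=1 single-GPU runs. A minimal sketch, assuming a machine with two visible GPUs (hypothetical, not part of the PR):

# Minimal sketch under the stated assumption of two visible GPUs.
import tensorflow as tf

num_gpus = 2  # hypothetical
devices = ["GPU:" + str(i) for i in range(num_gpus)]
strategy = tf.distribute.MirroredStrategy(devices=devices)

global_batch_size = num_gpus  # what .batch(num_gpus) produces per dataset element
per_replica_batch = global_batch_size // strategy.num_replicas_in_sync
print(per_replica_batch)  # 1 sample per GPU per step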

7 changes: 7 additions & 0 deletions scripts/gpt2-tf2/profile_gpt2_train.sh
@@ -0,0 +1,7 @@
#!/bin/bash
model_size=$1
echo $model_size
model_dir=$2
profile_dir=$3
rocprof --stats python3 gpt2_1step.py $model_size $model_dir
python3 gpt2_profile.py $profile_dir