diff --git a/scripts/gpt2-tf2/gpt2_1step.py b/scripts/gpt2-tf2/gpt2_1step.py
new file mode 100644
index 000000000000..0b2d81cba510
--- /dev/null
+++ b/scripts/gpt2-tf2/gpt2_1step.py
@@ -0,0 +1,58 @@
+import sys
+
+import numpy as np
+import jsonlines as jsonl
+from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel
+import tensorflow as tf
+from tensorflow.keras import metrics
+
+BATCH_SIZE=1
+
+def get_dataset(fil):
+    data = []
+    with jsonl.open(fil) as reader:
+        for line in reader:
+            data.append(line['text'])
+    return data
+
+if len(sys.argv) == 1:
+    model_size = "Small"
+    data_dir = '/dockerx/data/'
+else:
+    model_size = sys.argv[1]
+    data_dir = sys.argv[2]
+
+if model_size == "Small":
+    model_name = "gpt2"
+    train_file = data_dir+'small-117M.train.jsonl'
+    test_file = data_dir+'small-117M.test.jsonl'
+elif model_size == "Medium":
+    model_name = "gpt2-medium"
+    train_file = data_dir+'medium-345M.train.jsonl'
+    test_file = data_dir+'medium-345M.test.jsonl'
+elif model_size == "Large":
+    model_name = "gpt2-large"
+    train_file = data_dir+'large-762M.train.jsonl'
+    test_file = data_dir+'large-762M.test.jsonl'
+elif model_size == "XL":
+    model_name = 'gpt2-xl'
+    train_file = data_dir+'xl-1542M.train.jsonl'
+    test_file = data_dir+'xl-1542M.test.jsonl'
+print("Profiling model " + model_name)
+
+tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
+tokenizer.pad_token = tokenizer.eos_token
+def tokenize(data):
+    data = tokenizer(data[0], return_tensors='tf', padding=True, truncation=True)  # first sample only: one example suffices to profile a single step
+    return tf.data.Dataset.from_tensor_slices((dict(data), data['input_ids']))
+
+train_dataset = tokenize(get_dataset(train_file)).batch(BATCH_SIZE)
+model = TFGPT2LMHeadModel.from_pretrained(model_name)
+#Suppresses the past_key_values from being shown in the progress bar
+model.config.use_cache=False
+optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
+loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+metric = metrics.SparseCategoricalAccuracy(name='Accuracy')
+model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])
+model.fit(train_dataset, epochs=1)
+
diff --git a/scripts/gpt2-tf2/gpt2_profile.py b/scripts/gpt2-tf2/gpt2_profile.py
new file mode 100644
index 000000000000..4a37be8f3ad8
--- /dev/null
+++ b/scripts/gpt2-tf2/gpt2_profile.py
@@ -0,0 +1,5 @@
+import sys
+import pandas as pd
+profile_dir = sys.argv[1]
+df = pd.read_csv(profile_dir+'results.stats.csv')  # per-kernel stats emitted by rocprof --stats
+print('Total time for one GPT2 training step:', df["TotalDurationNs"].sum()*1e-9, 's')
diff --git a/scripts/gpt2-tf2/gpt2_train.py b/scripts/gpt2-tf2/gpt2_train.py
index cebcb06a0623..2208de0e3e73 100644
--- a/scripts/gpt2-tf2/gpt2_train.py
+++ b/scripts/gpt2-tf2/gpt2_train.py
@@ -1,9 +1,9 @@
 import sys
 
 import numpy as np
+import jsonlines as jsonl
 from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel
 import tensorflow as tf
 from tensorflow.keras import metrics
-import jsonlines as jsonl
 
 BATCH_SIZE=1
@@ -69,8 +69,9 @@ def tokenize(data, truncate=False):
 print("========================= Compiling Model ============================")
 model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])
 print("========================= Finetuning Model ==================================")
-model.fit(train_dataset, batch_size=64, epochs=num_epochs)#, testation_data=test_dataset)
+model.fit(train_dataset, epochs=num_epochs)
 print("========================= Evaluating Model ==================================")
 info = model.evaluate(test_dataset, verbose=2)
-#print("========================= Saving Model ======================================")
-#model.save(model_name+'finetuned')
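+# model.evaluate returns the loss followed by the compiled metric values
+# (the Accuracy metric above), so 'info' can be printed or logged after the run.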
+
diff --git a/scripts/gpt2-tf2/gpt2_train_distributed.py b/scripts/gpt2-tf2/gpt2_train_distributed.py
new file mode 100644
index 000000000000..771b7c4fad4f
--- /dev/null
+++ b/scripts/gpt2-tf2/gpt2_train_distributed.py
@@ -0,0 +1,84 @@
+import sys
+import numpy as np
+import jsonlines as jsonl
+from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel
+import tensorflow as tf
+from tensorflow.keras import metrics
+
+
+def get_dataset(fil):
+    data = []
+    with jsonl.open(fil) as reader:
+        for line in reader:
+            data.append(line['text'])
+    return data
+
+if len(sys.argv) == 1:
+    model_size = "Small"
+    data_dir = '/dockerx/data/tf-gpt-2/data/'
+    num_epochs = 1
+    num_gpus = len(tf.config.list_physical_devices(device_type='GPU'))
+    truncate = True
+else:
+    model_size = sys.argv[1]
+    data_dir = sys.argv[2]
+    num_epochs = int(sys.argv[3])
+    num_gpus = int(sys.argv[4])
+    if int(sys.argv[5]) == 1:
+        truncate = True
+    else:
+        truncate = False
+
+if model_size == "Small":
+    model_name = "gpt2"
+    train_file = data_dir+'small-117M-k40.train.jsonl'
+    valid_file = data_dir+'small-117M-k40.valid.jsonl'
+elif model_size == "Medium":
+    model_name = "gpt2-medium"
+    train_file = data_dir+'medium-345M-k40.train.jsonl'
+    valid_file = data_dir+'medium-345M-k40.valid.jsonl'
+elif model_size == "Large":
+    model_name = "gpt2-large"
+    train_file = data_dir+'large-762M-k40.train.jsonl'
+    valid_file = data_dir+'large-762M-k40.valid.jsonl'
+elif model_size == "XL":
+    model_name = 'gpt2-xl'
+    train_file = data_dir+'xl-1542M-k40.train.jsonl'
+    valid_file = data_dir+'xl-1542M-k40.valid.jsonl'
+print("Finetuning model " + model_name)
+print("With dataset "+train_file)
+
+def tokenize(data, tokenizer, truncate=False):
+    if truncate:
+        data = tokenizer(data[:1000], return_tensors='tf', padding=True, truncation=True)
+    else:
+        data = tokenizer(data, return_tensors='tf', padding=True, truncation=True)
+    return tf.data.Dataset.from_tensor_slices((dict(data), data['input_ids']))
+
+print("============================ Creating Distributed Strategy ===========================")
+devices = []
+for i in range(num_gpus):
+    devices.append("GPU:"+str(i))
+strategy = tf.distribute.MirroredStrategy(devices=devices)
+print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
+print("============================ Loading model from pretrained and compiling ===========================")
+with strategy.scope():
+    tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
+    tokenizer.pad_token = tokenizer.eos_token
+    print("========================= Loading dataset ========================")
+    train_dataset = tokenize(get_dataset(train_file), tokenizer, truncate).batch(num_gpus)
+    valid_dataset = tokenize(get_dataset(valid_file), tokenizer, truncate).batch(num_gpus)
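+    # The datasets are batched to num_gpus samples, so MirroredStrategy
+    # hands each replica a per-replica batch of exactly one sample.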
+    model = TFGPT2LMHeadModel.from_pretrained(model_name)
+    # Disable past key values (not needed for training)
+    model.config.use_cache=False
+    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
+    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+    metric = metrics.SparseCategoricalAccuracy(name='Accuracy')
+    model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])
+print("========================= Finetuning Model ==================================")
+model.fit(train_dataset, epochs=num_epochs)
+print("========================= Evaluating Model ==================================")
+model.evaluate(valid_dataset)
+
diff --git a/scripts/gpt2-tf2/profile_gpt2_train.sh b/scripts/gpt2-tf2/profile_gpt2_train.sh
new file mode 100644
index 000000000000..8337bd691376
--- /dev/null
+++ b/scripts/gpt2-tf2/profile_gpt2_train.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+model_size=$1
+echo "$model_size"
+data_dir=$2
+profile_dir=$3
+rocprof --stats python3 gpt2_1step.py "$model_size" "$data_dir"
+python3 gpt2_profile.py "$profile_dir"
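+# Example invocation (assumes rocprof writes results.stats.csv to the
+# current directory, which is then passed as the profile directory):
+#   bash profile_gpt2_train.sh Small /dockerx/data/ ./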