From cec6bf248c9d65f83e3862970c1cfc9caba6c22d Mon Sep 17 00:00:00 2001 From: Steven I Reeves Date: Tue, 22 Jun 2021 23:33:20 +0000 Subject: [PATCH 1/2] Added Distributed Script, Updated Finetuning Script, Added Profiling Script and Mechanism --- scripts/gpt2-tf2/gpt2_1step.py | 58 +++++++++++++++ scripts/gpt2-tf2/gpt2_profile.py | 5 ++ scripts/gpt2-tf2/gpt2_train.py | 8 +-- scripts/gpt2-tf2/gpt2_train_distributed.py | 83 ++++++++++++++++++++++ scripts/gpt2-tf2/profile_gpt2_train.sh | 7 ++ 5 files changed, 157 insertions(+), 4 deletions(-) create mode 100644 scripts/gpt2-tf2/gpt2_1step.py create mode 100644 scripts/gpt2-tf2/gpt2_profile.py create mode 100644 scripts/gpt2-tf2/gpt2_train_distributed.py create mode 100644 scripts/gpt2-tf2/profile_gpt2_train.sh diff --git a/scripts/gpt2-tf2/gpt2_1step.py b/scripts/gpt2-tf2/gpt2_1step.py new file mode 100644 index 000000000000..0b2d81cba510 --- /dev/null +++ b/scripts/gpt2-tf2/gpt2_1step.py @@ -0,0 +1,58 @@ +import sys + +import numpy as np +import jsonlines as jsonl +from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel +import tensorflow as tf +from tensorflow.keras import metrics + +BATCH_SIZE=1 + +def get_dataset(fil): + data = [] + with jsonl.open(fil) as reader: + for line in reader: + data.append(line['text']) + return data + +if len(sys.argv) == 1: + model_size = "Small" + data_dir = '/dockerx/data/' +else: + model_size = sys.argv[1] + data_dir = sys.argv[2] + +if model_size == "Small": + model_name = "gpt2" + train_file = data_dir+'small-117M.train.jsonl' + test_file = data_dir+'small-117M.test.jsonl' +elif model_size == "Medium": + model_name = "gpt2-medium" + train_file = data_dir+'medium-345M.train.jsonl' + test_file = data_dir+'medium-345M.test.jsonl' +elif model_size == "Large": + model_name = "gpt2-large" + train_file = data_dir+'large-762M.train.jsonl' + test_file = data_dir+'large-762M.test.jsonl' +elif model_size == "XL": + model_name = 'gpt2-xl' + train_file = data_dir+'xl-1542M.train.jsonl' + 
test_file = data_dir+'xl-1542M.test.jsonl' +print("Profiling model " + model_name) + +tokenizer = GPT2TokenizerFast.from_pretrained(model_name) +tokenizer.pad_token = tokenizer.eos_token +def tokenize(data): + data = tokenizer(data[0], return_tensors='tf', padding=True, truncation=True) + return tf.data.Dataset.from_tensor_slices((dict(data), data['input_ids'])) + +train_dataset = tokenize(get_dataset(train_file)).batch(BATCH_SIZE) +model = TFGPT2LMHeadModel.from_pretrained(model_name) +#Suppresses the past_key_values from being expressed in the progress bar +model.config.use_cache=False +optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5) +loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) +metric = metrics.SparseCategoricalAccuracy(name='Accuracy') +model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer]) +model.fit(train_dataset, batch_size=1, epochs=1) + diff --git a/scripts/gpt2-tf2/gpt2_profile.py b/scripts/gpt2-tf2/gpt2_profile.py new file mode 100644 index 000000000000..4a37be8f3ad8 --- /dev/null +++ b/scripts/gpt2-tf2/gpt2_profile.py @@ -0,0 +1,5 @@ +import sys +import pandas as pd +profile_dir = sys.argv[1] +df = pd.read_csv(profile_dir+'results.stats.csv') +print('Total time for one step GPT2', sum(df["TotalDurationNs"])*1e-9, 's') diff --git a/scripts/gpt2-tf2/gpt2_train.py b/scripts/gpt2-tf2/gpt2_train.py index cebcb06a0623..b17eab879588 100644 --- a/scripts/gpt2-tf2/gpt2_train.py +++ b/scripts/gpt2-tf2/gpt2_train.py @@ -1,9 +1,10 @@ import sys + import numpy as np +import jsonlines as jsonl from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel import tensorflow as tf from tensorflow.keras import metrics -import jsonlines as jsonl BATCH_SIZE=1 @@ -69,8 +70,7 @@ def tokenize(data, truncate=False): print("========================= Compiling Model ============================") model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric]) 
print("========================= Finetuning Model ==================================") -model.fit(train_dataset, batch_size=64, epochs=num_epochs)#, testation_data=test_dataset) +model.fit(train_dataset, batch_size=64, epochs=num_epochs) print("========================= Evaluating Model ==================================") info = model.evaluate(test_dataset, verbose=2) -#print("========================= Saving Model ======================================") -#model.save(model_name+'finetuned') + diff --git a/scripts/gpt2-tf2/gpt2_train_distributed.py b/scripts/gpt2-tf2/gpt2_train_distributed.py new file mode 100644 index 000000000000..1956989ecd24 --- /dev/null +++ b/scripts/gpt2-tf2/gpt2_train_distributed.py @@ -0,0 +1,83 @@ +import sys + +import numpy as np +import jsonlines as jsonl +from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel +import tensorflow as tf +from tensorflow.keras import metrics + + +def get_dataset(fil): + data = [] + with jsonl.open(fil) as reader: + for line in reader: + data.append(line['text']) + return data + +if len(sys.argv) == 1: + model_size = "Small" + data_dir = '/dockerx/data/tf-gpt-2/data/' + num_epochs = 1 + num_gpus = len(tf.config.list_physical_devices(device_type='GPU')) + truncate = True +else: + model_size = sys.argv[1] + data_dir = sys.argv[2] + num_epochs = int(sys.argv[3]) + num_gpus = int(sys.argv[4]) + if int(sys.argv[5]) == 1: + truncate = True + else: + truncate = False + +if model_size == "Small": + model_name = "gpt2" + train_file = data_dir+'small-117M-k40.train.jsonl' + valid_file = data_dir+'small-117M-k40.valid.jsonl' +elif model_size == "Medium": + model_name = "gpt2-medium" + train_file = data_dir+'medium-345M-k40.train.jsonl' + valid_file = data_dir+'medium-345M-k40.valid.jsonl' +elif model_size == "Large": + model_name = "gpt2-large" + train_file = data_dir+'large-762M-k40.train.jsonl' + valid_file = data_dir+'large-762M-k40.valid.jsonl' +elif model_size == "XL": + model_name = 'gpt2-xl' + 
train_file = data_dir+'xl-1542M-k40.train.jsonl' + valid_file = data_dir+'xl-1542M-k40.valid.jsonl' +print("Finetuning model " + model_name) +print("With dataset "+train_file) + +def tokenize(data, tokenizer, truncate=False): + if truncate: + data = tokenizer(data[:1000], return_tensors='tf', padding=True, truncation=True) + else: + data = tokenizer(data, return_tensors='tf', padding=True, truncation=True) + return tf.data.Dataset.from_tensor_slices((dict(data), data['input_ids'])) + +print("============================ Creating Distributed Strategy ===========================") +devices = [] +for i in range(num_gpus): + devices.append("GPU:"+str(i)) +strategy = tf.distribute.MirroredStrategy(devices=devices) +print('Number of devices: {}'.format(strategy.num_replicas_in_sync)) +print("============================ Loading model from pretrained and compiling ===========================") +with strategy.scope(): + tokenizer = GPT2TokenizerFast.from_pretrained(model_name) + tokenizer.pad_token = tokenizer.eos_token + print("========================= Loading dataset ========================") + train_dataset = tokenize(get_dataset(train_file),tokenizer, truncate).batch(num_gpus) + valid_dataset = tokenize(get_dataset(valid_file),tokenizer, truncate).batch(num_gpus) + model = TFGPT2LMHeadModel.from_pretrained(model_name) + #Disable past key values + model.config.use_cache=False + optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5) + loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) + metric = metrics.SparseCategoricalAccuracy(name='Accuracy') + model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric]) +print("========================= Finetuning Model ==================================") +model.fit(train_dataset, batch_size=64, epochs=num_epochs) +print("========================= Evaluating Model ==================================") +model.evaluate(valid_dataset) + diff --git 
a/scripts/gpt2-tf2/profile_gpt2_train.sh b/scripts/gpt2-tf2/profile_gpt2_train.sh new file mode 100644 index 000000000000..8337bd691376 --- /dev/null +++ b/scripts/gpt2-tf2/profile_gpt2_train.sh @@ -0,0 +1,7 @@ +#!/bin/bash +model_size=$1 +echo $model_size +model_dir=$2 +profile_dir=$3 +rocprof --stats python3 gpt2_1step.py $model_size $model_dir +python3 gpt2_profile.py $profile_dir From 95080f2569a180dfc9bbb2d749faac757b1b2bc5 Mon Sep 17 00:00:00 2001 From: Steven I Reeves Date: Tue, 29 Jun 2021 20:15:40 +0000 Subject: [PATCH 2/2] Removing newline after import sys. --- scripts/gpt2-tf2/gpt2_train.py | 1 - scripts/gpt2-tf2/gpt2_train_distributed.py | 1 - 2 files changed, 2 deletions(-) diff --git a/scripts/gpt2-tf2/gpt2_train.py b/scripts/gpt2-tf2/gpt2_train.py index b17eab879588..2208de0e3e73 100644 --- a/scripts/gpt2-tf2/gpt2_train.py +++ b/scripts/gpt2-tf2/gpt2_train.py @@ -1,5 +1,4 @@ import sys - import numpy as np import jsonlines as jsonl from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel diff --git a/scripts/gpt2-tf2/gpt2_train_distributed.py b/scripts/gpt2-tf2/gpt2_train_distributed.py index 1956989ecd24..771b7c4fad4f 100644 --- a/scripts/gpt2-tf2/gpt2_train_distributed.py +++ b/scripts/gpt2-tf2/gpt2_train_distributed.py @@ -1,5 +1,4 @@ import sys - import numpy as np import jsonlines as jsonl from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel