Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DeepSpeedExamples
44 changes: 21 additions & 23 deletions azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,17 +48,33 @@ jobs:
ln -s /data/Megatron-LM/data DeepSpeedExamples/Megatron-LM/
pip install --user -r DeepSpeedExamples/Megatron-LM/requirements.txt
cd tests/model/
#pytest -s run_sanity_check.py
pytest -s run_sanity_check.py
displayName: 'Model tests'
# BingBertSquad logs
#BingBertSquad logs
- task: PublishPipelineArtifact@1
inputs:
targetPath: '$(Build.SourcesDirectory)/tests/model/BingBertSquad/test/'
artifactName: BingBertSquad_logs
displayName: 'BingBertSquad log uploads'
condition: always()

# Megatron test logs
#- task: PublishPipelineArtifact@1
# inputs:
# targetPath: '$(Build.SourcesDirectory)/tests/model/Megatron_GPT2/test/'
# artifactName: Megatron_GPT2_logs
# displayName: 'Megatron GPT2 log uploads'
# condition: always()

#- task: PublishPipelineArtifact@1
# inputs:
# targetPath: '$(Build.SourcesDirectory)/tests/model/BingBertSquad/test/'
# artifactName: BingBertSquad_logs
# displayName: 'BingBertSquad logs'
# targetPath: '$(Build.SourcesDirectory)/tests/model/Megatron_GPT2/checkpoint_test_logs/'
# artifactName: Megatron_GPT2_checkpoint_logs
# displayName: 'Megatron GPT2 checkpoint log uploads'
# condition: always()


#BingBert logs
#- task: PublishPipelineArtifact@1
# inputs:
Expand All @@ -73,21 +89,3 @@ jobs:
# artifactName: BingBert_checkpoint_logs
# displayName: 'BingBert checkpoint logs'
# condition: always()


# XXX temporarily disabled

# Megatron test logs
#- task: PublishPipelineArtifact@1
# inputs:
# targetPath: '$(Build.SourcesDirectory)/tests/model/Megatron_GPT2/test/'
# artifactName: Megatron_GPT2_logs
# displayName: 'Megatron GPT2 logs'
# condition: always()

#- task: PublishPipelineArtifact@1
# inputs:
# targetPath: '$(Build.SourcesDirectory)/tests/model/Megatron_GPT2/checkpoint_test_logs/'
# artifactName: Megatron_GPT2_checkpoint_logs
# displayName: 'Megatron GPT2 checkpoint logs'
# condition: always()
162 changes: 162 additions & 0 deletions tests/model/BingBertSquad/BingBertSquad_run_func_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
# Note: please copy the webtext data to the "Megatron-LM" folder before running this script.

import unittest
import subprocess
import os
import time
import re
from .BingBertSquad_test_common import BaseTestCase


def grep_loss_from_file(file_name):
    """Return the last loss value logged in *file_name*.

    Only lines containing the ``bert_squad_progress: step=`` marker are
    considered; on each such line the first number following ``loss=`` is
    parsed. The value from the last matching line wins.

    Args:
        file_name: Path of the log file to scan.

    Returns:
        The last parsed loss as a float, or ``0.0`` when no loss was found.
        Callers in this module treat ``0.0`` as "no loss data".
    """
    line_filter = "bert_squad_progress: step="
    # Raw string so that ``\.`` reaches the regex engine as an escaped dot
    # rather than depending on Python passing an unknown string escape through.
    match_number = re.compile(
        r'loss=([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)')

    loss = 0.0
    with open(file_name, 'r') as f:
        # Iterate the file lazily instead of loading every line into memory.
        for line in f:
            if line_filter not in line:
                continue
            found = match_number.search(line)
            # A progress line without a parseable loss (e.g. a truncated
            # line) keeps the previous value instead of raising IndexError.
            if found:
                loss = float(found.group(1))

    if loss == 0.0:
        print("no loss found in file ", file_name)

    return loss


class BingBertSquadFuncTestCase(BaseTestCase):
    """Functional parity tests for BingBertSquad with and without DeepSpeed.

    Every ``test_*`` method builds a configuration, obtains a baseline log
    (reusing an existing one when it already contains loss data), runs the
    same configuration with DeepSpeed enabled, and checks that the final
    losses agree within a relative tolerance.
    """

    def __init__(self, methodName="DeepSpeed function test on BingBertSquad model"):
        super(BingBertSquadFuncTestCase, self).__init__(methodName)

    def setUp(self):
        # Remember the current directory and switch to this file's directory
        # for the duration of the test; tearDown switches back.
        self.save_dir = os.getcwd()
        new_dir = os.path.dirname(__file__)
        if new_dir:
            os.chdir(new_dir)

    def tearDown(self):
        os.chdir(self.save_dir)

    def test_gpu4_fp16(self):
        test_config = {
            "gpus": 4,
            "deepspeed": False,
            "json": "deepspeed_bsz24_fp16_config.json",
            "max_steps": 8,
            "max_epoch_steps": 4,
            "other_args": "--fp16 --print_steps 1"
        }

        succ = self.run_test(test_config, 0.01)
        self.assertTrue(succ)

    def test_gpu1_fp16(self):
        test_config = {
            "gpus": 1,
            "deepspeed": False,
            "json": "deepspeed_bsz24_fp16_config.json",
            "max_steps": 8,
            "max_epoch_steps": 4,
            "other_args": "--fp16 --print_steps 1"
        }

        succ = self.run_test(test_config, 0.01)
        self.assertTrue(succ)

    def test_gpu4_fp32(self):
        test_config = {
            "gpus": 4,
            "deepspeed": False,
            "json": "deepspeed_bsz24_fp32_config.json",
            "max_steps": 8,
            "max_epoch_steps": 4,
            "other_args": "--print_steps 1"
        }

        succ = self.run_test(test_config, 0.01)
        self.assertTrue(succ)

    def test_gpu1_fp32(self):
        test_config = {
            "gpus": 1,
            "deepspeed": False,
            "json": "deepspeed_bsz24_fp32_config.json",
            "max_steps": 8,
            "max_epoch_steps": 4,
            "other_args": "--print_steps 1"
        }

        succ = self.run_test(test_config, 0.01)
        self.assertTrue(succ)

    def run_test(self, test_config, r_tol):
        """Run baseline and DeepSpeed variants and compare their losses.

        Args:
            test_config: Dict with at least ``gpus``, ``json``, ``max_steps``,
                ``max_epoch_steps`` and ``other_args``. It is not modified.
            r_tol: Maximum allowed relative difference between the losses.

        Returns:
            True when both runs produced a loss and the losses agree within
            ``r_tol``; False otherwise.
        """
        print("\n")
        print("{0}: starting......".format(self.id()))

        prefix = "BingBertSquad_func"

        # Work on a private copy so the caller's dict is left untouched and
        # reusing a config does not append the step flags a second time.
        config = dict(test_config)
        config['other_args'] += f" --max_steps {config['max_steps']}"
        config['other_args'] += f" --max_steps_per_epoch {config['max_epoch_steps']}"

        # baseline run...
        config["deepspeed"] = False
        base_file = self.gen_output_name(config, prefix)

        # skip baseline run if it exists.
        if not self.has_loss_data(base_file):
            print("{0}: baseline run.".format(self.id()))
            self.run_BingBertSquad_test(config, base_file)
        else:
            print("{0}: baseline exists.".format(self.id()))

        # DeepSpeed run...
        config["deepspeed"] = True
        print("{0}: DeepSpeed run.".format(self.id()))
        test_file = self.gen_output_name(config, prefix)
        self.run_BingBertSquad_test(config, test_file)

        return self.check_parity(base_file, test_file, r_tol)

    def has_loss_data(self, file_name):
        """Return True when *file_name* exists and contains a non-zero loss."""
        return os.path.exists(file_name) and grep_loss_from_file(file_name) != 0.0

    def check_parity(self, base_file, test_file, r_tol):
        """Compare the final losses of two log files.

        Returns:
            False when either file yields no loss (``0.0``) or when the
            relative difference exceeds ``r_tol``; True otherwise.
        """
        base_loss = grep_loss_from_file(base_file)
        test_loss = grep_loss_from_file(test_file)

        print("baseline loss: {0}, test loss: {1}".format(base_loss, test_loss))

        # 0.0 is the "no loss found" sentinel; it also guards the division.
        if base_loss == 0.0 or test_loss == 0.0:
            return False

        # Written as "<=" so that a NaN difference counts as a parity failure.
        return abs((base_loss - test_loss) / base_loss) <= r_tol


def suite():
    """Assemble the BingBertSquad functional tests in their fixed run order."""
    ordered_names = (
        'test_gpu4_fp16',
        'test_gpu1_fp16',
        'test_gpu4_fp32',
        'test_gpu1_fp32',
    )

    bundle = unittest.TestSuite()
    for test_name in ordered_names:
        bundle.addTest(BingBertSquadFuncTestCase(test_name))
    return bundle


if __name__ == '__main__':
    # failfast=True is passed so the runner stops at the first failing test.
    unittest.TextTestRunner(failfast=True).run(suite())
69 changes: 69 additions & 0 deletions tests/model/BingBertSquad/BingBertSquad_test_common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#

import unittest
import subprocess
import os
import time
import re


class BaseTestCase(unittest.TestCase):
    """Shared helpers for the BingBertSquad model tests.

    Provides log-file naming, output-directory creation, a cleanup command,
    and the launcher that runs ``run_BingBertSquad_sanity.sh`` with its
    output captured in a log file.
    """

    def __init__(self, methodName="DeepSpeed performance test"):
        super(BaseTestCase, self).__init__(methodName)
        self.test_dir = "./test"
        self.baseline_dir = "./baseline"
        # Timestamp taken at construction; used in DeepSpeed log file names.
        self.timestr = time.strftime("%Y%m%d-%H%M%S")

    def gen_output_name(self, test_config, prefix):
        """Build the log file path for a test configuration.

        Args:
            test_config: Dict with ``gpus`` and ``deepspeed``; ``other_args``
                and ``zero`` are optional.
            prefix: String placed at the start of the file name.

        Returns:
            A path under ``self.test_dir`` (DeepSpeed run, name includes the
            timestamp) or ``self.baseline_dir`` (baseline run).
        """
        other_args = test_config.get("other_args", "")
        zero_args = "_zero" if test_config.get("zero") else ""
        # Drop surrounding spaces/dashes/backslashes, then remove inner
        # spaces and double quotes from the argument string.
        other_args = other_args.strip(' -\\').replace(" ", "").replace("\"", "")

        if other_args:
            other_args = "_" + other_args

        if test_config["deepspeed"]:
            file_name = "_gpu{0}_{1}_ds{2}-{3}.log".format(test_config["gpus"],
                                                           other_args,
                                                           zero_args,
                                                           self.timestr)
            save_dir = self.test_dir
        else:
            file_name = "_gpu{0}_{1}.log".format(test_config["gpus"], other_args)
            save_dir = self.baseline_dir

        return os.path.join(save_dir, prefix + file_name)

    def ensure_directory_exists(self, filename):
        """Create the parent directory of *filename* if it is missing.

        A bare file name (no directory component) needs no directory and is
        a no-op; previously this path called ``os.makedirs("")`` and failed.
        """
        dirname = os.path.dirname(filename)
        if dirname:
            # exist_ok avoids the check-then-create race of exists()+makedirs().
            os.makedirs(dirname, exist_ok=True)

    def clean_test_env(self):
        """Run the ``dlts_ssh pkill`` cleanup command, then wait 20 seconds."""
        cmd = "dlts_ssh pkill -9 -f /usr/bin/python"
        print(cmd)
        subprocess.run(cmd, shell=True, check=False, executable='/bin/bash')
        time.sleep(20)

    def run_BingBertSquad_test(self, test_config, output):
        """Launch the sanity script and capture stdout/stderr in *output*.

        Args:
            test_config: Dict with ``gpus``, ``deepspeed`` and, when
                ``deepspeed`` is truthy, ``json``; ``other_args`` is optional.
            output: Path of the log file to (over)write.

        The script's exit status is not checked (``check=False``).
        """
        if test_config["deepspeed"]:
            ds_flag = " -d --deepspeed_config " + test_config["json"]
        else:
            ds_flag = " "

        if "other_args" in test_config:
            other_args = " " + test_config["other_args"]
        else:
            other_args = " "

        cmd = "./run_BingBertSquad_sanity.sh -e 1 -g {0} {1} {2}".format(
            test_config["gpus"],
            other_args,
            ds_flag)

        self.ensure_directory_exists(output)
        with open(output, "w") as f:
            print(cmd)
            subprocess.run(cmd,
                           shell=True,
                           check=False,
                           executable='/bin/bash',
                           stdout=f,
                           stderr=f)
5 changes: 5 additions & 0 deletions tests/model/BingBertSquad/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.

from .BingBertSquad_run_func_test import BingBertSquadFuncTestCase
from .BingBertSquad_run_func_test import suite
25 changes: 25 additions & 0 deletions tests/model/BingBertSquad/deepspeed_bsz24_fp16_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"tensorboard": {
"enabled": false,
"job_name": "MyJob"
},
"zero_optimization": true,
"disable_allgather": false,
"allgather_size": 200000,
"wall_clock_breakdown": false,
"train_batch_size": 24,
"train_micro_batch_size_per_gpu": 3,
"steps_per_print": 1,
"optimizer": {
"type": "Adam",
"params": {
"lr": 3e-5,
"max_grad_norm": 1.0,
"weight_decay": 0.0,
"bias_correction": false
}
},
"fp16": {
"enabled": true
}
}
17 changes: 17 additions & 0 deletions tests/model/BingBertSquad/deepspeed_bsz24_fp32_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"train_batch_size": 24,
"train_micro_batch_size_per_gpu": 3,
"steps_per_print": 1,
"optimizer": {
"type": "Adam",
"params": {
"lr": 3e-5,
"max_grad_norm": 1.0,
"weight_decay": 0.0,
"bias_correction": false
}
},
"fp16": {
"enabled": false
}
}
Loading