Merged
Changes from all commits
26 commits
45eac22
Partial updating of ROCM reduction code.
jessebenson Jan 13, 2021
a4624f6
Update reduction_all.cu
jessebenson Jan 13, 2021
441ac35
Add reduce template parameters.
jessebenson Jan 14, 2021
3424767
miopen common
jessebenson Jan 14, 2021
75894ec
Reuse CUDA's reduction_functions.cc
jessebenson Jan 15, 2021
01b48e2
Reduction ops.
jessebenson Jan 15, 2021
172016d
Update remaining reduction ops to use MIOpen. double datatype is not…
jessebenson Jan 15, 2021
1f29c56
Disable a couple more unsupported tests.
jessebenson Jan 15, 2021
ddd1017
Code formatting.
jessebenson Jan 15, 2021
30e5ea2
Delete ROCM-specific reduction code that is identical to CUDA reducti…
jessebenson Jan 19, 2021
74bea5a
Fix scratch buffer early free.
jessebenson Jan 21, 2021
32d02ab
Fix merge conflict.
jessebenson Jan 26, 2021
06c7b71
first attempt nightly amd ci pipeline
Feb 1, 2021
241d488
try fix bad yaml file
Feb 1, 2021
5462356
try again with corrected model directory
Feb 1, 2021
4a725b1
add convergence test as well
Feb 1, 2021
8d9b000
update reference loss for amd mi100
Feb 1, 2021
04e0f8d
include mi100 test results csv
Feb 1, 2021
da9b78d
merge jesseb/rocm-reduction to enable deterministic compute
Feb 1, 2021
a0bf453
update the mi100 convergence test reference values
Feb 1, 2021
843ad79
update batch sizes for mi100 32g
Feb 2, 2021
730ba21
fix gpu sku for run_convergence_test.py
Feb 2, 2021
9a252db
merge with master
Feb 8, 2021
900269a
undo unrelated changes to master
Feb 8, 2021
3dfabe7
pr comments
Feb 12, 2021
d68462b
pr comment
Feb 12, 2021
30 changes: 28 additions & 2 deletions orttraining/orttraining/models/bert/main.cc
@@ -33,6 +33,29 @@ using namespace onnxruntime::training;
using namespace onnxruntime::training::tensorboard;
using namespace std;

static SessionOptions session_options = {
ExecutionMode::ORT_SEQUENTIAL, //execution_mode
ExecutionOrder::PRIORITY_BASED, //execution_order
false, //enable_profiling
ORT_TSTR(""), //optimized_model_filepath
true, //enable_mem_pattern
true, //enable_cpu_mem_arena
ORT_TSTR("onnxruntime_profile_"), //profile_file_prefix
"", //session_logid
-1, //session_log_severity_level
0, //session_log_verbosity_level
5, //max_num_graph_transformation_steps
TransformerLevel::Level1, //graph_optimization_level
{}, //intra_op_param
{}, //inter_op_param
{}, //free_dimension_overrides
true, //use_per_session_threads
true, //thread_pool_allow_spinning
false, //use_deterministic_compute
{}, //session_configurations
{}, // initializers_to_share_map
};

struct BertParameters : public TrainingRunner::Parameters {
int max_sequence_length = 512;
int max_predictions_per_sequence = 80;
@@ -109,6 +132,7 @@ Status ParseArguments(int argc, char* argv[], BertParameters& params, OrtParamet
("iterations_per_loop", "How many steps to make in each estimator call.", cxxopts::value<int>()->default_value("1000"))
("max_eval_steps", "Maximum number of eval steps.", cxxopts::value<int>()->default_value("100"))
("seed", "Random seed.", cxxopts::value<int64_t>()->default_value("-1"))
("use_deterministic_compute", "Whether to enable deterministic compute.", cxxopts::value<bool>()->default_value("false"))
("use_mixed_precision", "Whether to use a mix of fp32 and fp16 arithmetic on GPU.", cxxopts::value<bool>()->default_value("false"))
("use_bfloat16", "Whether to use BFloat16 arithmetic on GPU.", cxxopts::value<bool>()->default_value("false"))
("enable_adasum", "Whether to use Adasum for allreduction.", cxxopts::value<bool>()->default_value("false"))
@@ -469,6 +493,8 @@ Status ParseArguments(int argc, char* argv[], BertParameters& params, OrtParamet
std::cout << "Random seed is set to: " << seed << std::endl;
}

session_options.use_deterministic_compute = flags["use_deterministic_compute"].as<bool>();

params.enable_gelu_approximation = flags["enable_gelu_approximation"].as<bool>();
params.attn_dropout_recompute = flags["attn_dropout_recompute"].as<bool>();
params.gelu_recompute = flags["gelu_recompute"].as<bool>();
@@ -746,7 +772,7 @@ static Status RunPerformanceTest(const BertParameters& params, const Environment
auto random_perf_data = std::make_shared<RandomDataSet>(num_of_perf_samples, tensor_names, tensor_shapes, tensor_types);
auto random_perf_data_loader = onnxruntime::make_unique<SingleDataLoader>(random_perf_data, tensor_names);

TrainingRunner runner{params, env};
TrainingRunner runner{params, env, session_options};
ORT_RETURN_IF_ERROR(runner.Initialize());
ORT_RETURN_IF_ERROR(runner.Run(random_perf_data_loader.get(), random_perf_data_loader.get()));

@@ -756,7 +782,7 @@ static Status RunPerformanceTest(const BertParameters& params, const Environment
static Status RunTraining(const BertParameters& params, const Environment& env) {
const size_t max_num_files_preload = 2;

auto runner = onnxruntime::make_unique<TrainingRunner>(params, env);
auto runner = onnxruntime::make_unique<TrainingRunner>(params, env, session_options);
ORT_RETURN_IF_ERROR(runner->Initialize());

BertParameters params_for_phase;
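The main.cc changes above plumb a new --use_deterministic_compute command-line flag through SessionOptions and into TrainingRunner. For orientation, a minimal Python sketch of flipping the same knob, assuming the Python bindings expose SessionOptions.use_deterministic_compute (the model path is a placeholder):

import onnxruntime as ort

# Deterministic compute restricts kernel/algorithm selection so repeated runs
# produce reproducible results, which is what lets the convergence test below
# compare losses against a fixed baseline.
so = ort.SessionOptions()
so.use_deterministic_compute = True  # same field the C++ runner now sets
session = ort.InferenceSession("model.onnx", sess_options=so)  # placeholder path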
11 changes: 11 additions & 0 deletions orttraining/tools/ci_test/results/bert_base.convergence.baseline.mi100.csv
@@ -0,0 +1,11 @@
step,total_loss,mlm_loss,nsp_loss
0,11.217,10.5178,0.699256
5,9.67644,7.52047,2.15598
10,8.31964,7.54136,0.778281
15,8.22823,7.54625,0.681978
20,8.17299,7.49675,0.676236
25,8.2415,7.5356,0.705902
30,8.0874,7.39312,0.694279
35,7.99095,7.25612,0.734829
40,7.92988,7.25608,0.673804
45,7.94762,7.27291,0.674713
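The CSV above is the new MI100 reference trace (step, total_loss, mlm_loss, nsp_loss over the first 50 steps). Below is a standalone sketch of how such a baseline can be checked against a fresh run; the real pipeline uses compare_results_files from compare_results.py, and the tolerance here is an assumed value, not the one the CI uses:

import csv

def load_rows(path):
    with open(path, newline="") as f:
        return list(csv.DictReader(f))

def matches_baseline(expected_path, actual_path, rel_tol=1e-3):
    expected, actual = load_rows(expected_path), load_rows(actual_path)
    if len(expected) != len(actual):
        return False
    for e, a in zip(expected, actual):
        if int(e["step"]) != int(a["step"]):  # steps must line up exactly
            return False
        for field in ("total_loss", "mlm_loss", "nsp_loss"):
            ev, av = float(e[field]), float(a[field])
            if abs(ev - av) > rel_tol * max(abs(ev), 1.0):  # relative tolerance
                return False
    return True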
15 changes: 13 additions & 2 deletions orttraining/tools/ci_test/run_batch_size_test.py
@@ -13,6 +13,8 @@ def parse_args():
parser = argparse.ArgumentParser(description="Runs a BERT batch size test.")
parser.add_argument("--binary_dir", required=True, help="Path to the ORT binary directory.")
parser.add_argument("--model_root", required=True, help="Path to the model root directory.")
parser.add_argument("--gpu_sku", choices=['V100_16G', 'MI100_32G'], default='V100_16G', required=False,
help="GPU model (e.g. V100_16G, MI100_32G).")
return parser.parse_args()


@@ -24,7 +26,9 @@ def main():
"max_batch_size",
"max_predictions_per_seq",
"additional_options"])
configs = [

configs = {}
configs['V100_16G'] = [
Config(True, 128, 76, 20, ""),
Config(True, 512, 11, 80, ""),
Config(False, 128, 39, 20, ""),
@@ -41,8 +45,15 @@ def main():
Config(True, 512, 50, 80, "--transformer_layer_recompute"),
]

configs['MI100_32G'] = [
Config(True, 128, 201, 20, ""),
Config(True, 512, 31, 80, ""),
Config(False, 128, 109, 20, ""),
Config(False, 512, 16, 80, ""),
]

# run BERT training
for config in configs:
for config in configs[args.gpu_sku]:
print("##### testing name - {}-{} #####".format("fp16" if config.enable_mixed_precision else "fp32",
config.sequence_length))
cmds = [
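Each Config row selected by --gpu_sku is expanded into one training invocation. A rough sketch of that expansion follows; the binary name, model path, and flag names are assumptions for illustration, not copied from this diff:

import os
import subprocess
from collections import namedtuple

# Field order matches the Config namedtuple visible in the diff above.
Config = namedtuple("Config", ["enable_mixed_precision", "sequence_length",
                               "max_batch_size", "max_predictions_per_seq",
                               "additional_options"])

def run_config(binary_dir, model_root, config):
    cmds = [
        os.path.join(binary_dir, "onnxruntime_training_bert"),  # assumed binary name
        "--model_name", os.path.join(model_root, "bert-base"),  # assumed model layout
        "--train_batch_size", str(config.max_batch_size),
        "--max_seq_length", str(config.sequence_length),
        "--max_predictions_per_seq", str(config.max_predictions_per_seq),
    ]
    if config.enable_mixed_precision:
        cmds.append("--use_mixed_precision")
    if config.additional_options:
        cmds.extend(config.additional_options.split())
    subprocess.run(cmds).check_returncode()

# e.g. run_config("build/RelWithDebInfo", "training_e2e_test_data/models",
#                 Config(True, 128, 201, 20, ""))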
13 changes: 12 additions & 1 deletion orttraining/tools/ci_test/run_convergence_test.py
@@ -20,6 +20,8 @@ def parse_args():
help="Path to the training data root directory.")
parser.add_argument("--model_root", required=True,
help="Path to the model root directory.")
parser.add_argument("--gpu_sku", choices=['V100_16G', 'MI100_32G'], default='V100_16G', required=False,
help="GPU model (e.g. V100_16G, MI100_32G).")
return parser.parse_args()

def main():
@@ -49,6 +51,7 @@ def main():
"--gradient_accumulation_steps", "16",
"--max_predictions_per_seq=20",
"--use_mixed_precision",
"--use_deterministic_compute",
"--allreduce_in_fp16",
"--lambda", "0",
"--use_nccl",
@@ -57,10 +60,18 @@ def main():
"--enable_grad_norm_clip=false",
]).check_returncode()

# reference data
if args.gpu_sku == 'MI100_32G':
reference_csv = "bert_base.convergence.baseline.mi100.csv"
elif args.gpu_sku == 'V100_16G':
reference_csv = "bert_base.convergence.baseline.csv"
else:
raise ValueError('Unrecognized gpu_sku {}'.format(args.gpu_sku))

# verify output
comparison_result = compare_results_files(
expected_results_path=os.path.join(
SCRIPT_DIR, "results", "bert_base.convergence.baseline.csv"),
SCRIPT_DIR, "results", reference_csv),
actual_results_path=convergence_test_output_path,
field_comparisons={
"step": Comparisons.eq(),
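run_convergence_test.py now picks the MI100 baseline CSV when --gpu_sku MI100_32G is passed and feeds it to compare_results_files. The field_comparisons mapping is truncated in this view after "step"; here is a sketch of what the full mapping plausibly looks like, where the float tolerance helper and its value are assumptions rather than the actual compare_results.py API:

class Comparisons:
    @staticmethod
    def eq():
        return lambda expected, actual: expected == actual  # exact match, used for "step"

    @staticmethod
    def float_close(rel_tol):
        # assumed helper: relative-tolerance comparison for the loss columns
        return lambda expected, actual: (
            abs(float(expected) - float(actual))
            <= rel_tol * max(abs(float(expected)), 1.0))

field_comparisons = {
    "step": Comparisons.eq(),
    "total_loss": Comparisons.float_close(1e-3),  # tolerance value assumed
    "mlm_loss": Comparisons.float_close(1e-3),
    "nsp_loss": Comparisons.float_close(1e-3),
}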
@@ -0,0 +1,68 @@
trigger: none

name: 'orttraining_amd_nightly_$(Date:yyyyMMdd)_$(Rev:r)'
pool: 'AMD-GPU'

jobs:
- job: Onnxruntime_Linux_GPU_AMD_Training_E2E_Test

timeoutInMinutes: 60

steps:
- checkout: self
clean: true
submodules: recursive

- script: |-
echo "##vso[task.prependpath]/home/ciagent/conda/bin/"
echo "##vso[task.prependpath]/home/ciagent/pkg/openmpi-4.0.5/bin/"
echo '##vso[task.setvariable variable=LD_LIBRARY_PATH]/home/ciagent/pkg/openmpi-4.0.5/lib/'
eval "$('/home/ciagent/conda/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
echo "Selecting GPU based on HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES"
displayName: 'Initialize environment'

# update these if the E2E test data changes
- script: |-
python orttraining/tools/ci_test/download_azure_blob_archive.py \
--azure_blob_url https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-06-15T23:17:35.8314853Z \
--target_dir training_e2e_test_data \
--archive_sha256_digest B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9
displayName: 'Download onnxruntime_training_data.zip data'

- script: |-
python tools/ci_build/build.py \
--config RelWithDebInfo \
--enable_training \
--mpi_home /home/ciagent/pkg/openmpi-4.0.5 \
--use_rocm \
--rocm_home /opt/rocm \
--nccl_home /opt/rocm \
--update \
--build_dir ./build \
--build \
--parallel 8 \
--build_wheel \
--skip_tests
displayName: 'Build onnxruntime'

- script: |-
cd ./build/RelWithDebInfo &&\
../../tools/ci_build/github/pai/pai_test_launcher.sh
displayName: 'Run unit tests'

- script: |-
python orttraining/tools/ci_test/run_batch_size_test.py \
--binary_dir build/RelWithDebInfo \
--model_root training_e2e_test_data/models \
--gpu_sku MI100_32G
displayName: 'Run batch size test'
condition: succeededOrFailed() # ensure all tests are run

- script: |-
python orttraining/tools/ci_test/run_convergence_test.py \
--binary_dir build/RelWithDebInfo \
--model_root training_e2e_test_data/models \
--training_data_root training_e2e_test_data/data \
--gpu_sku MI100_32G
displayName: 'Run convergence test'
condition: succeededOrFailed() # ensure all tests are run
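The download step in the pipeline above passes --archive_sha256_digest, so the E2E test data is integrity-checked before use. A minimal standalone sketch of that check; download_azure_blob_archive.py performs its own verification, and this only illustrates the idea:

import hashlib

def verify_sha256(path, expected_hex_digest):
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # stream in 1 MiB chunks
            h.update(chunk)
    actual = h.hexdigest()
    if actual.lower() != expected_hex_digest.lower():
        raise RuntimeError(
            f"SHA256 mismatch: expected {expected_hex_digest}, got {actual}")

# e.g. verify_sha256("onnxruntime_training_data.zip",
#                    "B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9")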