diff --git a/orttraining/orttraining/models/bert/main.cc b/orttraining/orttraining/models/bert/main.cc
index 26a3ff806937f..49d43554d1000 100644
--- a/orttraining/orttraining/models/bert/main.cc
+++ b/orttraining/orttraining/models/bert/main.cc
@@ -33,6 +33,29 @@ using namespace onnxruntime::training;
 using namespace onnxruntime::training::tensorboard;
 using namespace std;
 
+static SessionOptions session_options = {
+    ExecutionMode::ORT_SEQUENTIAL,  //execution_mode
+    ExecutionOrder::PRIORITY_BASED,  //execution_order
+    false,  //enable_profiling
+    ORT_TSTR(""),  //optimized_model_filepath
+    true,  //enable_mem_pattern
+    true,  //enable_cpu_mem_arena
+    ORT_TSTR("onnxruntime_profile_"),  //profile_file_prefix
+    "",  //session_logid
+    -1,  //session_log_severity_level
+    0,  //session_log_verbosity_level
+    5,  //max_num_graph_transformation_steps
+    TransformerLevel::Level1,  //graph_optimization_level
+    {},  //intra_op_param
+    {},  //inter_op_param
+    {},  //free_dimension_overrides
+    true,  //use_per_session_threads
+    true,  //thread_pool_allow_spinning
+    false,  //use_deterministic_compute
+    {},  //session_configurations
+    {},  // initializers_to_share_map
+};
+
 struct BertParameters : public TrainingRunner::Parameters {
   int max_sequence_length = 512;
   int max_predictions_per_sequence = 80;
@@ -109,6 +132,7 @@ Status ParseArguments(int argc, char* argv[], BertParameters& params, OrtParamet
       ("iterations_per_loop", "How many steps to make in each estimator call.", cxxopts::value<int>()->default_value("1000"))
       ("max_eval_steps", "Maximum number of eval steps.", cxxopts::value<int>()->default_value("100"))
       ("seed", "Random seed.", cxxopts::value<int64_t>()->default_value("-1"))
+      ("use_deterministic_compute", "Whether to enable deterministic compute.", cxxopts::value<bool>()->default_value("false"))
       ("use_mixed_precision", "Whether to use a mix of fp32 and fp16 arithmetic on GPU.", cxxopts::value<bool>()->default_value("false"))
       ("use_bfloat16", "Whether to use BFloat16 arithmetic on GPU.", cxxopts::value<bool>()->default_value("false"))
       ("enable_adasum", "Whether to use Adasum for allreduction.", cxxopts::value<bool>()->default_value("false"))
@@ -469,6 +493,8 @@ Status ParseArguments(int argc, char* argv[], BertParameters& params, OrtParamet
       std::cout << "Random seed is set to: " << seed << std::endl;
     }
 
+    session_options.use_deterministic_compute = flags["use_deterministic_compute"].as<bool>();
+
     params.enable_gelu_approximation = flags["enable_gelu_approximation"].as<bool>();
     params.attn_dropout_recompute = flags["attn_dropout_recompute"].as<bool>();
     params.gelu_recompute = flags["gelu_recompute"].as<bool>();
@@ -746,7 +772,7 @@ static Status RunPerformanceTest(const BertParameters& params, const Environment
   auto random_perf_data = std::make_shared<RandomDataSet>(num_of_perf_samples, tensor_names, tensor_shapes, tensor_types);
   auto random_perf_data_loader = onnxruntime::make_unique<SingleDataLoader>(random_perf_data, tensor_names);
 
-  TrainingRunner runner{params, env};
+  TrainingRunner runner{params, env, session_options};
   ORT_RETURN_IF_ERROR(runner.Initialize());
   ORT_RETURN_IF_ERROR(runner.Run(random_perf_data_loader.get(), random_perf_data_loader.get()));
 
@@ -756,7 +782,7 @@ static Status RunPerformanceTest(const BertParameters& params, const Environment
 static Status RunTraining(const BertParameters& params, const Environment& env) {
   const size_t max_num_files_preload = 2;
 
-  auto runner = onnxruntime::make_unique<TrainingRunner>(params, env);
+  auto runner = onnxruntime::make_unique<TrainingRunner>(params, env, session_options);
   ORT_RETURN_IF_ERROR(runner->Initialize());
 
   BertParameters params_for_phase;
diff --git a/orttraining/tools/ci_test/results/bert_base.convergence.baseline.mi100.csv b/orttraining/tools/ci_test/results/bert_base.convergence.baseline.mi100.csv
new file mode 100644
index 0000000000000..0bdb749881000
--- /dev/null
+++ b/orttraining/tools/ci_test/results/bert_base.convergence.baseline.mi100.csv
@@ -0,0 +1,11 @@
+step,total_loss,mlm_loss,nsp_loss
+0,11.217,10.5178,0.699256
+5,9.67644,7.52047,2.15598
+10,8.31964,7.54136,0.778281
+15,8.22823,7.54625,0.681978
+20,8.17299,7.49675,0.676236
+25,8.2415,7.5356,0.705902
+30,8.0874,7.39312,0.694279
+35,7.99095,7.25612,0.734829
+40,7.92988,7.25608,0.673804
+45,7.94762,7.27291,0.674713
diff --git a/orttraining/tools/ci_test/run_batch_size_test.py b/orttraining/tools/ci_test/run_batch_size_test.py
index fa5949d568518..046971048185b 100755
--- a/orttraining/tools/ci_test/run_batch_size_test.py
+++ b/orttraining/tools/ci_test/run_batch_size_test.py
@@ -13,6 +13,8 @@ def parse_args():
   parser = argparse.ArgumentParser(description="Runs a BERT batch size test.")
   parser.add_argument("--binary_dir", required=True,
                       help="Path to the ORT binary directory.")
   parser.add_argument("--model_root", required=True,
                       help="Path to the model root directory.")
+  parser.add_argument("--gpu_sku", choices=['V100_16G', 'MI100_32G'], default='V100_16G', required=False,
+                      help="GPU model (e.g. V100_16G, MI100_32G).")
   return parser.parse_args()
@@ -24,7 +26,9 @@ def main():
                                              "max_batch_size",
                                              "max_predictions_per_seq",
                                              "additional_options"])
-  configs = [
+
+  configs = {}
+  configs['V100_16G'] = [
     Config(True, 128, 76, 20, ""),
     Config(True, 512, 11, 80, ""),
     Config(False, 128, 39, 20, ""),
@@ -41,8 +45,15 @@ def main():
     Config(True, 512, 50, 80, "--transformer_layer_recompute"),
   ]
 
+  configs['MI100_32G'] = [
+    Config(True, 128, 201, 20, ""),
+    Config(True, 512, 31, 80, ""),
+    Config(False, 128, 109, 20, ""),
+    Config(False, 512, 16, 80, ""),
+  ]
+
   # run BERT training
-  for config in configs:
+  for config in configs[args.gpu_sku]:
     print("##### testing name - {}-{} #####".format("fp16" if config.enable_mixed_precision else "fp32",
                                                     config.sequence_length))
     cmds = [
diff --git a/orttraining/tools/ci_test/run_convergence_test.py b/orttraining/tools/ci_test/run_convergence_test.py
index 68528e2897ca6..abaacb5734e65 100755
--- a/orttraining/tools/ci_test/run_convergence_test.py
+++ b/orttraining/tools/ci_test/run_convergence_test.py
@@ -20,6 +20,8 @@ def parse_args():
                       help="Path to the training data root directory.")
   parser.add_argument("--model_root", required=True,
                       help="Path to the model root directory.")
+  parser.add_argument("--gpu_sku", choices=['V100_16G', 'MI100_32G'], default='V100_16G', required=False,
+                      help="GPU model (e.g. V100_16G, MI100_32G).")
   return parser.parse_args()
 
 def main():
@@ -49,6 +51,7 @@ def main():
       "--gradient_accumulation_steps", "16",
       "--max_predictions_per_seq=20",
       "--use_mixed_precision",
+      "--use_deterministic_compute",
       "--allreduce_in_fp16",
       "--lambda", "0",
       "--use_nccl",
@@ -57,10 +60,18 @@ def main():
       "--enable_grad_norm_clip=false",
   ]).check_returncode()
 
+  # reference data
+  if args.gpu_sku == 'MI100_32G':
+    reference_csv = "bert_base.convergence.baseline.mi100.csv"
+  elif args.gpu_sku == 'V100_16G':
+    reference_csv = "bert_base.convergence.baseline.csv"
+  else:
+    raise ValueError('Unrecognized gpu_sku {}'.format(args.gpu_sku))
+
   # verify output
   comparison_result = compare_results_files(
       expected_results_path=os.path.join(
-          SCRIPT_DIR, "results", "bert_base.convergence.baseline.csv"),
+          SCRIPT_DIR, "results", reference_csv),
       actual_results_path=convergence_test_output_path,
       field_comparisons={
           "step": Comparisons.eq(),
diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-amd-e2e-test-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-amd-e2e-test-ci-pipeline.yml
new file mode 100644
index 0000000000000..c16ae5971faed
--- /dev/null
+++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-amd-e2e-test-ci-pipeline.yml
@@ -0,0 +1,68 @@
+trigger: none
+
+name: 'orttraining_amd_nightly_$(Date:yyyyMMdd)_$(Rev:r)'
+pool: 'AMD-GPU'
+
+jobs:
+- job: Onnxruntime_Linux_GPU_AMD_Training_E2E_Test
+
+  timeoutInMinutes: 60
+
+  steps:
+  - checkout: self
+    clean: true
+    submodules: recursive
+
+  - script: |-
+      echo "##vso[task.prependpath]/home/ciagent/conda/bin/"
+      echo "##vso[task.prependpath]/home/ciagent/pkg/openmpi-4.0.5/bin/"
+      echo '##vso[task.setvariable variable=LD_LIBRARY_PATH]/home/ciagent/pkg/openmpi-4.0.5/lib/'
+      eval "$('/home/ciagent/conda/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
+      echo "Selecting GPU based on HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES"
+    displayName: 'Initialize environment'
+
+  # update these if the E2E test data changes
+  - script: |-
+      python orttraining/tools/ci_test/download_azure_blob_archive.py \
+        --azure_blob_url https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-06-15T23:17:35.8314853Z \
+        --target_dir training_e2e_test_data \
+        --archive_sha256_digest B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9
+    displayName: 'Download onnxruntime_training_data.zip data'
+
+  - script: |-
+      python tools/ci_build/build.py \
+        --config RelWithDebInfo \
+        --enable_training \
+        --mpi_home /home/ciagent/pkg/openmpi-4.0.5 \
+        --use_rocm \
+        --rocm_home /opt/rocm \
+        --nccl_home /opt/rocm \
+        --update \
+        --build_dir ./build \
+        --build \
+        --parallel 8 \
+        --build_wheel \
+        --skip_tests
+    displayName: 'Build onnxruntime'
+
+  - script: |-
+      cd ./build/RelWithDebInfo &&\
+      ../../tools/ci_build/github/pai/pai_test_launcher.sh
+    displayName: 'Run unit tests'
+
+  - script: |-
+      python orttraining/tools/ci_test/run_batch_size_test.py \
+        --binary_dir build/RelWithDebInfo \
+        --model_root training_e2e_test_data/models \
+        --gpu_sku MI100_32G
+    displayName: 'Run batch size test'
+    condition: succeededOrFailed() # ensure all tests are run
+
+  - script: |-
+      python orttraining/tools/ci_test/run_convergence_test.py \
+        --binary_dir build/RelWithDebInfo \
+        --model_root training_e2e_test_data/models \
+        --training_data_root training_e2e_test_data/data \
+        --gpu_sku MI100_32G
+    displayName: 'Run convergence test'
+    condition: succeededOrFailed() # ensure all tests are run