From d1bd5ab8da3c28153f0366eb93cae70224acced4 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sun, 26 Dec 2021 17:33:25 -0800 Subject: [PATCH 01/10] [benchmark tool] trainer-benchmark.py --- scripts/benchmark/trainer-benchmark.py | 236 +++++++++++++++++++++++++ 1 file changed, 236 insertions(+) create mode 100755 scripts/benchmark/trainer-benchmark.py diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py new file mode 100755 index 000000000000..06680b4a5df1 --- /dev/null +++ b/scripts/benchmark/trainer-benchmark.py @@ -0,0 +1,236 @@ +#!/usr/bin/env python + +# HF Trainer benchmarking tool +# +# This tool can be used to run and compare multiple dimensions of the HF Trainers args +# +# The main idea is: +# ./trainer-benchmark.py --base-cmd '' \ +# --dims '--tf32 0 --tf32 1' '--fp16 0 --fp16 1 --bf16 1' \ +# --metric-key train_samples_per_second +# +# --dims allows you to compare multiple dimensions. +# +# as the first dimension has 2 options and the second 3, this will run the trainer 6 times adding +# one of: +# +# --tf32 0 --fp16 0 +# --tf32 0 --fp16 1 +# --tf32 0 --bf16 1 +# --tf32 1 --fp16 0 +# --tf32 1 --fp16 1 +# --tf32 1 --bf16 1 +# +# and print the results. This is just a cartesian product - and more than 2 dimensions can be used. +# +# Here is a full example of a train: +# +# CUDA_VISIBLE_DEVICES=0 ./scripts/benchmark/trainer-benchmark.py \ +# --base-cmd ' \ +# examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --output_dir output_dir \ +# --do_train --label_smoothing 0.1 --logging_strategy no --save_strategy no --per_device_train_batch_size 8 \ +# --max_source_length 512 --max_target_length 512 --num_train_epochs 1 --overwrite_output_dir \ +# --source_lang en --target_lang ro --dataset_name wmt16 --dataset_config "ro-en" \ +# --source_prefix "translate English to Romanian: " --warmup_steps 50 \ +# --max_train_samples 5000 --dataloader_num_workers 2 \ +# ' \ +# --dims '--tf32 0 --tf32 1' '--fp16 0 --fp16 1 --bf16 1' \ +# --base-dim '--tf32 0 --fp16 0' \ +# --metric-key train_samples_per_second --repeat-times 1 +# +# and here a possible output: +# +# *** Results: train_samples_per_second +# +# | Variations | Result | % | +# | ----------------- | ------ | ----- | +# | --tf32 0 --fp16 0 | 31.95 | 100% | +# | --tf32 0 --fp16 1 | 47.88 | 149% | +# | --tf32 0 --bf16 1 | 35.04 | 109% | +# | --tf32 1 --fp16 0 | 35.47 | 111% | +# | --tf32 1 --fp16 1 | 47.82 | 149% | +# | --tf32 1 --bf16 1 | 35.11 | 109% | +# +# So you can quickly compare the different outcomes. +# +# Typically running each experiment once is enough, but if the environment is unstable you can +# re-run each multiple times, e.g., 3 using --repeat-times 3 and it will report the average results. +# +# by default it'll use the worst result as the base line to use as 100% and then compare the rest to +# it as can be seen from the table, but you can also specify which combination is the one to use as +# the baseline, e.g., to change to another entry use: --base-dim '--tf32 1 --fp16 0' +# +# --metric-key is there to tell the program which metrics to compare - the different metric keys are +# inside output_dir/all_results.json. 
e.g., to measure eval performance instead of train use +# --metric-key eval_samples_per_second + + +import argparse +import io +import itertools +import json +import re +import shlex +import subprocess +import sys +from statistics import fmean + +from tqdm import tqdm + + +def get_base_cmd(args, output_dir): + + # unwrap multi-line input + args.base_cmd = re.sub(r"\\", " ", args.base_cmd) + args.base_cmd = re.sub(r"\n", " ", args.base_cmd) + + # remove --output_dir if any and set our own + args.base_cmd = re.sub("--output_dir\s+[^\s]+", "", args.base_cmd) + args.base_cmd += f"--output_dir {output_dir} " + + # ensure we have --overwrite_output_dir + args.base_cmd = re.sub("--overwrite_output_dir\s+", "", args.base_cmd) + args.base_cmd += "--overwrite_output_dir " + + return [sys.executable] + shlex.split(args.base_cmd) + + +def process_run(id, cmd, opt, repeat_times, output_dir, metric_key, verbose): + results = [] + preamble = f"{id}: {opt}" + outcome = f"{preamble}: " + for i in tqdm(range(repeat_times), desc=preamble, leave=False): + result = process_run_single(id, cmd, opt, output_dir, metric_key, verbose) + if result != -1: + results.append(result) + outcome += "✓" + else: + outcome += "✘" + outcome = f"\33[2K\r{outcome}" + if len(results): + mean_result = round(fmean(results), 2) + results_str = f"{outcome} {mean_result}" + if len(results) > 1: + results_str += f" ({[round(x, 2) for x in results]})" + print(results_str) + return mean_result + else: + print(outcome) + return -1 + + +def process_run_single(id, cmd, opt, output_dir, metric_key, verbose): + # enable to debug everything but the run itself, to do it fast and see the progress + # from random import randint + # from time import sleep + # sleep(3) + # return randint(100, 300) + + result = subprocess.run(cmd, capture_output=True, text=True) + + if verbose: + print("STDOUT", result.stdout) + print("STDERR", result.stderr) + + if result.returncode != 0: + if verbose: + print("failed") + return -1 + + filename = f"{output_dir}/all_results.json" + with io.open(filename, "r", encoding="utf-8") as f: + metrics = json.load(f) + return metrics[metric_key] + + +def process_results(results, metric_key, base_dim): + + print(f"\n*** Results: {metric_key}\n") + + col_opt, col_result, col_relative = "Variations", "Result", "%" + width_opt = max(len(k) for k in list(results.keys()) + [col_opt]) + width_metric = max(len(str(v)) for v in list(results.values()) + [col_result]) + width_percent = 5 + + if base_dim is not None and base_dim in results: + sentinel_value = results[base_dim] + else: + # if no match, use the minimal value as the sentinel + sentinel_value = min(v for v in results.values() if v != -1) + + print(f"| {col_opt:^{width_opt}} | {col_result:^{width_metric}} | {col_relative:^{width_percent}} |") + print(f"| {'-'*width_opt:{width_opt}} | {'-'*width_metric:{width_metric}} | {'-'*width_percent:{width_percent}} |") + for key, value in results.items(): + if value != -1: + percent = f"{int(100*value/sentinel_value)}%" + value = f"{value:.02f}" + else: + percent = "✘" + value = "✘" + print(f"| {key:{width_opt}} | {value:>{width_metric}} | {percent:>{width_percent}} |") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--base-cmd", + default=None, + type=str, + required=True, + help="Base cmd", + ) + parser.add_argument( + "--dims", + default=None, + type=str, + nargs="+", + required=True, + help="Dimension args", + ) + parser.add_argument( + "--base-dim", + default=None, + type=str, + help="Dimension base 
line arg. if None the minimal value will be used to compare against", + ) + parser.add_argument( + "--metric-key", + default=None, + type=str, + required=True, + help="Metric key in output_dir/all_results.json, e.g., train_samples_per_second", + ) + parser.add_argument( + "--repeat-times", + default=1, + type=int, + help="How many times to re-run each combination - an average will be reported", + ) + parser.add_argument( + "--verbose", + default=False, + action="store_true", + help="Whether to show the outputs of each run or just the benchmark progress", + ) + args = parser.parse_args() + + output_dir = "output_benchmark" + base_cmd = get_base_cmd(args, output_dir) + + results = {} + dims = [list(map(str.strip, re.split(r"(?=--)", x)[1:])) for x in args.dims] + # cartesian product of dimensions and then converted back into cmd-line arg strings + opts = list(map(" ".join, itertools.product(*dims))) + + print(f"\n*** Running {len(opts)} benchmarks:") + print(f"Base command: {' '.join(base_cmd)}") + + for id, opt in enumerate(tqdm(opts, desc="Total completion: ", leave=False)): + cmd = base_cmd + opt.split() + results[opt] = process_run(id + 1, cmd, opt, args.repeat_times, output_dir, args.metric_key, args.verbose) + + process_results(results, args.metric_key, args.base_dim) + + +if __name__ == "__main__": + main() From d1c9ad2cf169ea08ffdeb98d7314bf69febaddb7 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 27 Dec 2021 11:36:55 -0800 Subject: [PATCH 02/10] improve --- scripts/benchmark/trainer-benchmark.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py index 06680b4a5df1..c325c3f5789a 100755 --- a/scripts/benchmark/trainer-benchmark.py +++ b/scripts/benchmark/trainer-benchmark.py @@ -38,7 +38,7 @@ # --base-dim '--tf32 0 --fp16 0' \ # --metric-key train_samples_per_second --repeat-times 1 # -# and here a possible output: +# and here is a possible output: # # *** Results: train_samples_per_second # @@ -54,10 +54,10 @@ # So you can quickly compare the different outcomes. # # Typically running each experiment once is enough, but if the environment is unstable you can -# re-run each multiple times, e.g., 3 using --repeat-times 3 and it will report the average results. +# re-run each multiple times, e.g., 3 using --repeat-times 3 and it will report the averaged results. 
# -# by default it'll use the worst result as the base line to use as 100% and then compare the rest to -# it as can be seen from the table, but you can also specify which combination is the one to use as +# By default it'll use the worst result as the base line to use as 100% and then compare the rest to +# it as can be seen from the table above, but you can also specify which combination is the one to use as # the baseline, e.g., to change to another entry use: --base-dim '--tf32 1 --fp16 0' # # --metric-key is there to tell the program which metrics to compare - the different metric keys are @@ -81,16 +81,15 @@ def get_base_cmd(args, output_dir): # unwrap multi-line input - args.base_cmd = re.sub(r"\\", " ", args.base_cmd) - args.base_cmd = re.sub(r"\n", " ", args.base_cmd) + args.base_cmd = re.sub(r"[\\\n]+", " ", args.base_cmd) # remove --output_dir if any and set our own args.base_cmd = re.sub("--output_dir\s+[^\s]+", "", args.base_cmd) - args.base_cmd += f"--output_dir {output_dir} " + args.base_cmd += f" --output_dir {output_dir}" # ensure we have --overwrite_output_dir args.base_cmd = re.sub("--overwrite_output_dir\s+", "", args.base_cmd) - args.base_cmd += "--overwrite_output_dir " + args.base_cmd += " --overwrite_output_dir" return [sys.executable] + shlex.split(args.base_cmd) @@ -217,14 +216,15 @@ def main(): output_dir = "output_benchmark" base_cmd = get_base_cmd(args, output_dir) - results = {} + # split each dimension into its --foo variations dims = [list(map(str.strip, re.split(r"(?=--)", x)[1:])) for x in args.dims] - # cartesian product of dimensions and then converted back into cmd-line arg strings + # build a cartesian product of dimensions and convert those back into cmd-line arg strings opts = list(map(" ".join, itertools.product(*dims))) print(f"\n*** Running {len(opts)} benchmarks:") print(f"Base command: {' '.join(base_cmd)}") + results = {} for id, opt in enumerate(tqdm(opts, desc="Total completion: ", leave=False)): cmd = base_cmd + opt.split() results[opt] = process_run(id + 1, cmd, opt, args.repeat_times, output_dir, args.metric_key, args.verbose) From 51929f047371cb0e7e75c1dca9106ee1e737414d Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 27 Dec 2021 22:39:05 -0800 Subject: [PATCH 03/10] massive rework/expansion --- scripts/benchmark/trainer-benchmark.py | 208 +++++++++++++++++-------- 1 file changed, 146 insertions(+), 62 deletions(-) diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py index c325c3f5789a..1cb0776b4880 100755 --- a/scripts/benchmark/trainer-benchmark.py +++ b/scripts/benchmark/trainer-benchmark.py @@ -7,7 +7,7 @@ # The main idea is: # ./trainer-benchmark.py --base-cmd '' \ # --dims '--tf32 0 --tf32 1' '--fp16 0 --fp16 1 --bf16 1' \ -# --metric-key train_samples_per_second +# --target-metric-key train_samples_per_second # # --dims allows you to compare multiple dimensions. # @@ -36,7 +36,7 @@ # ' \ # --dims '--tf32 0 --tf32 1' '--fp16 0 --fp16 1 --bf16 1' \ # --base-dim '--tf32 0 --fp16 0' \ -# --metric-key train_samples_per_second --repeat-times 1 +# --target-metric-key train_samples_per_second --repeat-times 1 # # and here is a possible output: # @@ -56,13 +56,13 @@ # Typically running each experiment once is enough, but if the environment is unstable you can # re-run each multiple times, e.g., 3 using --repeat-times 3 and it will report the averaged results. 
# -# By default it'll use the worst result as the base line to use as 100% and then compare the rest to +# By default it'll use the lowest result as the base line to use as 100% and then compare the rest to # it as can be seen from the table above, but you can also specify which combination is the one to use as # the baseline, e.g., to change to another entry use: --base-dim '--tf32 1 --fp16 0' # -# --metric-key is there to tell the program which metrics to compare - the different metric keys are +# --target-metric-key is there to tell the program which metrics to compare - the different metric keys are # inside output_dir/all_results.json. e.g., to measure eval performance instead of train use -# --metric-key eval_samples_per_second +# --target-metric-key eval_samples_per_second import argparse @@ -73,8 +73,10 @@ import shlex import subprocess import sys +from pathlib import Path from statistics import fmean +import pandas as pd from tqdm import tqdm @@ -94,79 +96,131 @@ def get_base_cmd(args, output_dir): return [sys.executable] + shlex.split(args.base_cmd) -def process_run(id, cmd, opt, repeat_times, output_dir, metric_key, verbose): - results = [] +def process_run_single(id, cmd, opt, output_dir, target_metric_key, metric_keys, verbose): + # enable to debug everything but the run itself, to do it fast and see the progress + if 1: + import random + from random import randint + from time import sleep + + sleep(0) + return dict( + {k: randint(1, 30) for k in metric_keys}, **{target_metric_key: random.choice([-1, 10, 100, 55, 222])} + ) + + result = subprocess.run(cmd, capture_output=True, text=True) + + if verbose: + print("STDOUT", result.stdout) + print("STDERR", result.stderr) + + # save the streams + prefix = opt.replace(" ", "-") + with open(Path(output_dir) / f"{prefix}.stdout.txt", "w") as f: + f.write(result.stdout) + with open(Path(output_dir) / f"{prefix}.stderr.txt", "w") as f: + f.write(result.stderr) + + if result.returncode != 0: + if verbose: + print("failed") + return {target_metric_key: -1} + + with io.open(f"{output_dir}/all_results.json", "r", encoding="utf-8") as f: + metrics = json.load(f) + + # filter out just the keys we want + return {k: v for k, v in metrics.items() if k in metric_keys} + + +def process_run(id, cmd, opt_key, opt, target_metric_key, report_metric_keys, repeat_times, output_dir, verbose): + metrics = [] preamble = f"{id}: {opt}" outcome = f"{preamble}: " + metric_keys = set(report_metric_keys + [target_metric_key]) for i in tqdm(range(repeat_times), desc=preamble, leave=False): - result = process_run_single(id, cmd, opt, output_dir, metric_key, verbose) + single_run_metrics = process_run_single(id, cmd, opt, output_dir, target_metric_key, metric_keys, verbose) + result = single_run_metrics[target_metric_key] if result != -1: - results.append(result) + metrics.append(single_run_metrics) outcome += "✓" else: outcome += "✘" outcome = f"\33[2K\r{outcome}" - if len(results): - mean_result = round(fmean(results), 2) - results_str = f"{outcome} {mean_result}" - if len(results) > 1: - results_str += f" ({[round(x, 2) for x in results]})" + successful_runs = len(metrics) + if successful_runs > 0: + mean_metrics = {k: fmean([metrics[i][k] for i in range(successful_runs)]) for k in metrics[0].keys()} + mean_target = round(mean_metrics[target_metric_key], 2) + results_str = f"{outcome} {mean_target}" + if successful_runs > 1: + results_str += f" ({[round(x, 2) for x in mean_metrics[target_metric_key]]})" print(results_str) - return mean_result + 
mean_metrics[opt_key] = opt + return mean_metrics else: print(outcome) - return -1 + return {opt_key: opt, target_metric_key: -1} -def process_run_single(id, cmd, opt, output_dir, metric_key, verbose): - # enable to debug everything but the run itself, to do it fast and see the progress - # from random import randint - # from time import sleep - # sleep(3) - # return randint(100, 300) +def get_versions(): + import datetime - result = subprocess.run(cmd, capture_output=True, text=True) + import torch - if verbose: - print("STDOUT", result.stdout) - print("STDERR", result.stderr) + import transformers - if result.returncode != 0: - if verbose: - print("failed") - return -1 + properties = torch.cuda.get_device_properties(torch.device("cuda")) + return f""" +Datetime : {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} +transformers: {transformers.__version__} +torch : {torch.__version__} +cuda : {torch.version.cuda} +{torch.cuda.device_count()} GPUs : {properties.name}, {properties.total_memory/2**30:0.2f}GB +""" - filename = f"{output_dir}/all_results.json" - with io.open(filename, "r", encoding="utf-8") as f: - metrics = json.load(f) - return metrics[metric_key] +def process_results(results, target_metric_key, report_metric_keys, base_dim, table_format, output_dir): -def process_results(results, metric_key, base_dim): + df = pd.DataFrame(results) + variation_key = "variation" + diff_key = "diff_%" - print(f"\n*** Results: {metric_key}\n") + sentinel_value = -1 + if base_dim is not None and len(df[df.variation == base_dim]): + # this may still return -1 + sentinel_value = df.loc[df.variation == base_dim][target_metric_key] + if sentinel_value == -1: + # as a fallback, use the minimal value as the sentinel + sentinel_value = df.loc[df[target_metric_key] != -1][target_metric_key].min() - col_opt, col_result, col_relative = "Variations", "Result", "%" - width_opt = max(len(k) for k in list(results.keys()) + [col_opt]) - width_metric = max(len(str(v)) for v in list(results.values()) + [col_result]) - width_percent = 5 + # create diff column + if sentinel_value != -1: + df[diff_key] = df.apply( + lambda r: int(100 * r[target_metric_key] / sentinel_value) if r[target_metric_key] != -1 else "✘", + axis="columns", + ) - if base_dim is not None and base_dim in results: - sentinel_value = results[base_dim] - else: - # if no match, use the minimal value as the sentinel - sentinel_value = min(v for v in results.values() if v != -1) - - print(f"| {col_opt:^{width_opt}} | {col_result:^{width_metric}} | {col_relative:^{width_percent}} |") - print(f"| {'-'*width_opt:{width_opt}} | {'-'*width_metric:{width_metric}} | {'-'*width_percent:{width_percent}} |") - for key, value in results.items(): - if value != -1: - percent = f"{int(100*value/sentinel_value)}%" - value = f"{value:.02f}" - else: - percent = "✘" - value = "✘" - print(f"| {key:{width_opt}} | {value:>{width_metric}} | {percent:>{width_percent}} |") + # deal with failed runs + df[target_metric_key] = df.apply( + lambda r: r[target_metric_key] if r[target_metric_key] != -1 else "✘", axis="columns" + ) + + # re-order columns + cols = [variation_key, target_metric_key, diff_key, *report_metric_keys] + df = df.reindex(cols, axis="columns") # reorder cols + + # capitalize + df = df.rename(str.capitalize, axis="columns") + + # make the cols as narrow as possible + linebreak = "
" if table_format == "github" else "\n" + df = df.rename(lambda c: c.replace("_", linebreak), axis="columns") + + print("\n*** Results:\n") + print(df.to_markdown(index=False)) + print(f"\nNote: each run's output is also logged under {output_dir}/*.std*.txt") + print("\nPlease include the following information with your benchmark post:") + print(get_versions()) def main(): @@ -193,11 +247,17 @@ def main(): help="Dimension base line arg. if None the minimal value will be used to compare against", ) parser.add_argument( - "--metric-key", + "--target-metric-key", default=None, type=str, required=True, - help="Metric key in output_dir/all_results.json, e.g., train_samples_per_second", + help="Target metric key in output_dir/all_results.json, e.g., train_samples_per_second", + ) + parser.add_argument( + "--report-metric-keys", + default="", + type=str, + help="Report metric keys - other metric keys from output_dir/all_results.json to report, e.g., train_loss. Use a single argument e.g., 'train_loss train_samples", ) parser.add_argument( "--repeat-times", @@ -205,6 +265,14 @@ def main(): type=int, help="How many times to re-run each combination - an average will be reported", ) + # table_format_choices + parser.add_argument( + "--table-format", + default="console", + type=str, + choices=["github", "console"], + help="Format the results table to render best in the destination use", + ) parser.add_argument( "--verbose", default=False, @@ -221,15 +289,31 @@ def main(): # build a cartesian product of dimensions and convert those back into cmd-line arg strings opts = list(map(" ".join, itertools.product(*dims))) + # split wanted keys + report_metric_keys = args.report_metric_keys.split() + print(f"\n*** Running {len(opts)} benchmarks:") print(f"Base command: {' '.join(base_cmd)}") - results = {} + opt_key = "variation" + results = [] for id, opt in enumerate(tqdm(opts, desc="Total completion: ", leave=False)): cmd = base_cmd + opt.split() - results[opt] = process_run(id + 1, cmd, opt, args.repeat_times, output_dir, args.metric_key, args.verbose) - - process_results(results, args.metric_key, args.base_dim) + results.append( + process_run( + id + 1, + cmd, + opt_key, + opt, + args.target_metric_key, + report_metric_keys, + args.repeat_times, + output_dir, + args.verbose, + ) + ) + + process_results(results, args.target_metric_key, report_metric_keys, args.base_dim, args.table_format, output_dir) if __name__ == "__main__": From ed801299441bcd4f3df0f6947bad258feb9afc12 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 27 Dec 2021 22:51:28 -0800 Subject: [PATCH 04/10] fix --- scripts/benchmark/trainer-benchmark.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py index 1cb0776b4880..7ac61c60478f 100755 --- a/scripts/benchmark/trainer-benchmark.py +++ b/scripts/benchmark/trainer-benchmark.py @@ -66,6 +66,7 @@ import argparse +import datetime import io import itertools import json @@ -77,8 +78,11 @@ from statistics import fmean import pandas as pd +import torch from tqdm import tqdm +import transformers + def get_base_cmd(args, output_dir): @@ -98,7 +102,7 @@ def get_base_cmd(args, output_dir): def process_run_single(id, cmd, opt, output_dir, target_metric_key, metric_keys, verbose): # enable to debug everything but the run itself, to do it fast and see the progress - if 1: + if 0: import random from random import randint from time import sleep @@ -134,6 +138,7 @@ def process_run_single(id, 
cmd, opt, output_dir, target_metric_key, metric_keys, def process_run(id, cmd, opt_key, opt, target_metric_key, report_metric_keys, repeat_times, output_dir, verbose): + results = [] metrics = [] preamble = f"{id}: {opt}" outcome = f"{preamble}: " @@ -143,6 +148,7 @@ def process_run(id, cmd, opt_key, opt, target_metric_key, report_metric_keys, re result = single_run_metrics[target_metric_key] if result != -1: metrics.append(single_run_metrics) + results.append(result) outcome += "✓" else: outcome += "✘" @@ -153,7 +159,7 @@ def process_run(id, cmd, opt_key, opt, target_metric_key, report_metric_keys, re mean_target = round(mean_metrics[target_metric_key], 2) results_str = f"{outcome} {mean_target}" if successful_runs > 1: - results_str += f" ({[round(x, 2) for x in mean_metrics[target_metric_key]]})" + results_str += f" {tuple(round(x, 2) for x in results)}" print(results_str) mean_metrics[opt_key] = opt return mean_metrics @@ -163,12 +169,6 @@ def process_run(id, cmd, opt_key, opt, target_metric_key, report_metric_keys, re def get_versions(): - import datetime - - import torch - - import transformers - properties = torch.cuda.get_device_properties(torch.device("cuda")) return f""" Datetime : {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} From 0325a54086730548e5b2e24dc48d6b92a3270b63 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 28 Dec 2021 21:02:06 -0800 Subject: [PATCH 05/10] mucho improved --- scripts/benchmark/trainer-benchmark.py | 236 ++++++++++++++++++------- 1 file changed, 170 insertions(+), 66 deletions(-) diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py index 7ac61c60478f..af527c59da88 100755 --- a/scripts/benchmark/trainer-benchmark.py +++ b/scripts/benchmark/trainer-benchmark.py @@ -5,51 +5,68 @@ # This tool can be used to run and compare multiple dimensions of the HF Trainers args # # The main idea is: -# ./trainer-benchmark.py --base-cmd '' \ -# --dims '--tf32 0 --tf32 1' '--fp16 0 --fp16 1 --bf16 1' \ -# --target-metric-key train_samples_per_second # -# --dims allows you to compare multiple dimensions. +# ./trainer-benchmark.py --base-cmd '' \ +# --variations '--tf32 0|--tf32 1' '--fp16 0|--fp16 1|--bf16 1' \ +# --target-metric-key train_samples_per_second # -# as the first dimension has 2 options and the second 3, this will run the trainer 6 times adding -# one of: +# --variations allows you to compare variations in multiple dimensions. # -# --tf32 0 --fp16 0 -# --tf32 0 --fp16 1 -# --tf32 0 --bf16 1 -# --tf32 1 --fp16 0 -# --tf32 1 --fp16 1 -# --tf32 1 --bf16 1 +# as the first dimention has 2 options and the second 3 in our example, this will run the trainer 6 +# times adding one of: +# +# 1. --tf32 0 --fp16 0 +# 2. --tf32 0 --fp16 1 +# 3. --tf32 0 --bf16 1 +# 4. --tf32 1 --fp16 0 +# 5. --tf32 1 --fp16 1 +# 6. --tf32 1 --bf16 1 # # and print the results. This is just a cartesian product - and more than 2 dimensions can be used. # +# In a simpler way the same can be accomplished as: +# +# --variations '|--tf32' '|--fp16|--bf16' +# +# the leading empty variation is valid variation. +# +# So here we get the following 6 variations: +# +# 1. +# 2. --fp16 +# 3. --bf16 +# 4. --tf32 +# 5. --tf32 --fp16 +# 6. 
--tf32 --bf16 +# +# # Here is a full example of a train: # -# CUDA_VISIBLE_DEVICES=0 ./scripts/benchmark/trainer-benchmark.py \ -# --base-cmd ' \ -# examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --output_dir output_dir \ -# --do_train --label_smoothing 0.1 --logging_strategy no --save_strategy no --per_device_train_batch_size 8 \ -# --max_source_length 512 --max_target_length 512 --num_train_epochs 1 --overwrite_output_dir \ -# --source_lang en --target_lang ro --dataset_name wmt16 --dataset_config "ro-en" \ -# --source_prefix "translate English to Romanian: " --warmup_steps 50 \ -# --max_train_samples 5000 --dataloader_num_workers 2 \ -# ' \ -# --dims '--tf32 0 --tf32 1' '--fp16 0 --fp16 1 --bf16 1' \ -# --base-dim '--tf32 0 --fp16 0' \ -# --target-metric-key train_samples_per_second --repeat-times 1 +# CUDA_VISIBLE_DEVICES=0 ./scripts/benchmark/trainer-benchmark.py \ +# --base-cmd ' \ +# examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --output_dir output_dir \ +# --do_train --label_smoothing 0.1 --logging_strategy no --save_strategy no --per_device_train_batch_size 8 \ +# --max_source_length 512 --max_target_length 512 --num_train_epochs 1 --overwrite_output_dir \ +# --source_lang en --target_lang ro --dataset_name wmt16 --dataset_config "ro-en" \ +# --source_prefix "translate English to Romanian: " --warmup_steps 50 \ +# --max_train_samples 5000 --dataloader_num_workers 2 \ +# ' \ +# --variations '--tf32 0|--tf32 1' '--fp16 0|--fp16 1|--bf16 1' \ +# --base-variation '--tf32 0 --fp16 0' \ +# --target-metric-key train_samples_per_second --repeat-times 1 # # and here is a possible output: +# # XXX: outdated! +# *** Results: train_samples_per_second # -# *** Results: train_samples_per_second -# -# | Variations | Result | % | -# | ----------------- | ------ | ----- | -# | --tf32 0 --fp16 0 | 31.95 | 100% | -# | --tf32 0 --fp16 1 | 47.88 | 149% | -# | --tf32 0 --bf16 1 | 35.04 | 109% | -# | --tf32 1 --fp16 0 | 35.47 | 111% | -# | --tf32 1 --fp16 1 | 47.82 | 149% | -# | --tf32 1 --bf16 1 | 35.11 | 109% | +# | Variations | Result | % | +# | ----------------- | ------ | ----- | +# | --tf32 0 --fp16 0 | 31.95 | 100% | +# | --tf32 0 --fp16 1 | 47.88 | 149% | +# | --tf32 0 --bf16 1 | 35.04 | 109% | +# | --tf32 1 --fp16 0 | 35.47 | 111% | +# | --tf32 1 --fp16 1 | 47.82 | 149% | +# | --tf32 1 --bf16 1 | 35.11 | 109% | # # So you can quickly compare the different outcomes. # @@ -58,7 +75,7 @@ # # By default it'll use the lowest result as the base line to use as 100% and then compare the rest to # it as can be seen from the table above, but you can also specify which combination is the one to use as -# the baseline, e.g., to change to another entry use: --base-dim '--tf32 1 --fp16 0' +# the baseline, e.g., to change to another entry use: --base-variation '--tf32 1 --fp16 0' # # --target-metric-key is there to tell the program which metrics to compare - the different metric keys are # inside output_dir/all_results.json. e.g., to measure eval performance instead of train use @@ -70,6 +87,8 @@ import io import itertools import json +import os +import platform import re import shlex import subprocess @@ -84,6 +103,61 @@ import transformers +class Tee: + """ + A helper class to tee print's output into a file. 
+ Usage: + sys.stdout = Tee(filename) + """ + + def __init__(self, filename): + self.stdout = sys.stdout + self.file = open(filename, "a") + + def __getattr__(self, attr): + return getattr(self.stdout, attr) + + def write(self, msg): + self.stdout.write(msg) + # strip tqdm codes + self.file.write(re.sub(r"^.*\r", "", msg, 0, re.M)) + + +def get_orig_cmd(max_width=80, full_python_path=False): + """ + Return the original command line string that can be replayed nicely and wrapped for 80 char width + Args: + - max_width: the width to wrap for. defaults to 80 + - full_python_path: whether to replicate the full path or just the last part (i.e. `python`). default to `False` + """ + + cmd = [] + + # deal with critical env vars + env_keys = ["CUDA_VISIBLE_DEVICES"] + for key in env_keys: + val = os.environ.get(key, None) + if val is not None: + cmd.append(f"{key}={val}") + + # python executable (not always needed if the script is executable) + python = sys.executable if full_python_path else sys.executable.split("/")[-1] + cmd.append(python) + + # now the normal args + cmd += list(map(shlex.quote, sys.argv)) + + # split up into up to MAX_WIDTH lines with shell multi-line escapes + lines = [] + current_line = "" + while len(cmd) > 0: + current_line += f"{cmd.pop(0)} " + if len(cmd) == 0 or len(current_line) + len(cmd[0]) + 1 > max_width - 1: + lines.append(current_line) + current_line = "" + return "\\\n".join(lines) + + def get_base_cmd(args, output_dir): # unwrap multi-line input @@ -100,7 +174,7 @@ def get_base_cmd(args, output_dir): return [sys.executable] + shlex.split(args.base_cmd) -def process_run_single(id, cmd, opt, output_dir, target_metric_key, metric_keys, verbose): +def process_run_single(id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose): # enable to debug everything but the run itself, to do it fast and see the progress if 0: import random @@ -119,7 +193,7 @@ def process_run_single(id, cmd, opt, output_dir, target_metric_key, metric_keys, print("STDERR", result.stderr) # save the streams - prefix = opt.replace(" ", "-") + prefix = variation.replace(" ", "-") with open(Path(output_dir) / f"{prefix}.stdout.txt", "w") as f: f.write(result.stdout) with open(Path(output_dir) / f"{prefix}.stderr.txt", "w") as f: @@ -137,14 +211,27 @@ def process_run_single(id, cmd, opt, output_dir, target_metric_key, metric_keys, return {k: v for k, v in metrics.items() if k in metric_keys} -def process_run(id, cmd, opt_key, opt, target_metric_key, report_metric_keys, repeat_times, output_dir, verbose): +def process_run( + id, + cmd, + variation_key, + variation, + longest_variation_len, + target_metric_key, + report_metric_keys, + repeat_times, + output_dir, + verbose, +): results = [] metrics = [] - preamble = f"{id}: {opt}" + preamble = f"{id}: {variation:<{longest_variation_len}}" outcome = f"{preamble}: " metric_keys = set(report_metric_keys + [target_metric_key]) for i in tqdm(range(repeat_times), desc=preamble, leave=False): - single_run_metrics = process_run_single(id, cmd, opt, output_dir, target_metric_key, metric_keys, verbose) + single_run_metrics = process_run_single( + id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose + ) result = single_run_metrics[target_metric_key] if result != -1: metrics.append(single_run_metrics) @@ -161,34 +248,39 @@ def process_run(id, cmd, opt_key, opt, target_metric_key, report_metric_keys, re if successful_runs > 1: results_str += f" {tuple(round(x, 2) for x in results)}" print(results_str) - mean_metrics[opt_key] = opt 
+ mean_metrics[variation_key] = variation return mean_metrics else: print(outcome) - return {opt_key: opt, target_metric_key: -1} + return {variation_key: variation, target_metric_key: -1} def get_versions(): properties = torch.cuda.get_device_properties(torch.device("cuda")) return f""" Datetime : {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} + +Software: transformers: {transformers.__version__} torch : {torch.__version__} cuda : {torch.version.cuda} +python : {platform.python_version()} + +Hardware: {torch.cuda.device_count()} GPUs : {properties.name}, {properties.total_memory/2**30:0.2f}GB """ -def process_results(results, target_metric_key, report_metric_keys, base_dim, table_format, output_dir): +def process_results(results, target_metric_key, report_metric_keys, base_variation, table_format, output_dir): df = pd.DataFrame(results) variation_key = "variation" diff_key = "diff_%" sentinel_value = -1 - if base_dim is not None and len(df[df.variation == base_dim]): + if base_variation is not None and len(df[df[variation_key] == base_variation]): # this may still return -1 - sentinel_value = df.loc[df.variation == base_dim][target_metric_key] + sentinel_value = df.loc[df[variation_key] == base_variation][target_metric_key].item() if sentinel_value == -1: # as a fallback, use the minimal value as the sentinel sentinel_value = df.loc[df[target_metric_key] != -1][target_metric_key].min() @@ -196,7 +288,7 @@ def process_results(results, target_metric_key, report_metric_keys, base_dim, ta # create diff column if sentinel_value != -1: df[diff_key] = df.apply( - lambda r: int(100 * r[target_metric_key] / sentinel_value) if r[target_metric_key] != -1 else "✘", + lambda r: round(100 * r[target_metric_key] / sentinel_value) if r[target_metric_key] != -1 else "✘", axis="columns", ) @@ -216,11 +308,9 @@ def process_results(results, target_metric_key, report_metric_keys, base_dim, ta linebreak = "
" if table_format == "github" else "\n" df = df.rename(lambda c: c.replace("_", linebreak), axis="columns") - print("\n*** Results:\n") - print(df.to_markdown(index=False)) - print(f"\nNote: each run's output is also logged under {output_dir}/*.std*.txt") - print("\nPlease include the following information with your benchmark post:") - print(get_versions()) + print("", "*** Results:", df.to_markdown(index=False), get_versions(), sep="\n\n") + + print("The benchmark command line was:", get_orig_cmd(), sep="\n\n") def main(): @@ -233,18 +323,18 @@ def main(): help="Base cmd", ) parser.add_argument( - "--dims", + "--variations", default=None, type=str, nargs="+", required=True, - help="Dimension args", + help="Multi-dimensional variations, example: '|--fp16|--bf16' '|--tf32'", ) parser.add_argument( - "--base-dim", + "--base-variation", default=None, type=str, - help="Dimension base line arg. if None the minimal value will be used to compare against", + help="Baseline variation to compare to. if None the minimal target value will be used to compare against", ) parser.add_argument( "--target-metric-key", @@ -263,7 +353,7 @@ def main(): "--repeat-times", default=1, type=int, - help="How many times to re-run each combination - an average will be reported", + help="How many times to re-run each variation - an average will be reported", ) # table_format_choices parser.add_argument( @@ -285,26 +375,38 @@ def main(): base_cmd = get_base_cmd(args, output_dir) # split each dimension into its --foo variations - dims = [list(map(str.strip, re.split(r"(?=--)", x)[1:])) for x in args.dims] - # build a cartesian product of dimensions and convert those back into cmd-line arg strings - opts = list(map(" ".join, itertools.product(*dims))) + dims = [list(map(str.strip, re.split(r"\|", x))) for x in args.variations] + # build a cartesian product of dimensions and convert those back into cmd-line arg strings, + # while stripping white space for inputs that were empty + variations = list(map(str.strip, map(" ".join, itertools.product(*dims)))) + longest_variation_len = max(len(x) for x in variations) # split wanted keys report_metric_keys = args.report_metric_keys.split() - print(f"\n*** Running {len(opts)} benchmarks:") + # capture prints into a log file for convenience + report_fn = f"benchmark-report-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.txt" + print(f"\nNote: each run's output is also logged under {output_dir}/*.std*.txt") + print(f"and this script's output is also piped into {report_fn}") + + sys.stdout = Tee(report_fn) + + print(f"\n*** Running {len(variations)} benchmarks:") print(f"Base command: {' '.join(base_cmd)}") - opt_key = "variation" + # keys = {} + + variation_key = "variation" results = [] - for id, opt in enumerate(tqdm(opts, desc="Total completion: ", leave=False)): - cmd = base_cmd + opt.split() + for id, variation in enumerate(tqdm(variations, desc="Total completion: ", leave=False)): + cmd = base_cmd + variation.split() results.append( process_run( id + 1, cmd, - opt_key, - opt, + variation_key, + variation, + longest_variation_len, args.target_metric_key, report_metric_keys, args.repeat_times, @@ -313,7 +415,9 @@ def main(): ) ) - process_results(results, args.target_metric_key, report_metric_keys, args.base_dim, args.table_format, output_dir) + process_results( + results, args.target_metric_key, report_metric_keys, args.base_variation, args.table_format, output_dir + ) if __name__ == "__main__": From 1c0f69fbfd4eb778d43302d713b8577b4c2373bc Mon Sep 17 00:00:00 2001 From: 
Stas Bekman Date: Wed, 29 Dec 2021 17:22:55 -0800 Subject: [PATCH 06/10] improved --- scripts/benchmark/trainer-benchmark.py | 152 +++++++++++++------------ 1 file changed, 80 insertions(+), 72 deletions(-) diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py index af527c59da88..62418516bbe4 100755 --- a/scripts/benchmark/trainer-benchmark.py +++ b/scripts/benchmark/trainer-benchmark.py @@ -4,6 +4,9 @@ # # This tool can be used to run and compare multiple dimensions of the HF Trainers args # +# It then prints a report once in github format with all the information that needs to be shared +# with others and second time in a console-friendly format, so it's easier to use for tuning things up. +# # The main idea is: # # ./trainer-benchmark.py --base-cmd '' \ @@ -24,49 +27,54 @@ # # and print the results. This is just a cartesian product - and more than 2 dimensions can be used. # -# In a simpler way the same can be accomplished as: -# -# --variations '|--tf32' '|--fp16|--bf16' +# If you want to rely on defaults, this: +# --variations '--tf32 0|--tf32 1' '--fp16 0|--fp16 1|--bf16 1' +# is identical to this: +# --variations '--tf32 0|--tf32 1' '|--fp16|--bf16' # -# the leading empty variation is valid variation. +# the leading empty variation in the 2nd dimension is a valid variation. # # So here we get the following 6 variations: # -# 1. -# 2. --fp16 -# 3. --bf16 -# 4. --tf32 -# 5. --tf32 --fp16 -# 6. --tf32 --bf16 +# 1. --tf32 0 +# 2. --tf32 0 --fp16 +# 3. --tf32 0 --bf16 +# 4. --tf32 1 +# 5. --tf32 1 --fp16 +# 6. --tf32 1 --bf16 # +# In this particular case we don't know what the default tf32 setting is as it's normally +# pytorch-version dependent). That's why it's best to do an explicit setting of each variation: +# `--tf32 0|--tf32 1` # # Here is a full example of a train: # -# CUDA_VISIBLE_DEVICES=0 ./scripts/benchmark/trainer-benchmark.py \ -# --base-cmd ' \ -# examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --output_dir output_dir \ -# --do_train --label_smoothing 0.1 --logging_strategy no --save_strategy no --per_device_train_batch_size 8 \ -# --max_source_length 512 --max_target_length 512 --num_train_epochs 1 --overwrite_output_dir \ -# --source_lang en --target_lang ro --dataset_name wmt16 --dataset_config "ro-en" \ -# --source_prefix "translate English to Romanian: " --warmup_steps 50 \ -# --max_train_samples 5000 --dataloader_num_workers 2 \ -# ' \ -# --variations '--tf32 0|--tf32 1' '--fp16 0|--fp16 1|--bf16 1' \ -# --base-variation '--tf32 0 --fp16 0' \ -# --target-metric-key train_samples_per_second --repeat-times 1 +# CUDA_VISIBLE_DEVICES=0 python ./scripts/benchmark/trainer-benchmark.py \ +# --base-cmd \ +# ' examples/pytorch/translation/run_translation.py --model_name_or_path t5-small \ +# --output_dir output_dir --do_train --label_smoothing 0.1 --logging_strategy no \ +# --save_strategy no --per_device_train_batch_size 32 --max_source_length 512 \ +# --max_target_length 512 --num_train_epochs 1 --overwrite_output_dir \ +# --source_lang en --target_lang ro --dataset_name wmt16 --dataset_config "ro-en" \ +# --source_prefix "translate English to Romanian: " --warmup_steps 50 \ +# --max_train_samples 20000 --dataloader_num_workers 2 ' \ +# --target-metric-key train_samples_per_second --repeat-times 1 --variations \ +# '|--fp16|--bf16' '--tf32 0|--tf32 1' --report-metric-keys train_loss \ +# --repeat-times 1 --base-variation '--tf32 0' # # and here is a possible output: -# # XXX: outdated! 
-# *** Results: train_samples_per_second # -# | Variations | Result | % | -# | ----------------- | ------ | ----- | -# | --tf32 0 --fp16 0 | 31.95 | 100% | -# | --tf32 0 --fp16 1 | 47.88 | 149% | -# | --tf32 0 --bf16 1 | 35.04 | 109% | -# | --tf32 1 --fp16 0 | 35.47 | 111% | -# | --tf32 1 --fp16 1 | 47.82 | 149% | -# | --tf32 1 --bf16 1 | 35.11 | 109% | +# | Variation | Train | Diff | Train | +# | | samples | % | loss | +# | | per | | | +# | | second | | | +# |:----------------|----------:|-------:|--------:| +# | --tf32 0 | 286.07 | 100 | 2.51 | +# | --tf32 1 | 342.82 | 120 | 2.51 | +# | --fp16 --tf32 0 | 422.07 | 148 | 2.51 | +# | --fp16 --tf32 1 | 423.18 | 148 | 2.51 | +# | --bf16 --tf32 0 | 415.93 | 145 | 2.52 | +# | --bf16 --tf32 1 | 418.51 | 146 | 2.52 | # # So you can quickly compare the different outcomes. # @@ -78,15 +86,18 @@ # the baseline, e.g., to change to another entry use: --base-variation '--tf32 1 --fp16 0' # # --target-metric-key is there to tell the program which metrics to compare - the different metric keys are -# inside output_dir/all_results.json. e.g., to measure eval performance instead of train use -# --target-metric-key eval_samples_per_second - +# inside output_dir/all_results.json. e.g., to measure eval performance instead of train use: +# --target-metric-key eval_samples_per_second +# but of course you will need to adjust the --base-cmd value in the example to perform evaluation as +# well (as currently it doesn't) +# import argparse import datetime import io import itertools import json +import math import os import platform import re @@ -103,6 +114,9 @@ import transformers +nan = float("nan") + + class Tee: """ A helper class to tee print's output into a file. @@ -178,12 +192,12 @@ def process_run_single(id, cmd, variation, output_dir, target_metric_key, metric # enable to debug everything but the run itself, to do it fast and see the progress if 0: import random - from random import randint from time import sleep sleep(0) return dict( - {k: randint(1, 30) for k in metric_keys}, **{target_metric_key: random.choice([-1, 10, 100, 55, 222])} + {k: random.uniform(0, 100) for k in metric_keys}, + **{target_metric_key: random.choice([nan, 10.31, 100.2, 55.6666, 222.22222222])}, ) result = subprocess.run(cmd, capture_output=True, text=True) @@ -202,7 +216,7 @@ def process_run_single(id, cmd, variation, output_dir, target_metric_key, metric if result.returncode != 0: if verbose: print("failed") - return {target_metric_key: -1} + return {target_metric_key: nan} with io.open(f"{output_dir}/all_results.json", "r", encoding="utf-8") as f: metrics = json.load(f) @@ -233,26 +247,25 @@ def process_run( id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose ) result = single_run_metrics[target_metric_key] - if result != -1: + if not math.isnan(result): metrics.append(single_run_metrics) results.append(result) outcome += "✓" else: outcome += "✘" outcome = f"\33[2K\r{outcome}" - successful_runs = len(metrics) - if successful_runs > 0: - mean_metrics = {k: fmean([metrics[i][k] for i in range(successful_runs)]) for k in metrics[0].keys()} + if len(metrics) > 0: + mean_metrics = {k: fmean([x[k] for x in metrics]) for k in metrics[0].keys()} mean_target = round(mean_metrics[target_metric_key], 2) results_str = f"{outcome} {mean_target}" - if successful_runs > 1: + if len(metrics) > 1: results_str += f" {tuple(round(x, 2) for x in results)}" print(results_str) mean_metrics[variation_key] = variation return mean_metrics else: print(outcome) - return 
{variation_key: variation, target_metric_key: -1} + return {variation_key: variation, target_metric_key: nan} def get_versions(): @@ -271,32 +284,29 @@ def get_versions(): """ -def process_results(results, target_metric_key, report_metric_keys, base_variation, table_format, output_dir): +def process_results(results, target_metric_key, report_metric_keys, base_variation, output_dir): df = pd.DataFrame(results) variation_key = "variation" diff_key = "diff_%" - sentinel_value = -1 + sentinel_value = nan if base_variation is not None and len(df[df[variation_key] == base_variation]): - # this may still return -1 + # this may still return nan sentinel_value = df.loc[df[variation_key] == base_variation][target_metric_key].item() - if sentinel_value == -1: + if math.isnan(sentinel_value): # as a fallback, use the minimal value as the sentinel - sentinel_value = df.loc[df[target_metric_key] != -1][target_metric_key].min() + sentinel_value = df.loc[df[target_metric_key] != nan][target_metric_key].min() - # create diff column - if sentinel_value != -1: + # create diff column if possible + if not math.isnan(sentinel_value): df[diff_key] = df.apply( - lambda r: round(100 * r[target_metric_key] / sentinel_value) if r[target_metric_key] != -1 else "✘", + lambda r: round(100 * r[target_metric_key] / sentinel_value) + if not math.isnan(r[target_metric_key]) + else 0, axis="columns", ) - # deal with failed runs - df[target_metric_key] = df.apply( - lambda r: r[target_metric_key] if r[target_metric_key] != -1 else "✘", axis="columns" - ) - # re-order columns cols = [variation_key, target_metric_key, diff_key, *report_metric_keys] df = df.reindex(cols, axis="columns") # reorder cols @@ -305,12 +315,20 @@ def process_results(results, target_metric_key, report_metric_keys, base_variati df = df.rename(str.capitalize, axis="columns") # make the cols as narrow as possible - linebreak = "
" if table_format == "github" else "\n" - df = df.rename(lambda c: c.replace("_", linebreak), axis="columns") + df_github = df.rename(lambda c: c.replace("_", "
"), axis="columns") + df_console = df.rename(lambda c: c.replace("_", "\n"), axis="columns") - print("", "*** Results:", df.to_markdown(index=False), get_versions(), sep="\n\n") + report = ["", "Copy between the cut-here-lines and paste as is to github or a forum"] + report += ["----------8<-----------------8<--------"] + report += ["*** Results:", df_github.to_markdown(index=False, floatfmt=".2f")] + report += ["```"] + report += ["*** Setup:", get_versions()] + report += ["*** The benchmark command line was:", get_orig_cmd()] + report += ["```"] + report += ["----------8<-----------------8<--------"] + report += ["*** Results (console):", df_console.to_markdown(index=False, floatfmt=".2f")] - print("The benchmark command line was:", get_orig_cmd(), sep="\n\n") + print("\n\n".join(report)) def main(): @@ -355,14 +373,6 @@ def main(): type=int, help="How many times to re-run each variation - an average will be reported", ) - # table_format_choices - parser.add_argument( - "--table-format", - default="console", - type=str, - choices=["github", "console"], - help="Format the results table to render best in the destination use", - ) parser.add_argument( "--verbose", default=False, @@ -415,9 +425,7 @@ def main(): ) ) - process_results( - results, args.target_metric_key, report_metric_keys, args.base_variation, args.table_format, output_dir - ) + process_results(results, args.target_metric_key, report_metric_keys, args.base_variation, output_dir) if __name__ == "__main__": From bde5de4c9a52476860acd7da5834f7c1dd131220 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 3 Jan 2022 14:20:47 -0800 Subject: [PATCH 07/10] fix prefix --- scripts/benchmark/trainer-benchmark.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py index 62418516bbe4..a54fd002364c 100755 --- a/scripts/benchmark/trainer-benchmark.py +++ b/scripts/benchmark/trainer-benchmark.py @@ -208,9 +208,9 @@ def process_run_single(id, cmd, variation, output_dir, target_metric_key, metric # save the streams prefix = variation.replace(" ", "-") - with open(Path(output_dir) / f"{prefix}.stdout.txt", "w") as f: + with open(Path(output_dir) / f"log.{prefix}.stdout.txt", "w") as f: f.write(result.stdout) - with open(Path(output_dir) / f"{prefix}.stderr.txt", "w") as f: + with open(Path(output_dir) / f"log.{prefix}.stderr.txt", "w") as f: f.write(result.stderr) if result.returncode != 0: @@ -396,7 +396,7 @@ def main(): # capture prints into a log file for convenience report_fn = f"benchmark-report-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.txt" - print(f"\nNote: each run's output is also logged under {output_dir}/*.std*.txt") + print(f"\nNote: each run's output is also logged under {output_dir}/log.*.std*.txt") print(f"and this script's output is also piped into {report_fn}") sys.stdout = Tee(report_fn) From 968d86f81a248352c43ce77c98d1cfc73b660ac9 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 3 Jan 2022 21:14:13 -0800 Subject: [PATCH 08/10] fix --- scripts/benchmark/trainer-benchmark.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py index a54fd002364c..4fa268794855 100755 --- a/scripts/benchmark/trainer-benchmark.py +++ b/scripts/benchmark/trainer-benchmark.py @@ -382,6 +382,7 @@ def main(): args = parser.parse_args() output_dir = "output_benchmark" + Path(output_dir).mkdir(exist_ok=True) base_cmd = get_base_cmd(args, output_dir) # split 
each dimension into its --foo variations From b9e2a12cc7d1acf6c46b488e46cec3107fd328e5 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 4 Jan 2022 12:54:09 -0800 Subject: [PATCH 09/10] fix diff calculation --- scripts/benchmark/trainer-benchmark.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py index 4fa268794855..3188290a9dd4 100755 --- a/scripts/benchmark/trainer-benchmark.py +++ b/scripts/benchmark/trainer-benchmark.py @@ -64,17 +64,19 @@ # # and here is a possible output: # -# | Variation | Train | Diff | Train | -# | | samples | % | loss | -# | | per | | | -# | | second | | | -# |:----------------|----------:|-------:|--------:| -# | --tf32 0 | 286.07 | 100 | 2.51 | -# | --tf32 1 | 342.82 | 120 | 2.51 | -# | --fp16 --tf32 0 | 422.07 | 148 | 2.51 | -# | --fp16 --tf32 1 | 423.18 | 148 | 2.51 | -# | --bf16 --tf32 0 | 415.93 | 145 | 2.52 | -# | --bf16 --tf32 1 | 418.51 | 146 | 2.52 | +# +# | Variation | Train | Diff | Train | +# | | samples | % | loss | +# | | per | | | +# | | second | | | +# |:----------------|----------:|-------:|--------:| +# | --tf32 0 | 285.11 | 0 | 2.51 | +# | --tf32 1 | 342.09 | 20 | 2.51 | +# | --fp16 --tf32 0 | 423.49 | 49 | 2.51 | +# | --fp16 --tf32 1 | 423.13 | 48 | 2.51 | +# | --bf16 --tf32 0 | 416.80 | 46 | 2.52 | +# | --bf16 --tf32 1 | 415.87 | 46 | 2.52 | +# # # So you can quickly compare the different outcomes. # @@ -301,7 +303,7 @@ def process_results(results, target_metric_key, report_metric_keys, base_variati # create diff column if possible if not math.isnan(sentinel_value): df[diff_key] = df.apply( - lambda r: round(100 * r[target_metric_key] / sentinel_value) + lambda r: round(100 * (r[target_metric_key] - sentinel_value) / sentinel_value) if not math.isnan(r[target_metric_key]) else 0, axis="columns", From 6a875ea8b2511a54a2105bcb78426012c29021ea Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 15 Feb 2022 17:20:44 -0800 Subject: [PATCH 10/10] address suggestions --- scripts/benchmark/trainer-benchmark.py | 37 +++++++++++++++++--------- 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py index 3188290a9dd4..903b4e0dd6d5 100755 --- a/scripts/benchmark/trainer-benchmark.py +++ b/scripts/benchmark/trainer-benchmark.py @@ -2,7 +2,7 @@ # HF Trainer benchmarking tool # -# This tool can be used to run and compare multiple dimensions of the HF Trainers args +# This tool can be used to run and compare multiple dimensions of the HF Trainers args. # # It then prints a report once in github format with all the information that needs to be shared # with others and second time in a console-friendly format, so it's easier to use for tuning things up. @@ -13,6 +13,9 @@ # --variations '--tf32 0|--tf32 1' '--fp16 0|--fp16 1|--bf16 1' \ # --target-metric-key train_samples_per_second # +# The variations can be any command line argument that you want to compare and not just dtype as in +# the example. +# # --variations allows you to compare variations in multiple dimensions. 
# # as the first dimention has 2 options and the second 3 in our example, this will run the trainer 6 @@ -139,12 +142,15 @@ def write(self, msg): self.file.write(re.sub(r"^.*\r", "", msg, 0, re.M)) -def get_orig_cmd(max_width=80, full_python_path=False): +def get_original_command(max_width=80, full_python_path=False): """ - Return the original command line string that can be replayed nicely and wrapped for 80 char width + Return the original command line string that can be replayed nicely and wrapped for 80 char width. + Args: - - max_width: the width to wrap for. defaults to 80 - - full_python_path: whether to replicate the full path or just the last part (i.e. `python`). default to `False` + max_width (`int`, `optional`, defaults to 80): + The width to wrap for. + full_python_path (`bool`, `optional`, defaults to `False`): + Whether to replicate the full path or just the last segment (i.e. `python`). """ cmd = [] @@ -174,7 +180,7 @@ def get_orig_cmd(max_width=80, full_python_path=False): return "\\\n".join(lines) -def get_base_cmd(args, output_dir): +def get_base_command(args, output_dir): # unwrap multi-line input args.base_cmd = re.sub(r"[\\\n]+", " ", args.base_cmd) @@ -191,7 +197,10 @@ def get_base_cmd(args, output_dir): def process_run_single(id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose): - # enable to debug everything but the run itself, to do it fast and see the progress + + # Enable to debug everything but the run itself, to do it fast and see the progress. + # This is useful for debugging the output formatting quickly - we can remove it later once + # everybody is happy with the output if 0: import random from time import sleep @@ -325,7 +334,7 @@ def process_results(results, target_metric_key, report_metric_keys, base_variati report += ["*** Results:", df_github.to_markdown(index=False, floatfmt=".2f")] report += ["```"] report += ["*** Setup:", get_versions()] - report += ["*** The benchmark command line was:", get_orig_cmd()] + report += ["*** The benchmark command line was:", get_original_command()] report += ["```"] report += ["----------8<-----------------8<--------"] report += ["*** Results (console):", df_console.to_markdown(index=False, floatfmt=".2f")] @@ -375,6 +384,12 @@ def main(): type=int, help="How many times to re-run each variation - an average will be reported", ) + parser.add_argument( + "--output_dir", + default="output_benchmark", + type=str, + help="The output directory where all the benchmark reports will go to and additionally this directory will be used to override --output_dir in the script that is being benchmarked", + ) parser.add_argument( "--verbose", default=False, @@ -383,9 +398,9 @@ def main(): ) args = parser.parse_args() - output_dir = "output_benchmark" + output_dir = args.output_dir Path(output_dir).mkdir(exist_ok=True) - base_cmd = get_base_cmd(args, output_dir) + base_cmd = get_base_command(args, output_dir) # split each dimension into its --foo variations dims = [list(map(str.strip, re.split(r"\|", x))) for x in args.variations] @@ -407,8 +422,6 @@ def main(): print(f"\n*** Running {len(variations)} benchmarks:") print(f"Base command: {' '.join(base_cmd)}") - # keys = {} - variation_key = "variation" results = [] for id, variation in enumerate(tqdm(variations, desc="Total completion: ", leave=False)):
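
Note (editor's addendum, not part of the patches above): a minimal standalone Python sketch of the variation-expansion step that the final version of the script performs in main() — each --variations dimension is split on "|" and the dimensions are combined with a cartesian product into the concrete command-line additions. The variable names here are illustrative, not taken verbatim from the patch.

import itertools
import re

# example inputs, mirroring the documented usage
variations_args = ["--tf32 0|--tf32 1", "|--fp16|--bf16"]

# split each dimension into its individual variations
dims = [[v.strip() for v in re.split(r"\|", x)] for x in variations_args]

# cartesian product of dimensions, re-joined into cmd-line argument strings,
# stripping whitespace left over from empty variations
variations = [" ".join(p).strip() for p in itertools.product(*dims)]

for v in variations:
    print(repr(v))
# prints 6 variations: '--tf32 0', '--tf32 0 --fp16', '--tf32 0 --bf16',
#                      '--tf32 1', '--tf32 1 --fp16', '--tf32 1 --bf16'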
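
Likewise, a hedged sketch of the Diff % column after the "fix diff calculation" commit (PATCH 09): the column reports the change relative to the baseline (sentinel) value, so the baseline row shows 0 rather than 100, and failed runs (NaN) are reported as 0. The helper name below is hypothetical; only the formula is taken from the patch.

import math

def diff_percent(value, baseline):
    # relative difference in percent, rounded; 0 for failed runs (NaN)
    if math.isnan(value):
        return 0
    return round(100 * (value - baseline) / baseline)

baseline = 285.11  # e.g., the '--tf32 0' row from the sample table in PATCH 09
print(diff_percent(285.11, baseline))  # 0
print(diff_percent(342.09, baseline))  # 20
print(diff_percent(423.49, baseline))  # 49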