From d1bd5ab8da3c28153f0366eb93cae70224acced4 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sun, 26 Dec 2021 17:33:25 -0800 Subject: [PATCH 01/10] [benchmark tool] trainer-benchmark.py --- scripts/benchmark/trainer-benchmark.py | 236 +++++++++++++++++++++++++ 1 file changed, 236 insertions(+) create mode 100755 scripts/benchmark/trainer-benchmark.py diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py new file mode 100755 index 000000000000..06680b4a5df1 --- /dev/null +++ b/scripts/benchmark/trainer-benchmark.py @@ -0,0 +1,236 @@ +#!/usr/bin/env python + +# HF Trainer benchmarking tool +# +# This tool can be used to run and compare multiple dimensions of the HF Trainers args +# +# The main idea is: +# ./trainer-benchmark.py --base-cmd '' \ +# --dims '--tf32 0 --tf32 1' '--fp16 0 --fp16 1 --bf16 1' \ +# --metric-key train_samples_per_second +# +# --dims allows you to compare multiple dimensions. +# +# as the first dimension has 2 options and the second 3, this will run the trainer 6 times adding +# one of: +# +# --tf32 0 --fp16 0 +# --tf32 0 --fp16 1 +# --tf32 0 --bf16 1 +# --tf32 1 --fp16 0 +# --tf32 1 --fp16 1 +# --tf32 1 --bf16 1 +# +# and print the results. This is just a cartesian product - and more than 2 dimensions can be used. +# +# Here is a full example of a train: +# +# CUDA_VISIBLE_DEVICES=0 ./scripts/benchmark/trainer-benchmark.py \ +# --base-cmd ' \ +# examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --output_dir output_dir \ +# --do_train --label_smoothing 0.1 --logging_strategy no --save_strategy no --per_device_train_batch_size 8 \ +# --max_source_length 512 --max_target_length 512 --num_train_epochs 1 --overwrite_output_dir \ +# --source_lang en --target_lang ro --dataset_name wmt16 --dataset_config "ro-en" \ +# --source_prefix "translate English to Romanian: " --warmup_steps 50 \ +# --max_train_samples 5000 --dataloader_num_workers 2 \ +# ' \ +# --dims '--tf32 0 --tf32 1' '--fp16 0 --fp16 1 --bf16 1' \ +# --base-dim '--tf32 0 --fp16 0' \ +# --metric-key train_samples_per_second --repeat-times 1 +# +# and here a possible output: +# +# *** Results: train_samples_per_second +# +# | Variations | Result | % | +# | ----------------- | ------ | ----- | +# | --tf32 0 --fp16 0 | 31.95 | 100% | +# | --tf32 0 --fp16 1 | 47.88 | 149% | +# | --tf32 0 --bf16 1 | 35.04 | 109% | +# | --tf32 1 --fp16 0 | 35.47 | 111% | +# | --tf32 1 --fp16 1 | 47.82 | 149% | +# | --tf32 1 --bf16 1 | 35.11 | 109% | +# +# So you can quickly compare the different outcomes. +# +# Typically running each experiment once is enough, but if the environment is unstable you can +# re-run each multiple times, e.g., 3 using --repeat-times 3 and it will report the average results. +# +# by default it'll use the worst result as the base line to use as 100% and then compare the rest to +# it as can be seen from the table, but you can also specify which combination is the one to use as +# the baseline, e.g., to change to another entry use: --base-dim '--tf32 1 --fp16 0' +# +# --metric-key is there to tell the program which metrics to compare - the different metric keys are +# inside output_dir/all_results.json. 
e.g., to measure eval performance instead of train use +# --metric-key eval_samples_per_second + + +import argparse +import io +import itertools +import json +import re +import shlex +import subprocess +import sys +from statistics import fmean + +from tqdm import tqdm + + +def get_base_cmd(args, output_dir): + + # unwrap multi-line input + args.base_cmd = re.sub(r"\\", " ", args.base_cmd) + args.base_cmd = re.sub(r"\n", " ", args.base_cmd) + + # remove --output_dir if any and set our own + args.base_cmd = re.sub("--output_dir\s+[^\s]+", "", args.base_cmd) + args.base_cmd += f"--output_dir {output_dir} " + + # ensure we have --overwrite_output_dir + args.base_cmd = re.sub("--overwrite_output_dir\s+", "", args.base_cmd) + args.base_cmd += "--overwrite_output_dir " + + return [sys.executable] + shlex.split(args.base_cmd) + + +def process_run(id, cmd, opt, repeat_times, output_dir, metric_key, verbose): + results = [] + preamble = f"{id}: {opt}" + outcome = f"{preamble}: " + for i in tqdm(range(repeat_times), desc=preamble, leave=False): + result = process_run_single(id, cmd, opt, output_dir, metric_key, verbose) + if result != -1: + results.append(result) + outcome += "✓" + else: + outcome += "✘" + outcome = f"\33[2K\r{outcome}" + if len(results): + mean_result = round(fmean(results), 2) + results_str = f"{outcome} {mean_result}" + if len(results) > 1: + results_str += f" ({[round(x, 2) for x in results]})" + print(results_str) + return mean_result + else: + print(outcome) + return -1 + + +def process_run_single(id, cmd, opt, output_dir, metric_key, verbose): + # enable to debug everything but the run itself, to do it fast and see the progress + # from random import randint + # from time import sleep + # sleep(3) + # return randint(100, 300) + + result = subprocess.run(cmd, capture_output=True, text=True) + + if verbose: + print("STDOUT", result.stdout) + print("STDERR", result.stderr) + + if result.returncode != 0: + if verbose: + print("failed") + return -1 + + filename = f"{output_dir}/all_results.json" + with io.open(filename, "r", encoding="utf-8") as f: + metrics = json.load(f) + return metrics[metric_key] + + +def process_results(results, metric_key, base_dim): + + print(f"\n*** Results: {metric_key}\n") + + col_opt, col_result, col_relative = "Variations", "Result", "%" + width_opt = max(len(k) for k in list(results.keys()) + [col_opt]) + width_metric = max(len(str(v)) for v in list(results.values()) + [col_result]) + width_percent = 5 + + if base_dim is not None and base_dim in results: + sentinel_value = results[base_dim] + else: + # if no match, use the minimal value as the sentinel + sentinel_value = min(v for v in results.values() if v != -1) + + print(f"| {col_opt:^{width_opt}} | {col_result:^{width_metric}} | {col_relative:^{width_percent}} |") + print(f"| {'-'*width_opt:{width_opt}} | {'-'*width_metric:{width_metric}} | {'-'*width_percent:{width_percent}} |") + for key, value in results.items(): + if value != -1: + percent = f"{int(100*value/sentinel_value)}%" + value = f"{value:.02f}" + else: + percent = "✘" + value = "✘" + print(f"| {key:{width_opt}} | {value:>{width_metric}} | {percent:>{width_percent}} |") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--base-cmd", + default=None, + type=str, + required=True, + help="Base cmd", + ) + parser.add_argument( + "--dims", + default=None, + type=str, + nargs="+", + required=True, + help="Dimension args", + ) + parser.add_argument( + "--base-dim", + default=None, + type=str, + help="Dimension base 
line arg. if None the minimal value will be used to compare against", + ) + parser.add_argument( + "--metric-key", + default=None, + type=str, + required=True, + help="Metric key in output_dir/all_results.json, e.g., train_samples_per_second", + ) + parser.add_argument( + "--repeat-times", + default=1, + type=int, + help="How many times to re-run each combination - an average will be reported", + ) + parser.add_argument( + "--verbose", + default=False, + action="store_true", + help="Whether to show the outputs of each run or just the benchmark progress", + ) + args = parser.parse_args() + + output_dir = "output_benchmark" + base_cmd = get_base_cmd(args, output_dir) + + results = {} + dims = [list(map(str.strip, re.split(r"(?=--)", x)[1:])) for x in args.dims] + # cartesian product of dimensions and then converted back into cmd-line arg strings + opts = list(map(" ".join, itertools.product(*dims))) + + print(f"\n*** Running {len(opts)} benchmarks:") + print(f"Base command: {' '.join(base_cmd)}") + + for id, opt in enumerate(tqdm(opts, desc="Total completion: ", leave=False)): + cmd = base_cmd + opt.split() + results[opt] = process_run(id + 1, cmd, opt, args.repeat_times, output_dir, args.metric_key, args.verbose) + + process_results(results, args.metric_key, args.base_dim) + + +if __name__ == "__main__": + main() From d1c9ad2cf169ea08ffdeb98d7314bf69febaddb7 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 27 Dec 2021 11:36:55 -0800 Subject: [PATCH 02/10] improve --- scripts/benchmark/trainer-benchmark.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py index 06680b4a5df1..c325c3f5789a 100755 --- a/scripts/benchmark/trainer-benchmark.py +++ b/scripts/benchmark/trainer-benchmark.py @@ -38,7 +38,7 @@ # --base-dim '--tf32 0 --fp16 0' \ # --metric-key train_samples_per_second --repeat-times 1 # -# and here a possible output: +# and here is a possible output: # # *** Results: train_samples_per_second # @@ -54,10 +54,10 @@ # So you can quickly compare the different outcomes. # # Typically running each experiment once is enough, but if the environment is unstable you can -# re-run each multiple times, e.g., 3 using --repeat-times 3 and it will report the average results. +# re-run each multiple times, e.g., 3 using --repeat-times 3 and it will report the averaged results. 
# -# by default it'll use the worst result as the base line to use as 100% and then compare the rest to -# it as can be seen from the table, but you can also specify which combination is the one to use as +# By default it'll use the worst result as the base line to use as 100% and then compare the rest to +# it as can be seen from the table above, but you can also specify which combination is the one to use as # the baseline, e.g., to change to another entry use: --base-dim '--tf32 1 --fp16 0' # # --metric-key is there to tell the program which metrics to compare - the different metric keys are @@ -81,16 +81,15 @@ def get_base_cmd(args, output_dir): # unwrap multi-line input - args.base_cmd = re.sub(r"\\", " ", args.base_cmd) - args.base_cmd = re.sub(r"\n", " ", args.base_cmd) + args.base_cmd = re.sub(r"[\\\n]+", " ", args.base_cmd) # remove --output_dir if any and set our own args.base_cmd = re.sub("--output_dir\s+[^\s]+", "", args.base_cmd) - args.base_cmd += f"--output_dir {output_dir} " + args.base_cmd += f" --output_dir {output_dir}" # ensure we have --overwrite_output_dir args.base_cmd = re.sub("--overwrite_output_dir\s+", "", args.base_cmd) - args.base_cmd += "--overwrite_output_dir " + args.base_cmd += " --overwrite_output_dir" return [sys.executable] + shlex.split(args.base_cmd) @@ -217,14 +216,15 @@ def main(): output_dir = "output_benchmark" base_cmd = get_base_cmd(args, output_dir) - results = {} + # split each dimension into its --foo variations dims = [list(map(str.strip, re.split(r"(?=--)", x)[1:])) for x in args.dims] - # cartesian product of dimensions and then converted back into cmd-line arg strings + # build a cartesian product of dimensions and convert those back into cmd-line arg strings opts = list(map(" ".join, itertools.product(*dims))) print(f"\n*** Running {len(opts)} benchmarks:") print(f"Base command: {' '.join(base_cmd)}") + results = {} for id, opt in enumerate(tqdm(opts, desc="Total completion: ", leave=False)): cmd = base_cmd + opt.split() results[opt] = process_run(id + 1, cmd, opt, args.repeat_times, output_dir, args.metric_key, args.verbose) From 51929f047371cb0e7e75c1dca9106ee1e737414d Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 27 Dec 2021 22:39:05 -0800 Subject: [PATCH 03/10] massive rework/expansion --- scripts/benchmark/trainer-benchmark.py | 208 +++++++++++++++++-------- 1 file changed, 146 insertions(+), 62 deletions(-) diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py index c325c3f5789a..1cb0776b4880 100755 --- a/scripts/benchmark/trainer-benchmark.py +++ b/scripts/benchmark/trainer-benchmark.py @@ -7,7 +7,7 @@ # The main idea is: # ./trainer-benchmark.py --base-cmd '' \ # --dims '--tf32 0 --tf32 1' '--fp16 0 --fp16 1 --bf16 1' \ -# --metric-key train_samples_per_second +# --target-metric-key train_samples_per_second # # --dims allows you to compare multiple dimensions. # @@ -36,7 +36,7 @@ # ' \ # --dims '--tf32 0 --tf32 1' '--fp16 0 --fp16 1 --bf16 1' \ # --base-dim '--tf32 0 --fp16 0' \ -# --metric-key train_samples_per_second --repeat-times 1 +# --target-metric-key train_samples_per_second --repeat-times 1 # # and here is a possible output: # @@ -56,13 +56,13 @@ # Typically running each experiment once is enough, but if the environment is unstable you can # re-run each multiple times, e.g., 3 using --repeat-times 3 and it will report the averaged results. 
# -# By default it'll use the worst result as the base line to use as 100% and then compare the rest to +# By default it'll use the lowest result as the base line to use as 100% and then compare the rest to # it as can be seen from the table above, but you can also specify which combination is the one to use as # the baseline, e.g., to change to another entry use: --base-dim '--tf32 1 --fp16 0' # -# --metric-key is there to tell the program which metrics to compare - the different metric keys are +# --target-metric-key is there to tell the program which metrics to compare - the different metric keys are # inside output_dir/all_results.json. e.g., to measure eval performance instead of train use -# --metric-key eval_samples_per_second +# --target-metric-key eval_samples_per_second import argparse @@ -73,8 +73,10 @@ import shlex import subprocess import sys +from pathlib import Path from statistics import fmean +import pandas as pd from tqdm import tqdm @@ -94,79 +96,131 @@ def get_base_cmd(args, output_dir): return [sys.executable] + shlex.split(args.base_cmd) -def process_run(id, cmd, opt, repeat_times, output_dir, metric_key, verbose): - results = [] +def process_run_single(id, cmd, opt, output_dir, target_metric_key, metric_keys, verbose): + # enable to debug everything but the run itself, to do it fast and see the progress + if 1: + import random + from random import randint + from time import sleep + + sleep(0) + return dict( + {k: randint(1, 30) for k in metric_keys}, **{target_metric_key: random.choice([-1, 10, 100, 55, 222])} + ) + + result = subprocess.run(cmd, capture_output=True, text=True) + + if verbose: + print("STDOUT", result.stdout) + print("STDERR", result.stderr) + + # save the streams + prefix = opt.replace(" ", "-") + with open(Path(output_dir) / f"{prefix}.stdout.txt", "w") as f: + f.write(result.stdout) + with open(Path(output_dir) / f"{prefix}.stderr.txt", "w") as f: + f.write(result.stderr) + + if result.returncode != 0: + if verbose: + print("failed") + return {target_metric_key: -1} + + with io.open(f"{output_dir}/all_results.json", "r", encoding="utf-8") as f: + metrics = json.load(f) + + # filter out just the keys we want + return {k: v for k, v in metrics.items() if k in metric_keys} + + +def process_run(id, cmd, opt_key, opt, target_metric_key, report_metric_keys, repeat_times, output_dir, verbose): + metrics = [] preamble = f"{id}: {opt}" outcome = f"{preamble}: " + metric_keys = set(report_metric_keys + [target_metric_key]) for i in tqdm(range(repeat_times), desc=preamble, leave=False): - result = process_run_single(id, cmd, opt, output_dir, metric_key, verbose) + single_run_metrics = process_run_single(id, cmd, opt, output_dir, target_metric_key, metric_keys, verbose) + result = single_run_metrics[target_metric_key] if result != -1: - results.append(result) + metrics.append(single_run_metrics) outcome += "✓" else: outcome += "✘" outcome = f"\33[2K\r{outcome}" - if len(results): - mean_result = round(fmean(results), 2) - results_str = f"{outcome} {mean_result}" - if len(results) > 1: - results_str += f" ({[round(x, 2) for x in results]})" + successful_runs = len(metrics) + if successful_runs > 0: + mean_metrics = {k: fmean([metrics[i][k] for i in range(successful_runs)]) for k in metrics[0].keys()} + mean_target = round(mean_metrics[target_metric_key], 2) + results_str = f"{outcome} {mean_target}" + if successful_runs > 1: + results_str += f" ({[round(x, 2) for x in mean_metrics[target_metric_key]]})" print(results_str) - return mean_result + 
mean_metrics[opt_key] = opt + return mean_metrics else: print(outcome) - return -1 + return {opt_key: opt, target_metric_key: -1} -def process_run_single(id, cmd, opt, output_dir, metric_key, verbose): - # enable to debug everything but the run itself, to do it fast and see the progress - # from random import randint - # from time import sleep - # sleep(3) - # return randint(100, 300) +def get_versions(): + import datetime - result = subprocess.run(cmd, capture_output=True, text=True) + import torch - if verbose: - print("STDOUT", result.stdout) - print("STDERR", result.stderr) + import transformers - if result.returncode != 0: - if verbose: - print("failed") - return -1 + properties = torch.cuda.get_device_properties(torch.device("cuda")) + return f""" +Datetime : {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} +transformers: {transformers.__version__} +torch : {torch.__version__} +cuda : {torch.version.cuda} +{torch.cuda.device_count()} GPUs : {properties.name}, {properties.total_memory/2**30:0.2f}GB +""" - filename = f"{output_dir}/all_results.json" - with io.open(filename, "r", encoding="utf-8") as f: - metrics = json.load(f) - return metrics[metric_key] +def process_results(results, target_metric_key, report_metric_keys, base_dim, table_format, output_dir): -def process_results(results, metric_key, base_dim): + df = pd.DataFrame(results) + variation_key = "variation" + diff_key = "diff_%" - print(f"\n*** Results: {metric_key}\n") + sentinel_value = -1 + if base_dim is not None and len(df[df.variation == base_dim]): + # this may still return -1 + sentinel_value = df.loc[df.variation == base_dim][target_metric_key] + if sentinel_value == -1: + # as a fallback, use the minimal value as the sentinel + sentinel_value = df.loc[df[target_metric_key] != -1][target_metric_key].min() - col_opt, col_result, col_relative = "Variations", "Result", "%" - width_opt = max(len(k) for k in list(results.keys()) + [col_opt]) - width_metric = max(len(str(v)) for v in list(results.values()) + [col_result]) - width_percent = 5 + # create diff column + if sentinel_value != -1: + df[diff_key] = df.apply( + lambda r: int(100 * r[target_metric_key] / sentinel_value) if r[target_metric_key] != -1 else "✘", + axis="columns", + ) - if base_dim is not None and base_dim in results: - sentinel_value = results[base_dim] - else: - # if no match, use the minimal value as the sentinel - sentinel_value = min(v for v in results.values() if v != -1) - - print(f"| {col_opt:^{width_opt}} | {col_result:^{width_metric}} | {col_relative:^{width_percent}} |") - print(f"| {'-'*width_opt:{width_opt}} | {'-'*width_metric:{width_metric}} | {'-'*width_percent:{width_percent}} |") - for key, value in results.items(): - if value != -1: - percent = f"{int(100*value/sentinel_value)}%" - value = f"{value:.02f}" - else: - percent = "✘" - value = "✘" - print(f"| {key:{width_opt}} | {value:>{width_metric}} | {percent:>{width_percent}} |") + # deal with failed runs + df[target_metric_key] = df.apply( + lambda r: r[target_metric_key] if r[target_metric_key] != -1 else "✘", axis="columns" + ) + + # re-order columns + cols = [variation_key, target_metric_key, diff_key, *report_metric_keys] + df = df.reindex(cols, axis="columns") # reorder cols + + # capitalize + df = df.rename(str.capitalize, axis="columns") + + # make the cols as narrow as possible + linebreak = "
" if table_format == "github" else "\n" + df = df.rename(lambda c: c.replace("_", linebreak), axis="columns") + + print("\n*** Results:\n") + print(df.to_markdown(index=False)) + print(f"\nNote: each run's output is also logged under {output_dir}/*.std*.txt") + print("\nPlease include the following information with your benchmark post:") + print(get_versions()) def main(): @@ -193,11 +247,17 @@ def main(): help="Dimension base line arg. if None the minimal value will be used to compare against", ) parser.add_argument( - "--metric-key", + "--target-metric-key", default=None, type=str, required=True, - help="Metric key in output_dir/all_results.json, e.g., train_samples_per_second", + help="Target metric key in output_dir/all_results.json, e.g., train_samples_per_second", + ) + parser.add_argument( + "--report-metric-keys", + default="", + type=str, + help="Report metric keys - other metric keys from output_dir/all_results.json to report, e.g., train_loss. Use a single argument e.g., 'train_loss train_samples", ) parser.add_argument( "--repeat-times", @@ -205,6 +265,14 @@ def main(): type=int, help="How many times to re-run each combination - an average will be reported", ) + # table_format_choices + parser.add_argument( + "--table-format", + default="console", + type=str, + choices=["github", "console"], + help="Format the results table to render best in the destination use", + ) parser.add_argument( "--verbose", default=False, @@ -221,15 +289,31 @@ def main(): # build a cartesian product of dimensions and convert those back into cmd-line arg strings opts = list(map(" ".join, itertools.product(*dims))) + # split wanted keys + report_metric_keys = args.report_metric_keys.split() + print(f"\n*** Running {len(opts)} benchmarks:") print(f"Base command: {' '.join(base_cmd)}") - results = {} + opt_key = "variation" + results = [] for id, opt in enumerate(tqdm(opts, desc="Total completion: ", leave=False)): cmd = base_cmd + opt.split() - results[opt] = process_run(id + 1, cmd, opt, args.repeat_times, output_dir, args.metric_key, args.verbose) - - process_results(results, args.metric_key, args.base_dim) + results.append( + process_run( + id + 1, + cmd, + opt_key, + opt, + args.target_metric_key, + report_metric_keys, + args.repeat_times, + output_dir, + args.verbose, + ) + ) + + process_results(results, args.target_metric_key, report_metric_keys, args.base_dim, args.table_format, output_dir) if __name__ == "__main__": From ed801299441bcd4f3df0f6947bad258feb9afc12 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 27 Dec 2021 22:51:28 -0800 Subject: [PATCH 04/10] fix --- scripts/benchmark/trainer-benchmark.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py index 1cb0776b4880..7ac61c60478f 100755 --- a/scripts/benchmark/trainer-benchmark.py +++ b/scripts/benchmark/trainer-benchmark.py @@ -66,6 +66,7 @@ import argparse +import datetime import io import itertools import json @@ -77,8 +78,11 @@ from statistics import fmean import pandas as pd +import torch from tqdm import tqdm +import transformers + def get_base_cmd(args, output_dir): @@ -98,7 +102,7 @@ def get_base_cmd(args, output_dir): def process_run_single(id, cmd, opt, output_dir, target_metric_key, metric_keys, verbose): # enable to debug everything but the run itself, to do it fast and see the progress - if 1: + if 0: import random from random import randint from time import sleep @@ -134,6 +138,7 @@ def process_run_single(id, 
cmd, opt, output_dir, target_metric_key, metric_keys, def process_run(id, cmd, opt_key, opt, target_metric_key, report_metric_keys, repeat_times, output_dir, verbose): + results = [] metrics = [] preamble = f"{id}: {opt}" outcome = f"{preamble}: " @@ -143,6 +148,7 @@ def process_run(id, cmd, opt_key, opt, target_metric_key, report_metric_keys, re result = single_run_metrics[target_metric_key] if result != -1: metrics.append(single_run_metrics) + results.append(result) outcome += "✓" else: outcome += "✘" @@ -153,7 +159,7 @@ def process_run(id, cmd, opt_key, opt, target_metric_key, report_metric_keys, re mean_target = round(mean_metrics[target_metric_key], 2) results_str = f"{outcome} {mean_target}" if successful_runs > 1: - results_str += f" ({[round(x, 2) for x in mean_metrics[target_metric_key]]})" + results_str += f" {tuple(round(x, 2) for x in results)}" print(results_str) mean_metrics[opt_key] = opt return mean_metrics @@ -163,12 +169,6 @@ def process_run(id, cmd, opt_key, opt, target_metric_key, report_metric_keys, re def get_versions(): - import datetime - - import torch - - import transformers - properties = torch.cuda.get_device_properties(torch.device("cuda")) return f""" Datetime : {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} From 0325a54086730548e5b2e24dc48d6b92a3270b63 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 28 Dec 2021 21:02:06 -0800 Subject: [PATCH 05/10] mucho improved --- scripts/benchmark/trainer-benchmark.py | 236 ++++++++++++++++++------- 1 file changed, 170 insertions(+), 66 deletions(-) diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py index 7ac61c60478f..af527c59da88 100755 --- a/scripts/benchmark/trainer-benchmark.py +++ b/scripts/benchmark/trainer-benchmark.py @@ -5,51 +5,68 @@ # This tool can be used to run and compare multiple dimensions of the HF Trainers args # # The main idea is: -# ./trainer-benchmark.py --base-cmd '' \ -# --dims '--tf32 0 --tf32 1' '--fp16 0 --fp16 1 --bf16 1' \ -# --target-metric-key train_samples_per_second # -# --dims allows you to compare multiple dimensions. +# ./trainer-benchmark.py --base-cmd '' \ +# --variations '--tf32 0|--tf32 1' '--fp16 0|--fp16 1|--bf16 1' \ +# --target-metric-key train_samples_per_second # -# as the first dimension has 2 options and the second 3, this will run the trainer 6 times adding -# one of: +# --variations allows you to compare variations in multiple dimensions. # -# --tf32 0 --fp16 0 -# --tf32 0 --fp16 1 -# --tf32 0 --bf16 1 -# --tf32 1 --fp16 0 -# --tf32 1 --fp16 1 -# --tf32 1 --bf16 1 +# as the first dimention has 2 options and the second 3 in our example, this will run the trainer 6 +# times adding one of: +# +# 1. --tf32 0 --fp16 0 +# 2. --tf32 0 --fp16 1 +# 3. --tf32 0 --bf16 1 +# 4. --tf32 1 --fp16 0 +# 5. --tf32 1 --fp16 1 +# 6. --tf32 1 --bf16 1 # # and print the results. This is just a cartesian product - and more than 2 dimensions can be used. # +# In a simpler way the same can be accomplished as: +# +# --variations '|--tf32' '|--fp16|--bf16' +# +# the leading empty variation is valid variation. +# +# So here we get the following 6 variations: +# +# 1. +# 2. --fp16 +# 3. --bf16 +# 4. --tf32 +# 5. --tf32 --fp16 +# 6. 
--tf32 --bf16 +# +# # Here is a full example of a train: # -# CUDA_VISIBLE_DEVICES=0 ./scripts/benchmark/trainer-benchmark.py \ -# --base-cmd ' \ -# examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --output_dir output_dir \ -# --do_train --label_smoothing 0.1 --logging_strategy no --save_strategy no --per_device_train_batch_size 8 \ -# --max_source_length 512 --max_target_length 512 --num_train_epochs 1 --overwrite_output_dir \ -# --source_lang en --target_lang ro --dataset_name wmt16 --dataset_config "ro-en" \ -# --source_prefix "translate English to Romanian: " --warmup_steps 50 \ -# --max_train_samples 5000 --dataloader_num_workers 2 \ -# ' \ -# --dims '--tf32 0 --tf32 1' '--fp16 0 --fp16 1 --bf16 1' \ -# --base-dim '--tf32 0 --fp16 0' \ -# --target-metric-key train_samples_per_second --repeat-times 1 +# CUDA_VISIBLE_DEVICES=0 ./scripts/benchmark/trainer-benchmark.py \ +# --base-cmd ' \ +# examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --output_dir output_dir \ +# --do_train --label_smoothing 0.1 --logging_strategy no --save_strategy no --per_device_train_batch_size 8 \ +# --max_source_length 512 --max_target_length 512 --num_train_epochs 1 --overwrite_output_dir \ +# --source_lang en --target_lang ro --dataset_name wmt16 --dataset_config "ro-en" \ +# --source_prefix "translate English to Romanian: " --warmup_steps 50 \ +# --max_train_samples 5000 --dataloader_num_workers 2 \ +# ' \ +# --variations '--tf32 0|--tf32 1' '--fp16 0|--fp16 1|--bf16 1' \ +# --base-variation '--tf32 0 --fp16 0' \ +# --target-metric-key train_samples_per_second --repeat-times 1 # # and here is a possible output: +# # XXX: outdated! +# *** Results: train_samples_per_second # -# *** Results: train_samples_per_second -# -# | Variations | Result | % | -# | ----------------- | ------ | ----- | -# | --tf32 0 --fp16 0 | 31.95 | 100% | -# | --tf32 0 --fp16 1 | 47.88 | 149% | -# | --tf32 0 --bf16 1 | 35.04 | 109% | -# | --tf32 1 --fp16 0 | 35.47 | 111% | -# | --tf32 1 --fp16 1 | 47.82 | 149% | -# | --tf32 1 --bf16 1 | 35.11 | 109% | +# | Variations | Result | % | +# | ----------------- | ------ | ----- | +# | --tf32 0 --fp16 0 | 31.95 | 100% | +# | --tf32 0 --fp16 1 | 47.88 | 149% | +# | --tf32 0 --bf16 1 | 35.04 | 109% | +# | --tf32 1 --fp16 0 | 35.47 | 111% | +# | --tf32 1 --fp16 1 | 47.82 | 149% | +# | --tf32 1 --bf16 1 | 35.11 | 109% | # # So you can quickly compare the different outcomes. # @@ -58,7 +75,7 @@ # # By default it'll use the lowest result as the base line to use as 100% and then compare the rest to # it as can be seen from the table above, but you can also specify which combination is the one to use as -# the baseline, e.g., to change to another entry use: --base-dim '--tf32 1 --fp16 0' +# the baseline, e.g., to change to another entry use: --base-variation '--tf32 1 --fp16 0' # # --target-metric-key is there to tell the program which metrics to compare - the different metric keys are # inside output_dir/all_results.json. e.g., to measure eval performance instead of train use @@ -70,6 +87,8 @@ import io import itertools import json +import os +import platform import re import shlex import subprocess @@ -84,6 +103,61 @@ import transformers +class Tee: + """ + A helper class to tee print's output into a file. 
+ Usage: + sys.stdout = Tee(filename) + """ + + def __init__(self, filename): + self.stdout = sys.stdout + self.file = open(filename, "a") + + def __getattr__(self, attr): + return getattr(self.stdout, attr) + + def write(self, msg): + self.stdout.write(msg) + # strip tqdm codes + self.file.write(re.sub(r"^.*\r", "", msg, 0, re.M)) + + +def get_orig_cmd(max_width=80, full_python_path=False): + """ + Return the original command line string that can be replayed nicely and wrapped for 80 char width + Args: + - max_width: the width to wrap for. defaults to 80 + - full_python_path: whether to replicate the full path or just the last part (i.e. `python`). default to `False` + """ + + cmd = [] + + # deal with critical env vars + env_keys = ["CUDA_VISIBLE_DEVICES"] + for key in env_keys: + val = os.environ.get(key, None) + if val is not None: + cmd.append(f"{key}={val}") + + # python executable (not always needed if the script is executable) + python = sys.executable if full_python_path else sys.executable.split("/")[-1] + cmd.append(python) + + # now the normal args + cmd += list(map(shlex.quote, sys.argv)) + + # split up into up to MAX_WIDTH lines with shell multi-line escapes + lines = [] + current_line = "" + while len(cmd) > 0: + current_line += f"{cmd.pop(0)} " + if len(cmd) == 0 or len(current_line) + len(cmd[0]) + 1 > max_width - 1: + lines.append(current_line) + current_line = "" + return "\\\n".join(lines) + + def get_base_cmd(args, output_dir): # unwrap multi-line input @@ -100,7 +174,7 @@ def get_base_cmd(args, output_dir): return [sys.executable] + shlex.split(args.base_cmd) -def process_run_single(id, cmd, opt, output_dir, target_metric_key, metric_keys, verbose): +def process_run_single(id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose): # enable to debug everything but the run itself, to do it fast and see the progress if 0: import random @@ -119,7 +193,7 @@ def process_run_single(id, cmd, opt, output_dir, target_metric_key, metric_keys, print("STDERR", result.stderr) # save the streams - prefix = opt.replace(" ", "-") + prefix = variation.replace(" ", "-") with open(Path(output_dir) / f"{prefix}.stdout.txt", "w") as f: f.write(result.stdout) with open(Path(output_dir) / f"{prefix}.stderr.txt", "w") as f: @@ -137,14 +211,27 @@ def process_run_single(id, cmd, opt, output_dir, target_metric_key, metric_keys, return {k: v for k, v in metrics.items() if k in metric_keys} -def process_run(id, cmd, opt_key, opt, target_metric_key, report_metric_keys, repeat_times, output_dir, verbose): +def process_run( + id, + cmd, + variation_key, + variation, + longest_variation_len, + target_metric_key, + report_metric_keys, + repeat_times, + output_dir, + verbose, +): results = [] metrics = [] - preamble = f"{id}: {opt}" + preamble = f"{id}: {variation:<{longest_variation_len}}" outcome = f"{preamble}: " metric_keys = set(report_metric_keys + [target_metric_key]) for i in tqdm(range(repeat_times), desc=preamble, leave=False): - single_run_metrics = process_run_single(id, cmd, opt, output_dir, target_metric_key, metric_keys, verbose) + single_run_metrics = process_run_single( + id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose + ) result = single_run_metrics[target_metric_key] if result != -1: metrics.append(single_run_metrics) @@ -161,34 +248,39 @@ def process_run(id, cmd, opt_key, opt, target_metric_key, report_metric_keys, re if successful_runs > 1: results_str += f" {tuple(round(x, 2) for x in results)}" print(results_str) - mean_metrics[opt_key] = opt 
+ mean_metrics[variation_key] = variation return mean_metrics else: print(outcome) - return {opt_key: opt, target_metric_key: -1} + return {variation_key: variation, target_metric_key: -1} def get_versions(): properties = torch.cuda.get_device_properties(torch.device("cuda")) return f""" Datetime : {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} + +Software: transformers: {transformers.__version__} torch : {torch.__version__} cuda : {torch.version.cuda} +python : {platform.python_version()} + +Hardware: {torch.cuda.device_count()} GPUs : {properties.name}, {properties.total_memory/2**30:0.2f}GB """ -def process_results(results, target_metric_key, report_metric_keys, base_dim, table_format, output_dir): +def process_results(results, target_metric_key, report_metric_keys, base_variation, table_format, output_dir): df = pd.DataFrame(results) variation_key = "variation" diff_key = "diff_%" sentinel_value = -1 - if base_dim is not None and len(df[df.variation == base_dim]): + if base_variation is not None and len(df[df[variation_key] == base_variation]): # this may still return -1 - sentinel_value = df.loc[df.variation == base_dim][target_metric_key] + sentinel_value = df.loc[df[variation_key] == base_variation][target_metric_key].item() if sentinel_value == -1: # as a fallback, use the minimal value as the sentinel sentinel_value = df.loc[df[target_metric_key] != -1][target_metric_key].min() @@ -196,7 +288,7 @@ def process_results(results, target_metric_key, report_metric_keys, base_dim, ta # create diff column if sentinel_value != -1: df[diff_key] = df.apply( - lambda r: int(100 * r[target_metric_key] / sentinel_value) if r[target_metric_key] != -1 else "✘", + lambda r: round(100 * r[target_metric_key] / sentinel_value) if r[target_metric_key] != -1 else "✘", axis="columns", ) @@ -216,11 +308,9 @@ def process_results(results, target_metric_key, report_metric_keys, base_dim, ta linebreak = "
" if table_format == "github" else "\n" df = df.rename(lambda c: c.replace("_", linebreak), axis="columns") - print("\n*** Results:\n") - print(df.to_markdown(index=False)) - print(f"\nNote: each run's output is also logged under {output_dir}/*.std*.txt") - print("\nPlease include the following information with your benchmark post:") - print(get_versions()) + print("", "*** Results:", df.to_markdown(index=False), get_versions(), sep="\n\n") + + print("The benchmark command line was:", get_orig_cmd(), sep="\n\n") def main(): @@ -233,18 +323,18 @@ def main(): help="Base cmd", ) parser.add_argument( - "--dims", + "--variations", default=None, type=str, nargs="+", required=True, - help="Dimension args", + help="Multi-dimensional variations, example: '|--fp16|--bf16' '|--tf32'", ) parser.add_argument( - "--base-dim", + "--base-variation", default=None, type=str, - help="Dimension base line arg. if None the minimal value will be used to compare against", + help="Baseline variation to compare to. if None the minimal target value will be used to compare against", ) parser.add_argument( "--target-metric-key", @@ -263,7 +353,7 @@ def main(): "--repeat-times", default=1, type=int, - help="How many times to re-run each combination - an average will be reported", + help="How many times to re-run each variation - an average will be reported", ) # table_format_choices parser.add_argument( @@ -285,26 +375,38 @@ def main(): base_cmd = get_base_cmd(args, output_dir) # split each dimension into its --foo variations - dims = [list(map(str.strip, re.split(r"(?=--)", x)[1:])) for x in args.dims] - # build a cartesian product of dimensions and convert those back into cmd-line arg strings - opts = list(map(" ".join, itertools.product(*dims))) + dims = [list(map(str.strip, re.split(r"\|", x))) for x in args.variations] + # build a cartesian product of dimensions and convert those back into cmd-line arg strings, + # while stripping white space for inputs that were empty + variations = list(map(str.strip, map(" ".join, itertools.product(*dims)))) + longest_variation_len = max(len(x) for x in variations) # split wanted keys report_metric_keys = args.report_metric_keys.split() - print(f"\n*** Running {len(opts)} benchmarks:") + # capture prints into a log file for convenience + report_fn = f"benchmark-report-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.txt" + print(f"\nNote: each run's output is also logged under {output_dir}/*.std*.txt") + print(f"and this script's output is also piped into {report_fn}") + + sys.stdout = Tee(report_fn) + + print(f"\n*** Running {len(variations)} benchmarks:") print(f"Base command: {' '.join(base_cmd)}") - opt_key = "variation" + # keys = {} + + variation_key = "variation" results = [] - for id, opt in enumerate(tqdm(opts, desc="Total completion: ", leave=False)): - cmd = base_cmd + opt.split() + for id, variation in enumerate(tqdm(variations, desc="Total completion: ", leave=False)): + cmd = base_cmd + variation.split() results.append( process_run( id + 1, cmd, - opt_key, - opt, + variation_key, + variation, + longest_variation_len, args.target_metric_key, report_metric_keys, args.repeat_times, @@ -313,7 +415,9 @@ def main(): ) ) - process_results(results, args.target_metric_key, report_metric_keys, args.base_dim, args.table_format, output_dir) + process_results( + results, args.target_metric_key, report_metric_keys, args.base_variation, args.table_format, output_dir + ) if __name__ == "__main__": From 1c0f69fbfd4eb778d43302d713b8577b4c2373bc Mon Sep 17 00:00:00 2001 From: 
Stas Bekman Date: Wed, 29 Dec 2021 17:22:55 -0800 Subject: [PATCH 06/10] improved --- scripts/benchmark/trainer-benchmark.py | 152 +++++++++++++------------ 1 file changed, 80 insertions(+), 72 deletions(-) diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py index af527c59da88..62418516bbe4 100755 --- a/scripts/benchmark/trainer-benchmark.py +++ b/scripts/benchmark/trainer-benchmark.py @@ -4,6 +4,9 @@ # # This tool can be used to run and compare multiple dimensions of the HF Trainers args # +# It then prints a report once in github format with all the information that needs to be shared +# with others and second time in a console-friendly format, so it's easier to use for tuning things up. +# # The main idea is: # # ./trainer-benchmark.py --base-cmd '' \ @@ -24,49 +27,54 @@ # # and print the results. This is just a cartesian product - and more than 2 dimensions can be used. # -# In a simpler way the same can be accomplished as: -# -# --variations '|--tf32' '|--fp16|--bf16' +# If you want to rely on defaults, this: +# --variations '--tf32 0|--tf32 1' '--fp16 0|--fp16 1|--bf16 1' +# is identical to this: +# --variations '--tf32 0|--tf32 1' '|--fp16|--bf16' # -# the leading empty variation is valid variation. +# the leading empty variation in the 2nd dimension is a valid variation. # # So here we get the following 6 variations: # -# 1. -# 2. --fp16 -# 3. --bf16 -# 4. --tf32 -# 5. --tf32 --fp16 -# 6. --tf32 --bf16 +# 1. --tf32 0 +# 2. --tf32 0 --fp16 +# 3. --tf32 0 --bf16 +# 4. --tf32 1 +# 5. --tf32 1 --fp16 +# 6. --tf32 1 --bf16 # +# In this particular case we don't know what the default tf32 setting is as it's normally +# pytorch-version dependent). That's why it's best to do an explicit setting of each variation: +# `--tf32 0|--tf32 1` # # Here is a full example of a train: # -# CUDA_VISIBLE_DEVICES=0 ./scripts/benchmark/trainer-benchmark.py \ -# --base-cmd ' \ -# examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --output_dir output_dir \ -# --do_train --label_smoothing 0.1 --logging_strategy no --save_strategy no --per_device_train_batch_size 8 \ -# --max_source_length 512 --max_target_length 512 --num_train_epochs 1 --overwrite_output_dir \ -# --source_lang en --target_lang ro --dataset_name wmt16 --dataset_config "ro-en" \ -# --source_prefix "translate English to Romanian: " --warmup_steps 50 \ -# --max_train_samples 5000 --dataloader_num_workers 2 \ -# ' \ -# --variations '--tf32 0|--tf32 1' '--fp16 0|--fp16 1|--bf16 1' \ -# --base-variation '--tf32 0 --fp16 0' \ -# --target-metric-key train_samples_per_second --repeat-times 1 +# CUDA_VISIBLE_DEVICES=0 python ./scripts/benchmark/trainer-benchmark.py \ +# --base-cmd \ +# ' examples/pytorch/translation/run_translation.py --model_name_or_path t5-small \ +# --output_dir output_dir --do_train --label_smoothing 0.1 --logging_strategy no \ +# --save_strategy no --per_device_train_batch_size 32 --max_source_length 512 \ +# --max_target_length 512 --num_train_epochs 1 --overwrite_output_dir \ +# --source_lang en --target_lang ro --dataset_name wmt16 --dataset_config "ro-en" \ +# --source_prefix "translate English to Romanian: " --warmup_steps 50 \ +# --max_train_samples 20000 --dataloader_num_workers 2 ' \ +# --target-metric-key train_samples_per_second --repeat-times 1 --variations \ +# '|--fp16|--bf16' '--tf32 0|--tf32 1' --report-metric-keys train_loss \ +# --repeat-times 1 --base-variation '--tf32 0' # # and here is a possible output: -# # XXX: outdated! 
-# *** Results: train_samples_per_second # -# | Variations | Result | % | -# | ----------------- | ------ | ----- | -# | --tf32 0 --fp16 0 | 31.95 | 100% | -# | --tf32 0 --fp16 1 | 47.88 | 149% | -# | --tf32 0 --bf16 1 | 35.04 | 109% | -# | --tf32 1 --fp16 0 | 35.47 | 111% | -# | --tf32 1 --fp16 1 | 47.82 | 149% | -# | --tf32 1 --bf16 1 | 35.11 | 109% | +# | Variation | Train | Diff | Train | +# | | samples | % | loss | +# | | per | | | +# | | second | | | +# |:----------------|----------:|-------:|--------:| +# | --tf32 0 | 286.07 | 100 | 2.51 | +# | --tf32 1 | 342.82 | 120 | 2.51 | +# | --fp16 --tf32 0 | 422.07 | 148 | 2.51 | +# | --fp16 --tf32 1 | 423.18 | 148 | 2.51 | +# | --bf16 --tf32 0 | 415.93 | 145 | 2.52 | +# | --bf16 --tf32 1 | 418.51 | 146 | 2.52 | # # So you can quickly compare the different outcomes. # @@ -78,15 +86,18 @@ # the baseline, e.g., to change to another entry use: --base-variation '--tf32 1 --fp16 0' # # --target-metric-key is there to tell the program which metrics to compare - the different metric keys are -# inside output_dir/all_results.json. e.g., to measure eval performance instead of train use -# --target-metric-key eval_samples_per_second - +# inside output_dir/all_results.json. e.g., to measure eval performance instead of train use: +# --target-metric-key eval_samples_per_second +# but of course you will need to adjust the --base-cmd value in the example to perform evaluation as +# well (as currently it doesn't) +# import argparse import datetime import io import itertools import json +import math import os import platform import re @@ -103,6 +114,9 @@ import transformers +nan = float("nan") + + class Tee: """ A helper class to tee print's output into a file. @@ -178,12 +192,12 @@ def process_run_single(id, cmd, variation, output_dir, target_metric_key, metric # enable to debug everything but the run itself, to do it fast and see the progress if 0: import random - from random import randint from time import sleep sleep(0) return dict( - {k: randint(1, 30) for k in metric_keys}, **{target_metric_key: random.choice([-1, 10, 100, 55, 222])} + {k: random.uniform(0, 100) for k in metric_keys}, + **{target_metric_key: random.choice([nan, 10.31, 100.2, 55.6666, 222.22222222])}, ) result = subprocess.run(cmd, capture_output=True, text=True) @@ -202,7 +216,7 @@ def process_run_single(id, cmd, variation, output_dir, target_metric_key, metric if result.returncode != 0: if verbose: print("failed") - return {target_metric_key: -1} + return {target_metric_key: nan} with io.open(f"{output_dir}/all_results.json", "r", encoding="utf-8") as f: metrics = json.load(f) @@ -233,26 +247,25 @@ def process_run( id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose ) result = single_run_metrics[target_metric_key] - if result != -1: + if not math.isnan(result): metrics.append(single_run_metrics) results.append(result) outcome += "✓" else: outcome += "✘" outcome = f"\33[2K\r{outcome}" - successful_runs = len(metrics) - if successful_runs > 0: - mean_metrics = {k: fmean([metrics[i][k] for i in range(successful_runs)]) for k in metrics[0].keys()} + if len(metrics) > 0: + mean_metrics = {k: fmean([x[k] for x in metrics]) for k in metrics[0].keys()} mean_target = round(mean_metrics[target_metric_key], 2) results_str = f"{outcome} {mean_target}" - if successful_runs > 1: + if len(metrics) > 1: results_str += f" {tuple(round(x, 2) for x in results)}" print(results_str) mean_metrics[variation_key] = variation return mean_metrics else: print(outcome) - return 
{variation_key: variation, target_metric_key: -1} + return {variation_key: variation, target_metric_key: nan} def get_versions(): @@ -271,32 +284,29 @@ def get_versions(): """ -def process_results(results, target_metric_key, report_metric_keys, base_variation, table_format, output_dir): +def process_results(results, target_metric_key, report_metric_keys, base_variation, output_dir): df = pd.DataFrame(results) variation_key = "variation" diff_key = "diff_%" - sentinel_value = -1 + sentinel_value = nan if base_variation is not None and len(df[df[variation_key] == base_variation]): - # this may still return -1 + # this may still return nan sentinel_value = df.loc[df[variation_key] == base_variation][target_metric_key].item() - if sentinel_value == -1: + if math.isnan(sentinel_value): # as a fallback, use the minimal value as the sentinel - sentinel_value = df.loc[df[target_metric_key] != -1][target_metric_key].min() + sentinel_value = df.loc[df[target_metric_key] != nan][target_metric_key].min() - # create diff column - if sentinel_value != -1: + # create diff column if possible + if not math.isnan(sentinel_value): df[diff_key] = df.apply( - lambda r: round(100 * r[target_metric_key] / sentinel_value) if r[target_metric_key] != -1 else "✘", + lambda r: round(100 * r[target_metric_key] / sentinel_value) + if not math.isnan(r[target_metric_key]) + else 0, axis="columns", ) - # deal with failed runs - df[target_metric_key] = df.apply( - lambda r: r[target_metric_key] if r[target_metric_key] != -1 else "✘", axis="columns" - ) - # re-order columns cols = [variation_key, target_metric_key, diff_key, *report_metric_keys] df = df.reindex(cols, axis="columns") # reorder cols @@ -305,12 +315,20 @@ def process_results(results, target_metric_key, report_metric_keys, base_variati df = df.rename(str.capitalize, axis="columns") # make the cols as narrow as possible - linebreak = "
" if table_format == "github" else "\n" - df = df.rename(lambda c: c.replace("_", linebreak), axis="columns") + df_github = df.rename(lambda c: c.replace("_", "
"), axis="columns") + df_console = df.rename(lambda c: c.replace("_", "\n"), axis="columns") - print("", "*** Results:", df.to_markdown(index=False), get_versions(), sep="\n\n") + report = ["", "Copy between the cut-here-lines and paste as is to github or a forum"] + report += ["----------8<-----------------8<--------"] + report += ["*** Results:", df_github.to_markdown(index=False, floatfmt=".2f")] + report += ["```"] + report += ["*** Setup:", get_versions()] + report += ["*** The benchmark command line was:", get_orig_cmd()] + report += ["```"] + report += ["----------8<-----------------8<--------"] + report += ["*** Results (console):", df_console.to_markdown(index=False, floatfmt=".2f")] - print("The benchmark command line was:", get_orig_cmd(), sep="\n\n") + print("\n\n".join(report)) def main(): @@ -355,14 +373,6 @@ def main(): type=int, help="How many times to re-run each variation - an average will be reported", ) - # table_format_choices - parser.add_argument( - "--table-format", - default="console", - type=str, - choices=["github", "console"], - help="Format the results table to render best in the destination use", - ) parser.add_argument( "--verbose", default=False, @@ -415,9 +425,7 @@ def main(): ) ) - process_results( - results, args.target_metric_key, report_metric_keys, args.base_variation, args.table_format, output_dir - ) + process_results(results, args.target_metric_key, report_metric_keys, args.base_variation, output_dir) if __name__ == "__main__": From bde5de4c9a52476860acd7da5834f7c1dd131220 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 3 Jan 2022 14:20:47 -0800 Subject: [PATCH 07/10] fix prefix --- scripts/benchmark/trainer-benchmark.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py index 62418516bbe4..a54fd002364c 100755 --- a/scripts/benchmark/trainer-benchmark.py +++ b/scripts/benchmark/trainer-benchmark.py @@ -208,9 +208,9 @@ def process_run_single(id, cmd, variation, output_dir, target_metric_key, metric # save the streams prefix = variation.replace(" ", "-") - with open(Path(output_dir) / f"{prefix}.stdout.txt", "w") as f: + with open(Path(output_dir) / f"log.{prefix}.stdout.txt", "w") as f: f.write(result.stdout) - with open(Path(output_dir) / f"{prefix}.stderr.txt", "w") as f: + with open(Path(output_dir) / f"log.{prefix}.stderr.txt", "w") as f: f.write(result.stderr) if result.returncode != 0: @@ -396,7 +396,7 @@ def main(): # capture prints into a log file for convenience report_fn = f"benchmark-report-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.txt" - print(f"\nNote: each run's output is also logged under {output_dir}/*.std*.txt") + print(f"\nNote: each run's output is also logged under {output_dir}/log.*.std*.txt") print(f"and this script's output is also piped into {report_fn}") sys.stdout = Tee(report_fn) From 968d86f81a248352c43ce77c98d1cfc73b660ac9 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 3 Jan 2022 21:14:13 -0800 Subject: [PATCH 08/10] fix --- scripts/benchmark/trainer-benchmark.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py index a54fd002364c..4fa268794855 100755 --- a/scripts/benchmark/trainer-benchmark.py +++ b/scripts/benchmark/trainer-benchmark.py @@ -382,6 +382,7 @@ def main(): args = parser.parse_args() output_dir = "output_benchmark" + Path(output_dir).mkdir(exist_ok=True) base_cmd = get_base_cmd(args, output_dir) # split 
each dimension into its --foo variations From b9e2a12cc7d1acf6c46b488e46cec3107fd328e5 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 4 Jan 2022 12:54:09 -0800 Subject: [PATCH 09/10] fix diff calculation --- scripts/benchmark/trainer-benchmark.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py index 4fa268794855..3188290a9dd4 100755 --- a/scripts/benchmark/trainer-benchmark.py +++ b/scripts/benchmark/trainer-benchmark.py @@ -64,17 +64,19 @@ # # and here is a possible output: # -# | Variation | Train | Diff | Train | -# | | samples | % | loss | -# | | per | | | -# | | second | | | -# |:----------------|----------:|-------:|--------:| -# | --tf32 0 | 286.07 | 100 | 2.51 | -# | --tf32 1 | 342.82 | 120 | 2.51 | -# | --fp16 --tf32 0 | 422.07 | 148 | 2.51 | -# | --fp16 --tf32 1 | 423.18 | 148 | 2.51 | -# | --bf16 --tf32 0 | 415.93 | 145 | 2.52 | -# | --bf16 --tf32 1 | 418.51 | 146 | 2.52 | +# +# | Variation | Train | Diff | Train | +# | | samples | % | loss | +# | | per | | | +# | | second | | | +# |:----------------|----------:|-------:|--------:| +# | --tf32 0 | 285.11 | 0 | 2.51 | +# | --tf32 1 | 342.09 | 20 | 2.51 | +# | --fp16 --tf32 0 | 423.49 | 49 | 2.51 | +# | --fp16 --tf32 1 | 423.13 | 48 | 2.51 | +# | --bf16 --tf32 0 | 416.80 | 46 | 2.52 | +# | --bf16 --tf32 1 | 415.87 | 46 | 2.52 | +# # # So you can quickly compare the different outcomes. # @@ -301,7 +303,7 @@ def process_results(results, target_metric_key, report_metric_keys, base_variati # create diff column if possible if not math.isnan(sentinel_value): df[diff_key] = df.apply( - lambda r: round(100 * r[target_metric_key] / sentinel_value) + lambda r: round(100 * (r[target_metric_key] - sentinel_value) / sentinel_value) if not math.isnan(r[target_metric_key]) else 0, axis="columns", From 6a875ea8b2511a54a2105bcb78426012c29021ea Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 15 Feb 2022 17:20:44 -0800 Subject: [PATCH 10/10] address suggestions --- scripts/benchmark/trainer-benchmark.py | 37 +++++++++++++++++--------- 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py index 3188290a9dd4..903b4e0dd6d5 100755 --- a/scripts/benchmark/trainer-benchmark.py +++ b/scripts/benchmark/trainer-benchmark.py @@ -2,7 +2,7 @@ # HF Trainer benchmarking tool # -# This tool can be used to run and compare multiple dimensions of the HF Trainers args +# This tool can be used to run and compare multiple dimensions of the HF Trainers args. # # It then prints a report once in github format with all the information that needs to be shared # with others and second time in a console-friendly format, so it's easier to use for tuning things up. @@ -13,6 +13,9 @@ # --variations '--tf32 0|--tf32 1' '--fp16 0|--fp16 1|--bf16 1' \ # --target-metric-key train_samples_per_second # +# The variations can be any command line argument that you want to compare and not just dtype as in +# the example. +# # --variations allows you to compare variations in multiple dimensions. 
# # as the first dimention has 2 options and the second 3 in our example, this will run the trainer 6 @@ -139,12 +142,15 @@ def write(self, msg): self.file.write(re.sub(r"^.*\r", "", msg, 0, re.M)) -def get_orig_cmd(max_width=80, full_python_path=False): +def get_original_command(max_width=80, full_python_path=False): """ - Return the original command line string that can be replayed nicely and wrapped for 80 char width + Return the original command line string that can be replayed nicely and wrapped for 80 char width. + Args: - - max_width: the width to wrap for. defaults to 80 - - full_python_path: whether to replicate the full path or just the last part (i.e. `python`). default to `False` + max_width (`int`, `optional`, defaults to 80): + The width to wrap for. + full_python_path (`bool`, `optional`, defaults to `False`): + Whether to replicate the full path or just the last segment (i.e. `python`). """ cmd = [] @@ -174,7 +180,7 @@ def get_orig_cmd(max_width=80, full_python_path=False): return "\\\n".join(lines) -def get_base_cmd(args, output_dir): +def get_base_command(args, output_dir): # unwrap multi-line input args.base_cmd = re.sub(r"[\\\n]+", " ", args.base_cmd) @@ -191,7 +197,10 @@ def get_base_cmd(args, output_dir): def process_run_single(id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose): - # enable to debug everything but the run itself, to do it fast and see the progress + + # Enable to debug everything but the run itself, to do it fast and see the progress. + # This is useful for debugging the output formatting quickly - we can remove it later once + # everybody is happy with the output if 0: import random from time import sleep @@ -325,7 +334,7 @@ def process_results(results, target_metric_key, report_metric_keys, base_variati report += ["*** Results:", df_github.to_markdown(index=False, floatfmt=".2f")] report += ["```"] report += ["*** Setup:", get_versions()] - report += ["*** The benchmark command line was:", get_orig_cmd()] + report += ["*** The benchmark command line was:", get_original_command()] report += ["```"] report += ["----------8<-----------------8<--------"] report += ["*** Results (console):", df_console.to_markdown(index=False, floatfmt=".2f")] @@ -375,6 +384,12 @@ def main(): type=int, help="How many times to re-run each variation - an average will be reported", ) + parser.add_argument( + "--output_dir", + default="output_benchmark", + type=str, + help="The output directory where all the benchmark reports will go to and additionally this directory will be used to override --output_dir in the script that is being benchmarked", + ) parser.add_argument( "--verbose", default=False, @@ -383,9 +398,9 @@ def main(): ) args = parser.parse_args() - output_dir = "output_benchmark" + output_dir = args.output_dir Path(output_dir).mkdir(exist_ok=True) - base_cmd = get_base_cmd(args, output_dir) + base_cmd = get_base_command(args, output_dir) # split each dimension into its --foo variations dims = [list(map(str.strip, re.split(r"\|", x))) for x in args.variations] @@ -407,8 +422,6 @@ def main(): print(f"\n*** Running {len(variations)} benchmarks:") print(f"Base command: {' '.join(base_cmd)}") - # keys = {} - variation_key = "variation" results = [] for id, variation in enumerate(tqdm(variations, desc="Total completion: ", leave=False)):
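
Note (editor's addendum, not part of the patches above): a minimal standalone Python sketch of the variation-expansion step that the final version of the script performs in main() — each --variations dimension is split on "|" and the dimensions are combined with a cartesian product into the concrete command-line additions. The variable names here are illustrative, not taken verbatim from the patch.

import itertools
import re

# example inputs, mirroring the documented usage
variations_args = ["--tf32 0|--tf32 1", "|--fp16|--bf16"]

# split each dimension into its individual variations
dims = [[v.strip() for v in re.split(r"\|", x)] for x in variations_args]

# cartesian product of dimensions, re-joined into cmd-line argument strings,
# stripping whitespace left over from empty variations
variations = [" ".join(p).strip() for p in itertools.product(*dims)]

for v in variations:
    print(repr(v))
# prints 6 variations: '--tf32 0', '--tf32 0 --fp16', '--tf32 0 --bf16',
#                      '--tf32 1', '--tf32 1 --fp16', '--tf32 1 --bf16'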
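
Likewise, a hedged sketch of the Diff % column after the "fix diff calculation" commit (PATCH 09): the column reports the change relative to the baseline (sentinel) value, so the baseline row shows 0 rather than 100, and failed runs (NaN) are reported as 0. The helper name below is hypothetical; only the formula is taken from the patch.

import math

def diff_percent(value, baseline):
    # relative difference in percent, rounded; 0 for failed runs (NaN)
    if math.isnan(value):
        return 0
    return round(100 * (value - baseline) / baseline)

baseline = 285.11  # e.g., the '--tf32 0' row from the sample table in PATCH 09
print(diff_percent(285.11, baseline))  # 0
print(diff_percent(342.09, baseline))  # 20
print(diff_percent(423.49, baseline))  # 49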