diff --git a/benchmarks/sizing/README.md b/benchmarks/sizing/README.md
index 0cc4e22..fa68c6a 100644
--- a/benchmarks/sizing/README.md
+++ b/benchmarks/sizing/README.md
@@ -26,6 +26,7 @@ Example for mm_flops.py: python mm_flops.py -m 1024 -k 1024 -n 1024 2048
 Example for mm_flops.py with range option: python mm_flops.py -m 1024 -k 1024 --n_range 1024 2048 256
 usage: mm_flops.py [-h] (-m M [M ...] | --m_range M_RANGE [M_RANGE ...]) (-n [N ...] | --n_range N_RANGE [N_RANGE ...])(-k [K ...] | --k_range K_RANGE [K_RANGE ...]) [--num_iterations NUM_ITERATIONS]
 [--num_warmup_iterations NUM_WARMUP_ITERATIONS] [--cuda_device CUDA_DEVICE] [--output_file OUTPUT_FILE]
+[--notes NOTES] [--verbose | --no-verbose]
 
 options:
   -h, --help            show this help message and exit
@@ -45,6 +46,7 @@ options:
   --cuda_device CUDA_DEVICE
                         The cuda device to run the benchmark on
   --output_file OUTPUT_FILE
+  --notes NOTES         benchmark-specific notes to add to the output_file's header
   --verbose, --no-verbose
                         log to stdout besides output_file? (default: True)
 ```
@@ -55,6 +57,7 @@ options:
 Example for bmm_flops.py: python bmm_flops.py -m 1024 -k 1024 -n 1024 2048 -b 128
 usage: bmm_flops.py [-h] (-b B [B ...] | --b_range B_RANGE [B_RANGE ...]) (-m M [M ...] | --m_range M_RANGE [M_RANGE ...])(-n [N ...] | --n_range N_RANGE [N_RANGE ...]) (-k [K ...] | --k_range K_RANGE [K_RANGE ...])
 [--num_iterations NUM_ITERATIONS] [--num_warmup_iterations NUM_WARMUP_ITERATIONS] [--cuda_device CUDA_DEVICE][--output_file OUTPUT_FILE]
+[--notes NOTES] [--verbose | --no-verbose]
 
 options:
   -h, --help            show this help message and exit
@@ -77,6 +80,7 @@ options:
   --cuda_device CUDA_DEVICE
                         The cuda device to run the benchmark on
   --output_file OUTPUT_FILE
+  --notes NOTES         benchmark-specific notes to add to the output_file's header
   --verbose, --no-verbose
                         log to stdout besides output_file? (default: True)
 ```
@@ -97,6 +101,7 @@ usage: transformer_flops.py [-h]
                             (--tensor_mp_size TENSOR_MP_SIZE [TENSOR_MP_SIZE ...] | --tensor_mp_size_range TENSOR_MP_SIZE_RANGE [TENSOR_MP_SIZE_RANGE ...])
                             [--blocks BLOCKS [BLOCKS ...]] [--use_flash] [--num_iterations NUM_ITERATIONS]
                             [--num_warmup_iterations NUM_WARMUP_ITERATIONS] [--cuda_device CUDA_DEVICE] [--output_file OUTPUT_FILE]
+                            [--notes NOTES] [--verbose | --no-verbose]
 
 options:
   -h, --help            show this help message and exit
@@ -140,6 +145,7 @@ options:
   --cuda_device CUDA_DEVICE
                         The cuda device to run the benchmark on
   --output_file OUTPUT_FILE
+  --notes NOTES         benchmark-specific notes to add to the output_file's header
   --verbose, --no-verbose
                         log to stdout besides output_file? (default: True)
 ```
diff --git a/benchmarks/sizing/bmm_flops.py b/benchmarks/sizing/bmm_flops.py
index 78ede8b..add548e 100644
--- a/benchmarks/sizing/bmm_flops.py
+++ b/benchmarks/sizing/bmm_flops.py
@@ -5,7 +5,7 @@
 import argparse
 import os
 
-from utils import Tee, benchmark_bmm
+from utils import Tee, benchmark_bmm, print_benchmark_header
 
 file_dir = os.path.abspath(os.path.dirname(__file__))
 
@@ -30,6 +30,7 @@
     parser.add_argument("--num_iterations", type=int, default=200, help='The number of iterations used to benchmark each BMM')
     parser.add_argument("--num_warmup_iterations", type=int, default=50, help='The number of warmup iterations')
     parser.add_argument("--cuda_device", type=int, default=0, help="The cuda device to run the benchmark on")
+    parser.add_argument("--notes", type=str, default="", help="benchmark-specific notes to add to the output_file's header")
     parser.add_argument("--output_file", type=str, default=f"{file_dir}/results/bmm.out")
     parser.add_argument("--verbose", default=True, action=argparse.BooleanOptionalAction, help='log to stdout besides output_file?')
     args = parser.parse_args()
@@ -56,6 +57,7 @@
     torch.cuda.set_device(f"cuda:{args.cuda_device}")
 
     sys.stdout = Tee(args.output_file, args.verbose)
+    print_benchmark_header(args.notes)
 
     # loop through all sizes to benchmark
     for B in b:
diff --git a/benchmarks/sizing/mm_flops.py b/benchmarks/sizing/mm_flops.py
index 070a2ed..d57942a 100644
--- a/benchmarks/sizing/mm_flops.py
+++ b/benchmarks/sizing/mm_flops.py
@@ -5,7 +5,7 @@
 import argparse
 import os
 
-from utils import Tee, benchmark_mm
+from utils import Tee, benchmark_mm, print_benchmark_header
 
 file_dir = os.path.abspath(os.path.dirname(__file__))
 
@@ -27,6 +27,7 @@
     parser.add_argument("--num_warmup_iterations", type=int, default=50, help='The number of warmup iterations')
     parser.add_argument("--cuda_device", type=int, default=0, help="The cuda device to run the benchmark on")
     parser.add_argument("--output_file", type=str, default=f"{file_dir}/results/mm.out")
+    parser.add_argument("--notes", type=str, default="", help="benchmark-specific notes to add to the output_file's header")
     parser.add_argument("--verbose", default=True, action=argparse.BooleanOptionalAction, help='log to stdout besides output_file?')
     args = parser.parse_args()
 
@@ -48,6 +49,7 @@
     torch.cuda.set_device(f"cuda:{args.cuda_device}")
 
     sys.stdout = Tee(args.output_file, args.verbose)
+    print_benchmark_header(args.notes)
 
     # loop through all sizes to benchmark
     for M in m:
diff --git a/benchmarks/sizing/transformer_flops.py b/benchmarks/sizing/transformer_flops.py
index 24a0e8f..47a86c5 100644
--- a/benchmarks/sizing/transformer_flops.py
+++ b/benchmarks/sizing/transformer_flops.py
@@ -235,6 +235,7 @@ def benchmark_transformer(c_args,configuration, seq_length, global_batch_size, n
     parser.add_argument("--num_iterations", type=int, default=200, help='The number of iterations used to benchmark each BMM')
     parser.add_argument("--num_warmup_iterations", type=int, default=50, help='The number of warmup iterations')
     parser.add_argument("--cuda_device", type=int, default=0, help="The cuda device to run the benchmark on")
+    parser.add_argument("--notes", type=str, default="", help="benchmark-specific notes to add to the output_file's header")
     parser.add_argument("--output_file", type=str, default=f"{file_dir}/results/mm.out")
     parser.add_argument("--verbose", default=True, action=argparse.BooleanOptionalAction, help='log to stdout besides output_file?')
     args = parser.parse_args()
@@ -272,6 +273,7 @@ def benchmark_transformer(c_args,configuration, seq_length, global_batch_size, n
     torch.cuda.set_device(f"cuda:{args.cuda_device}")
 
     sys.stdout = Tee(args.output_file, args.verbose)
+    print_benchmark_header(args.notes)
 
     configurations = []
     for train_batch_size in global_batch_size:
diff --git a/benchmarks/sizing/utils.py b/benchmarks/sizing/utils.py
index 2d7edc3..70e197b 100644
--- a/benchmarks/sizing/utils.py
+++ b/benchmarks/sizing/utils.py
@@ -1,4 +1,7 @@
+import platform
 import sys
+import shlex
+import time
 import torch
 import numpy as np
 from pathlib import Path
@@ -10,6 +13,27 @@
 from megatron.model.gpt2_model import gpt2_attention_mask_func as attention_mask_func
 from megatron.model.word_embeddings import Embedding
 
+def print_benchmark_header(notes="None"):
+    """Print the run header (start time, command line, platform, torch/cuda/nccl versions); empty `notes` prints as None."""
+    print(f"""
+Benchmark started on {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}
+
+** Command line:
+{sys.executable} {" ".join(map(shlex.quote, sys.argv))}
+
+** Platform:
+{" ".join(platform.uname())}
+
+** Critical component versions:
+torch={torch.__version__}, cuda={torch.version.cuda}, nccl={torch.cuda.nccl.version()}
+
+** Additional notes: 
+{notes or "None"}
+
+{"-" * 80}
+
+""")
+
 class Tee(object):
     def __init__(self, filename, verbose):
         Path(filename).resolve().parent.mkdir(parents=True, exist_ok=True)