diff --git a/benchmarks/sizing/README.md b/benchmarks/sizing/README.md index 0cc4e22..fa68c6a 100644 --- a/benchmarks/sizing/README.md +++ b/benchmarks/sizing/README.md @@ -26,6 +26,7 @@ Example for mm_flops.py: python mm_flops.py -m 1024 -k 1024 -n 1024 2048 Example for mm_flops.py with range option: python mm_flops.py -m 1024 -k 1024 --n_range 1024 2048 256 usage: mm_flops.py [-h] (-m M [M ...] | --m_range M_RANGE [M_RANGE ...]) (-n [N ...] | --n_range N_RANGE [N_RANGE ...])(-k [K ...] | --k_range K_RANGE [K_RANGE ...]) [--num_iterations NUM_ITERATIONS] [--num_warmup_iterations NUM_WARMUP_ITERATIONS] [--cuda_device CUDA_DEVICE] [--output_file OUTPUT_FILE] +[--notes NOTES] [--verbose | --no-verbose] options: -h, --help show this help message and exit @@ -45,6 +46,7 @@ options: --cuda_device CUDA_DEVICE The cuda device to run the benchmark on --output_file OUTPUT_FILE + --notes NOTES benchmark-specific notes to add to the output_file's header --verbose, --no-verbose log to stdout besides output_file? (default: True) ``` @@ -55,6 +57,7 @@ options: Example for bmm_flops.py: python bmm_flops.py -m 1024 -k 1024 -n 1024 2048 -b 128 usage: bmm_flops.py [-h] (-b B [B ...] | --b_range B_RANGE [B_RANGE ...]) (-m M [M ...] | --m_range M_RANGE [M_RANGE ...])(-n [N ...] | --n_range N_RANGE [N_RANGE ...]) (-k [K ...] | --k_range K_RANGE [K_RANGE ...]) [--num_iterations NUM_ITERATIONS] [--num_warmup_iterations NUM_WARMUP_ITERATIONS] [--cuda_device CUDA_DEVICE][--output_file OUTPUT_FILE] +[--notes NOTES] [--verbose | --no-verbose] options: -h, --help show this help message and exit @@ -77,6 +80,7 @@ options: --cuda_device CUDA_DEVICE The cuda device to run the benchmark on --output_file OUTPUT_FILE + --notes NOTES benchmark-specific notes to add to the output_file's header --verbose, --no-verbose log to stdout besides output_file? (default: True) ``` @@ -97,6 +101,7 @@ usage: transformer_flops.py [-h] (--tensor_mp_size TENSOR_MP_SIZE [TENSOR_MP_SIZE ...] 
| --tensor_mp_size_range TENSOR_MP_SIZE_RANGE [TENSOR_MP_SIZE_RANGE ...]) [--blocks BLOCKS [BLOCKS ...]] [--use_flash] [--num_iterations NUM_ITERATIONS] [--num_warmup_iterations NUM_WARMUP_ITERATIONS] [--cuda_device CUDA_DEVICE] [--output_file OUTPUT_FILE] + [--notes NOTES] [--verbose | --no-verbose] options: -h, --help show this help message and exit @@ -140,6 +145,7 @@ options: --cuda_device CUDA_DEVICE The cuda device to run the benchmark on --output_file OUTPUT_FILE + --notes NOTES benchmark-specific notes to add to the output_file's header --verbose, --no-verbose log to stdout besides output_file? (default: True) ``` diff --git a/benchmarks/sizing/bmm_flops.py b/benchmarks/sizing/bmm_flops.py index 78ede8b..add548e 100644 --- a/benchmarks/sizing/bmm_flops.py +++ b/benchmarks/sizing/bmm_flops.py @@ -5,7 +5,7 @@ import argparse import os -from utils import Tee, benchmark_bmm +from utils import Tee, benchmark_bmm, print_benchmark_header file_dir = os.path.abspath(os.path.dirname(__file__)) @@ -30,6 +30,7 @@ parser.add_argument("--num_iterations", type=int, default=200, help='The number of iterations used to benchmark each BMM') parser.add_argument("--num_warmup_iterations", type=int, default=50, help='The number of warmup iterations') parser.add_argument("--cuda_device", type=int, default=0, help="The cuda device to run the benchmark on") + parser.add_argument("--notes", type=str, default="", help="benchmark-specific notes to add to the output_file's header") parser.add_argument("--output_file", type=str, default=f"{file_dir}/results/bmm.out") parser.add_argument("--verbose", default=True, action=argparse.BooleanOptionalAction, help='log to stdout besides output_file?') args = parser.parse_args() @@ -56,6 +57,7 @@ torch.cuda.set_device(f"cuda:{args.cuda_device}") sys.stdout = Tee(args.output_file, args.verbose) + print_benchmark_header(args.notes) # loop through all sizes to benchmark for B in b: diff --git a/benchmarks/sizing/mm_flops.py 
b/benchmarks/sizing/mm_flops.py index 070a2ed..d57942a 100644 --- a/benchmarks/sizing/mm_flops.py +++ b/benchmarks/sizing/mm_flops.py @@ -5,7 +5,7 @@ import argparse import os -from utils import Tee, benchmark_mm +from utils import Tee, benchmark_mm, print_benchmark_header file_dir = os.path.abspath(os.path.dirname(__file__)) @@ -27,6 +27,7 @@ parser.add_argument("--num_warmup_iterations", type=int, default=50, help='The number of warmup iterations') parser.add_argument("--cuda_device", type=int, default=0, help="The cuda device to run the benchmark on") parser.add_argument("--output_file", type=str, default=f"{file_dir}/results/mm.out") + parser.add_argument("--notes", type=str, default="", help="benchmark-specific notes to add to the output_file's header") parser.add_argument("--verbose", default=True, action=argparse.BooleanOptionalAction, help='log to stdout besides output_file?') args = parser.parse_args() @@ -48,6 +49,7 @@ torch.cuda.set_device(f"cuda:{args.cuda_device}") sys.stdout = Tee(args.output_file, args.verbose) + print_benchmark_header(args.notes) # loop through all sizes to benchmark for M in m: diff --git a/benchmarks/sizing/transformer_flops.py b/benchmarks/sizing/transformer_flops.py index 24a0e8f..47a86c5 100644 --- a/benchmarks/sizing/transformer_flops.py +++ b/benchmarks/sizing/transformer_flops.py @@ -235,6 +235,7 @@ def benchmark_transformer(c_args,configuration, seq_length, global_batch_size, n parser.add_argument("--num_iterations", type=int, default=200, help='The number of iterations used to benchmark each BMM') parser.add_argument("--num_warmup_iterations", type=int, default=50, help='The number of warmup iterations') parser.add_argument("--cuda_device", type=int, default=0, help="The cuda device to run the benchmark on") + parser.add_argument("--notes", type=str, default="", help="benchmark-specific notes to add to the output_file's header") parser.add_argument("--output_file", type=str, default=f"{file_dir}/results/mm.out") 
parser.add_argument("--verbose", default=True, action=argparse.BooleanOptionalAction, help='log to stdout besides output_file?') args = parser.parse_args() @@ -272,6 +273,7 @@ def benchmark_transformer(c_args,configuration, seq_length, global_batch_size, n torch.cuda.set_device(f"cuda:{args.cuda_device}") sys.stdout = Tee(args.output_file, args.verbose) + print_benchmark_header(args.notes) configurations = [] for train_batch_size in global_batch_size:
diff --git a/benchmarks/sizing/utils.py b/benchmarks/sizing/utils.py
index 2d7edc3..70e197b 100644
--- a/benchmarks/sizing/utils.py
+++ b/benchmarks/sizing/utils.py
@@ -1,4 +1,7 @@
+import platform
 import sys
+import shlex
+import time
 import torch
 import numpy as np
 from pathlib import Path
@@ -10,6 +13,43 @@
 from megatron.model.gpt2_model import gpt2_attention_mask_func as attention_mask_func
 from megatron.model.word_embeddings import Embedding
 
+def print_benchmark_header(notes="None"):
+    """Print a reproducibility header for a benchmark run.
+
+    Writes the start time, the shell-quoted command line, the platform
+    string, the torch/cuda/nccl versions and the caller's notes with
+    ``print``, i.e. to whatever ``sys.stdout`` is at call time.
+
+    Args:
+        notes: text for the "Additional notes" section; an empty value is
+            shown as "None".
+    """
+    # The NCCL version is informational only; a torch build that cannot
+    # report it should not abort the benchmark.
+    try:
+        nccl_version = torch.cuda.nccl.version()
+    except (AttributeError, RuntimeError):
+        nccl_version = "unavailable"
+
+    print(f"""
+Benchmark started on {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}
+
+** Command line:
+{sys.executable} {" ".join(map(shlex.quote, sys.argv))}
+
+** Platform:
+{" ".join(platform.uname())}
+
+** Critical component versions:
+torch={torch.__version__}, cuda={torch.version.cuda}, nccl={nccl_version}
+
+** Additional notes:
+{notes or "None"}
+
+{"-" * 80}
+
+""")
+
 class Tee(object):
     def __init__(self, filename, verbose):
         Path(filename).resolve().parent.mkdir(parents=True, exist_ok=True)