Skip to content

Commit

Permalink
self-document the benchmark's setup (#29)
Browse files Browse the repository at this point in the history
* self-document the benchmark

* fix

* add platform
  • Loading branch information
stas00 authored Feb 19, 2024
1 parent 6218607 commit 939fa3c
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 2 deletions.
6 changes: 6 additions & 0 deletions benchmarks/sizing/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ Example for mm_flops.py: python mm_flops.py -m 1024 -k 1024 -n 1024 2048
Example for mm_flops.py with range option: python mm_flops.py -m 1024 -k 1024 --n_range 1024 2048 256
usage: mm_flops.py [-h] (-m M [M ...] | --m_range M_RANGE [M_RANGE ...]) (-n [N ...] | --n_range N_RANGE [N_RANGE ...]) (-k [K ...] | --k_range K_RANGE [K_RANGE ...]) [--num_iterations NUM_ITERATIONS]
[--num_warmup_iterations NUM_WARMUP_ITERATIONS] [--cuda_device CUDA_DEVICE] [--output_file OUTPUT_FILE]
[--notes NOTES] [--verbose | --no-verbose]
options:
-h, --help show this help message and exit
Expand All @@ -45,6 +46,7 @@ options:
--cuda_device CUDA_DEVICE
The cuda device to run the benchmark on
--output_file OUTPUT_FILE
--notes NOTES benchmark-specific notes to add to the output_file's header
--verbose, --no-verbose
log to stdout besides output_file? (default: True)
```
Expand All @@ -55,6 +57,7 @@ options:
Example for bmm_flops.py: python bmm_flops.py -m 1024 -k 1024 -n 1024 2048 -b 128
usage: bmm_flops.py [-h] (-b B [B ...] | --b_range B_RANGE [B_RANGE ...]) (-m M [M ...] | --m_range M_RANGE [M_RANGE ...]) (-n [N ...] | --n_range N_RANGE [N_RANGE ...]) (-k [K ...] | --k_range K_RANGE [K_RANGE ...])
[--num_iterations NUM_ITERATIONS] [--num_warmup_iterations NUM_WARMUP_ITERATIONS] [--cuda_device CUDA_DEVICE] [--output_file OUTPUT_FILE]
[--notes NOTES] [--verbose | --no-verbose]
options:
-h, --help show this help message and exit
Expand All @@ -77,6 +80,7 @@ options:
--cuda_device CUDA_DEVICE
The cuda device to run the benchmark on
--output_file OUTPUT_FILE
--notes NOTES benchmark-specific notes to add to the output_file's header
--verbose, --no-verbose
log to stdout besides output_file? (default: True)
```
Expand All @@ -97,6 +101,7 @@ usage: transformer_flops.py [-h]
(--tensor_mp_size TENSOR_MP_SIZE [TENSOR_MP_SIZE ...] | --tensor_mp_size_range TENSOR_MP_SIZE_RANGE [TENSOR_MP_SIZE_RANGE ...])
[--blocks BLOCKS [BLOCKS ...]] [--use_flash] [--num_iterations NUM_ITERATIONS]
[--num_warmup_iterations NUM_WARMUP_ITERATIONS] [--cuda_device CUDA_DEVICE] [--output_file OUTPUT_FILE]
[--notes NOTES] [--verbose | --no-verbose]
options:
-h, --help show this help message and exit
Expand Down Expand Up @@ -140,6 +145,7 @@ options:
--cuda_device CUDA_DEVICE
The cuda device to run the benchmark on
--output_file OUTPUT_FILE
--notes NOTES benchmark-specific notes to add to the output_file's header
--verbose, --no-verbose
log to stdout besides output_file? (default: True)
```
Expand Down
4 changes: 3 additions & 1 deletion benchmarks/sizing/bmm_flops.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import argparse
import os

from utils import Tee, benchmark_bmm
from utils import Tee, benchmark_bmm, print_benchmark_header

file_dir = os.path.abspath(os.path.dirname(__file__))

Expand All @@ -30,6 +30,7 @@
parser.add_argument("--num_iterations", type=int, default=200, help='The number of iterations used to benchmark each BMM')
parser.add_argument("--num_warmup_iterations", type=int, default=50, help='The number of warmup iterations')
parser.add_argument("--cuda_device", type=int, default=0, help="The cuda device to run the benchmark on")
parser.add_argument("--notes", type=str, default="", help="benchmark-specific notes to add to the output_file's header")
parser.add_argument("--output_file", type=str, default=f"{file_dir}/results/bmm.out")
parser.add_argument("--verbose", default=True, action=argparse.BooleanOptionalAction, help='log to stdout besides output_file?')
args = parser.parse_args()
Expand All @@ -56,6 +57,7 @@
torch.cuda.set_device(f"cuda:{args.cuda_device}")

sys.stdout = Tee(args.output_file, args.verbose)
print_benchmark_header(args.notes)

# loop through all sizes to benchmark
for B in b:
Expand Down
4 changes: 3 additions & 1 deletion benchmarks/sizing/mm_flops.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import argparse
import os

from utils import Tee, benchmark_mm
from utils import Tee, benchmark_mm, print_benchmark_header

file_dir = os.path.abspath(os.path.dirname(__file__))

Expand All @@ -27,6 +27,7 @@
parser.add_argument("--num_warmup_iterations", type=int, default=50, help='The number of warmup iterations')
parser.add_argument("--cuda_device", type=int, default=0, help="The cuda device to run the benchmark on")
parser.add_argument("--output_file", type=str, default=f"{file_dir}/results/mm.out")
parser.add_argument("--notes", type=str, default="", help="benchmark-specific notes to add to the output_file's header")
parser.add_argument("--verbose", default=True, action=argparse.BooleanOptionalAction, help='log to stdout besides output_file?')
args = parser.parse_args()

Expand All @@ -48,6 +49,7 @@
torch.cuda.set_device(f"cuda:{args.cuda_device}")

sys.stdout = Tee(args.output_file, args.verbose)
print_benchmark_header(args.notes)

# loop through all sizes to benchmark
for M in m:
Expand Down
2 changes: 2 additions & 0 deletions benchmarks/sizing/transformer_flops.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ def benchmark_transformer(c_args,configuration, seq_length, global_batch_size, n
parser.add_argument("--num_iterations", type=int, default=200, help='The number of iterations used to benchmark each BMM')
parser.add_argument("--num_warmup_iterations", type=int, default=50, help='The number of warmup iterations')
parser.add_argument("--cuda_device", type=int, default=0, help="The cuda device to run the benchmark on")
parser.add_argument("--notes", type=str, default="", help="benchmark-specific notes to add to the output_file's header")
parser.add_argument("--output_file", type=str, default=f"{file_dir}/results/mm.out")
parser.add_argument("--verbose", default=True, action=argparse.BooleanOptionalAction, help='log to stdout besides output_file?')
args = parser.parse_args()
Expand Down Expand Up @@ -272,6 +273,7 @@ def benchmark_transformer(c_args,configuration, seq_length, global_batch_size, n
torch.cuda.set_device(f"cuda:{args.cuda_device}")

sys.stdout = Tee(args.output_file, args.verbose)
print_benchmark_header(args.notes)

configurations = []
for train_batch_size in global_batch_size:
Expand Down
24 changes: 24 additions & 0 deletions benchmarks/sizing/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import platform
import sys
import shlex
import time
import torch
import numpy as np
from pathlib import Path
Expand All @@ -10,6 +13,27 @@
from megatron.model.gpt2_model import gpt2_attention_mask_func as attention_mask_func
from megatron.model.word_embeddings import Embedding

def print_benchmark_header(notes="None"):
    """Print a reproducibility header for a benchmark run.

    Emits (to the current sys.stdout, which callers redirect into the
    benchmark's output file via Tee) a banner containing:
    the start timestamp, the exact shell-quoted command line, the platform
    description, key library versions (torch / cuda / nccl), any
    caller-supplied free-form notes, and a separator rule.

    Args:
        notes: benchmark-specific notes to include in the header;
            defaults to the literal string "None" so the section is
            never blank when a caller passes nothing.
    """
    started_at = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    # shlex.quote each argv element so the printed command is copy-pasteable
    command_line = " ".join(map(shlex.quote, sys.argv))
    platform_desc = " ".join(platform.uname())
    versions = (
        f"torch={torch.__version__}, "
        f"cuda={torch.version.cuda}, "
        f"nccl={torch.cuda.nccl.version()}"
    )
    header_lines = [
        "",  # leading blank line, as in the original banner
        f"Benchmark started on {started_at}",
        "** Command line:",
        command_line,
        "** Platform:",
        platform_desc,
        "** Critical component versions:",
        versions,
        "** Additional notes:",
        f"{notes}",
        "-" * 80,
        "",  # trailing newline before print's own newline
    ]
    print("\n".join(header_lines))

class Tee(object):
def __init__(self, filename, verbose):
Path(filename).resolve().parent.mkdir(parents=True, exist_ok=True)
Expand Down

0 comments on commit 939fa3c

Please sign in to comment.