Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

self-document the benchmark's setup #29

Merged
merged 3 commits into from
Feb 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions benchmarks/sizing/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Example for mm_flops.py: python mm_flops.py -m 1024 -k 1024 -n 1024 2048
Example for mm_flops.py with range option: python mm_flops.py -m 1024 -k 1024 --n_range 1024 2048 256
usage: mm_flops.py [-h] (-m M [M ...] | --m_range M_RANGE [M_RANGE ...]) (-n [N ...] | --n_range N_RANGE [N_RANGE ...]) (-k [K ...] | --k_range K_RANGE [K_RANGE ...]) [--num_iterations NUM_ITERATIONS]
[--num_warmup_iterations NUM_WARMUP_ITERATIONS] [--cuda_device CUDA_DEVICE] [--output_file OUTPUT_FILE]
[--notes NOTES] [--verbose | --no-verbose]

options:
-h, --help show this help message and exit
Expand All @@ -40,6 +41,7 @@ options:
--cuda_device CUDA_DEVICE
The cuda device to run the benchmark on
--output_file OUTPUT_FILE
--notes NOTES benchmark-specific notes to add to the output_file's header
--verbose, --no-verbose
log to stdout besides output_file? (default: True)
```
Expand All @@ -50,6 +52,7 @@ options:
Example for bmm_flops.py: python bmm_flops.py -m 1024 -k 1024 -n 1024 2048 -b 128
usage: bmm_flops.py [-h] (-b B [B ...] | --b_range B_RANGE [B_RANGE ...]) (-m M [M ...] | --m_range M_RANGE [M_RANGE ...]) (-n [N ...] | --n_range N_RANGE [N_RANGE ...]) (-k [K ...] | --k_range K_RANGE [K_RANGE ...])
[--num_iterations NUM_ITERATIONS] [--num_warmup_iterations NUM_WARMUP_ITERATIONS] [--cuda_device CUDA_DEVICE] [--output_file OUTPUT_FILE]
[--notes NOTES] [--verbose | --no-verbose]

options:
-h, --help show this help message and exit
Expand All @@ -72,6 +75,7 @@ options:
--cuda_device CUDA_DEVICE
The cuda device to run the benchmark on
--output_file OUTPUT_FILE
--notes NOTES benchmark-specific notes to add to the output_file's header
--verbose, --no-verbose
log to stdout besides output_file? (default: True)
```
Expand All @@ -92,6 +96,7 @@ usage: transformer_flops.py [-h]
(--tensor_mp_size TENSOR_MP_SIZE [TENSOR_MP_SIZE ...] | --tensor_mp_size_range TENSOR_MP_SIZE_RANGE [TENSOR_MP_SIZE_RANGE ...])
[--blocks BLOCKS [BLOCKS ...]] [--use_flash] [--num_iterations NUM_ITERATIONS]
[--num_warmup_iterations NUM_WARMUP_ITERATIONS] [--cuda_device CUDA_DEVICE] [--output_file OUTPUT_FILE]
[--notes NOTES] [--verbose | --no-verbose]

options:
-h, --help show this help message and exit
Expand Down Expand Up @@ -135,6 +140,7 @@ options:
--cuda_device CUDA_DEVICE
The cuda device to run the benchmark on
--output_file OUTPUT_FILE
--notes NOTES benchmark-specific notes to add to the output_file's header
--verbose, --no-verbose
log to stdout besides output_file? (default: True)
```
Expand Down
4 changes: 3 additions & 1 deletion benchmarks/sizing/bmm_flops.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import argparse
import os

from utils import Tee, benchmark_bmm
from utils import Tee, benchmark_bmm, print_benchmark_header

file_dir = os.path.abspath(os.path.dirname(__file__))

Expand All @@ -30,6 +30,7 @@
parser.add_argument("--num_iterations", type=int, default=200, help='The number of iterations used to benchmark each BMM')
parser.add_argument("--num_warmup_iterations", type=int, default=50, help='The number of warmup iterations')
parser.add_argument("--cuda_device", type=int, default=0, help="The cuda device to run the benchmark on")
parser.add_argument("--notes", type=str, default="", help="benchmark-specific notes to add to the output_file's header")
parser.add_argument("--output_file", type=str, default=f"{file_dir}/results/bmm.out")
parser.add_argument("--verbose", default=True, action=argparse.BooleanOptionalAction, help='log to stdout besides output_file?')
args = parser.parse_args()
Expand All @@ -56,6 +57,7 @@
torch.cuda.set_device(f"cuda:{args.cuda_device}")

sys.stdout = Tee(args.output_file, args.verbose)
print_benchmark_header(args.notes)

# loop through all sizes to benchmark
for B in b:
Expand Down
4 changes: 3 additions & 1 deletion benchmarks/sizing/mm_flops.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import argparse
import os

from utils import Tee, benchmark_mm
from utils import Tee, benchmark_mm, print_benchmark_header

file_dir = os.path.abspath(os.path.dirname(__file__))

Expand All @@ -27,6 +27,7 @@
parser.add_argument("--num_warmup_iterations", type=int, default=50, help='The number of warmup iterations')
parser.add_argument("--cuda_device", type=int, default=0, help="The cuda device to run the benchmark on")
parser.add_argument("--output_file", type=str, default=f"{file_dir}/results/mm.out")
parser.add_argument("--notes", type=str, default="", help="benchmark-specific notes to add to the output_file's header")
parser.add_argument("--verbose", default=True, action=argparse.BooleanOptionalAction, help='log to stdout besides output_file?')
args = parser.parse_args()

Expand All @@ -48,6 +49,7 @@
torch.cuda.set_device(f"cuda:{args.cuda_device}")

sys.stdout = Tee(args.output_file, args.verbose)
print_benchmark_header(args.notes)

# loop through all sizes to benchmark
for M in m:
Expand Down
2 changes: 2 additions & 0 deletions benchmarks/sizing/transformer_flops.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ def benchmark_transformer(c_args,configuration, seq_length, global_batch_size, n
parser.add_argument("--num_iterations", type=int, default=200, help='The number of iterations used to benchmark each BMM')
parser.add_argument("--num_warmup_iterations", type=int, default=50, help='The number of warmup iterations')
parser.add_argument("--cuda_device", type=int, default=0, help="The cuda device to run the benchmark on")
parser.add_argument("--notes", type=str, default="", help="benchmark-specific notes to add to the output_file's header")
parser.add_argument("--output_file", type=str, default=f"{file_dir}/results/mm.out")
parser.add_argument("--verbose", default=True, action=argparse.BooleanOptionalAction, help='log to stdout besides output_file?')
args = parser.parse_args()
Expand Down Expand Up @@ -272,6 +273,7 @@ def benchmark_transformer(c_args,configuration, seq_length, global_batch_size, n
torch.cuda.set_device(f"cuda:{args.cuda_device}")

sys.stdout = Tee(args.output_file, args.verbose)
print_benchmark_header(args.notes)

configurations = []
for train_batch_size in global_batch_size:
Expand Down
24 changes: 24 additions & 0 deletions benchmarks/sizing/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import platform
import sys
import shlex
import time
import torch
import numpy as np
from pathlib import Path
Expand All @@ -10,6 +13,27 @@
from megatron.model.gpt2_model import gpt2_attention_mask_func as attention_mask_func
from megatron.model.word_embeddings import Embedding

def print_benchmark_header(notes="None"):
    """Print a self-describing header for a benchmark run.

    The header records the local start time, the exact command line
    (interpreter path plus shell-quoted ``sys.argv``), the fields of
    ``platform.uname()``, the torch / CUDA / NCCL versions, and the
    caller-supplied notes, followed by an 80-character dashed separator.
    Everything is written with ``print``, so it goes wherever
    ``sys.stdout`` currently points.

    Args:
        notes: Free-form text placed under "Additional notes". Defaults
            to the literal string ``"None"``.
    """
    # Probing NCCL must never abort the benchmark: PyTorch builds without
    # NCCL bindings (e.g. CPU-only builds) raise when the version is
    # queried, so fall back to a placeholder in that case.
    try:
        nccl_version = torch.cuda.nccl.version()
    except (AttributeError, RuntimeError):
        nccl_version = "unavailable"

    started_at = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    command_line = " ".join(map(shlex.quote, sys.argv))
    platform_info = " ".join(platform.uname())
    separator = "-" * 80

    # The template is kept flush-left so that the emitted header has no
    # leading indentation.
    print(f"""
Benchmark started on {started_at}

** Command line:
{sys.executable} {command_line}

** Platform:
{platform_info}

** Critical component versions:
torch={torch.__version__}, cuda={torch.version.cuda}, nccl={nccl_version}

** Additional notes:
{notes}

{separator}

""")

class Tee(object):
def __init__(self, filename, verbose):
Path(filename).resolve().parent.mkdir(parents=True, exist_ok=True)
Expand Down