diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 9b3f73a568c2..b54ca69737df 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -52,7 +52,7 @@ jobs:
             commit_id=$GITHUB_SHA
           fi
           commit_msg=$(git show -s --format=%s | cut -c1-70)
-          python3 benchmark_v2/run_benchmarks.py -b 32 -s 128 -n 256 --cross-generate --branch-name "$BRANCH_NAME" --commit-id "$commit_id" --commit-message "$commit_msg" --model-id "$MODEL_ID" --log-level INFO --push-result-to-dataset "$DATASET_ID"
+          python3 benchmark_v2/run_benchmarks.py -b 32 -s 128 -n 256 --level 2 --branch-name "$BRANCH_NAME" --commit-id "$commit_id" --commit-message "$commit_msg" --model-id "$MODEL_ID" --log-level INFO --push-result-to-dataset "$DATASET_ID"
         env:
           HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
           PUSH_TO_HUB_TOKEN: ${{ secrets.PUSH_TO_HUB_TOKEN }}
diff --git a/benchmark_v2/framework/benchmark_config.py b/benchmark_v2/framework/benchmark_config.py
index 7e66837c2465..52e6f89956ed 100644
--- a/benchmark_v2/framework/benchmark_config.py
+++ b/benchmark_v2/framework/benchmark_config.py
@@ -1,4 +1,5 @@
 import hashlib
+import itertools
 import json
 import logging
 from typing import Any
@@ -146,60 +147,68 @@ def from_dict(cls, data: dict[str, Any], skip_validity_check: bool = False) -> "
         )


-def cross_generate_configs(
-    attn_impl_and_sdpa_backend: list[tuple[str, str | None]],
-    compiled_mode: list[str | None],
-    kernelized: list[bool],
-    warmup_iterations: int = 5,
-    measurement_iterations: int = 20,
-    batch_size: int = 1,
-    sequence_length: int = 128,
-    num_tokens_to_generate: int = 128,
-    gpu_monitoring: bool = True,
+def adapt_configs(
+    configs: list[BenchmarkConfig],
+    warmup_iterations: int | list[int] = 5,
+    measurement_iterations: int | list[int] = 20,
+    batch_size: int | list[int] = 1,
+    sequence_length: int | list[int] = 128,
+    num_tokens_to_generate: int | list[int] = 128,
+    gpu_monitoring: bool | list[bool] = True,
 ) -> list[BenchmarkConfig]:
-    # Create kwargs common to all configs
-    kwargs = {
-        "warmup_iterations": warmup_iterations,
-        "measurement_iterations": measurement_iterations,
-        "batch_size": batch_size,
-        "sequence_length": sequence_length,
-        "num_tokens_to_generate": num_tokens_to_generate,
-        "gpu_monitoring": gpu_monitoring,
-    }
-    # Cross-generate all combinations of attn_implementation, compiled_mode, and kernelized
+    parameters = (
+        x if isinstance(x, list) else [x]
+        for x in [
+            warmup_iterations,
+            measurement_iterations,
+            batch_size,
+            sequence_length,
+            num_tokens_to_generate,
+            gpu_monitoring,
+        ]
+    )
+    iterator = itertools.product(*parameters)
+
+    adapted_configs = []
+    for warmup_iters, measurement_iters, bs, seqlen, ntok, monitor in iterator:
+        for config in configs:
+            config = config.to_dict()
+            config["warmup_iterations"] = warmup_iters
+            config["measurement_iterations"] = measurement_iters
+            config["batch_size"] = bs
+            config["sequence_length"] = seqlen
+            config["num_tokens_to_generate"] = ntok
+            config["gpu_monitoring"] = monitor
+            adapted_configs.append(BenchmarkConfig.from_dict(config))
+    return adapted_configs
+
+
+def get_config_by_level(level: int) -> list[BenchmarkConfig]:
     configs = []
-    for attn_implementation, sdpa_backend in list(dict.fromkeys(attn_impl_and_sdpa_backend)):
-        for cm in list(dict.fromkeys(compiled_mode)):
-            for kernelize_on in list(dict.fromkeys(kernelized)):
-                config = BenchmarkConfig(
-                    attn_implementation=attn_implementation,
-                    sdpa_backend=sdpa_backend,
-                    compile_mode=cm,
-                    kernelize=kernelize_on,
-                    **kwargs,
-                )
-                configs.append(config)
+    # Early return if level is 3 or above: we generate all combinations of configs, maybe even w/ all compile modes
+    if level >= 3:
+        for attn_implementation, sdpa_backend in BenchmarkConfig.all_attn_implementations:
+            # Usually there is not much to gain by compiling with other modes, but we allow it for level 4
+            compile_modes = BenchmarkConfig.all_compiled_modes if level >= 4 else [None, "default"]
+            for cm in compile_modes:
+                for kernelize_on in [False, KERNELIZATION_AVAILABLE]:
+                    configs.append(
+                        BenchmarkConfig(
+                            attn_implementation=attn_implementation,
+                            sdpa_backend=sdpa_backend,
+                            compile_mode=cm,
+                            kernelize=kernelize_on,
+                        )
+                    )
+        return configs
+    # Otherwise, we add the configs for the given level
+    if level >= 0:
+        configs.append(BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default"))
+    if level >= 1:
+        configs.append(BenchmarkConfig(attn_implementation="flash_attention_2"))
+        configs.append(BenchmarkConfig(attn_implementation="eager", compile_mode="default"))
+    if level >= 2:
+        configs.append(BenchmarkConfig(attn_implementation="sdpa", compile_mode="default"))
+        configs.append(BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", kernelize=True))
+        configs.append(BenchmarkConfig(attn_implementation="flash_attention_2", kernelize=True))
     return configs
-
-
-def generate_main_configs(
-    warmup_iterations: int = 5,
-    measurement_iterations: int = 20,
-    batch_size: int = 1,
-    sequence_length: int = 128,
-    num_tokens_to_generate: int = 128,
-) -> list[BenchmarkConfig]:
-    # Create kwargs common to all configs
-    kwargs = {
-        "warmup_iterations": warmup_iterations,
-        "measurement_iterations": measurement_iterations,
-        "batch_size": batch_size,
-        "sequence_length": sequence_length,
-        "num_tokens_to_generate": num_tokens_to_generate,
-    }
-    return [  # TODO: test max-autotune instead of default
-        BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", gpu_monitoring=False, **kwargs),
-        BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", gpu_monitoring=True, **kwargs),
-        BenchmarkConfig(attn_implementation="eager", compile_mode="default", gpu_monitoring=True, **kwargs),
-        BenchmarkConfig(attn_implementation="flash_attention_2", gpu_monitoring=True, **kwargs),
-    ]
diff --git a/benchmark_v2/run_benchmarks.py b/benchmark_v2/run_benchmarks.py
index 3b01af6017c4..93a6628085cf 100755
--- a/benchmark_v2/run_benchmarks.py
+++ b/benchmark_v2/run_benchmarks.py
@@ -23,12 +23,7 @@
 import sys
 import uuid

-from framework.benchmark_config import (
-    KERNELIZATION_AVAILABLE,
-    BenchmarkConfig,
-    cross_generate_configs,
-    generate_main_configs,
-)
+from framework.benchmark_config import adapt_configs, get_config_by_level
 from framework.benchmark_runner import BenchmarkRunner


@@ -45,7 +40,14 @@
     parser.add_argument("--sequence-length", "-s", type=int, nargs="+", help="Sequence length")
     parser.add_argument("--num-tokens-to-generate", "-n", type=int, nargs="+", help="Number of tokens to generate")

-    parser.add_argument("--cross-generate", action="store_true", help="Cross-generate all combinations of configs")
+    parser.add_argument(
+        "--level",
+        type=int,
+        default=1,
+        help="Level of coverage for the benchmark. 0: only the main config, 1: a few important configs, 2: a config for"
+        " each attn implementation and option, 3: cross-generate all combinations of configs, 4: cross-generate all"
+        " combinations of configs w/ all compile modes",
+    )
     parser.add_argument("--num-tokens-to-profile", "-p", type=int, default=0, help="Number of tokens to profile")

     parser.add_argument("--branch-name", type=str, help="Git branch name")
@@ -84,67 +86,24 @@
             "At least one of the arguments --batch-size, --sequence-length, or --num-tokens-to-generate is required"
         )

-    # If there is only one (batch_size, sequence_length, num_tokens_to_generate), we benchmark across configs
-    elif len(args.batch_size) * len(args.sequence_length) * len(args.num_tokens_to_generate) == 1:
-        if args.cross_generate:
-            benchmark_configs = cross_generate_configs(
-                attn_impl_and_sdpa_backend=BenchmarkConfig.all_attn_implementations,
-                compiled_mode=[None, "default"],  # usually there is not much to gain by compiling with other modes
-                kernelized=[False, KERNELIZATION_AVAILABLE],
-                warmup_iterations=args.warmup,
-                measurement_iterations=args.iterations,
-                batch_size=args.batch_size[0],
-                sequence_length=args.sequence_length[0],
-                num_tokens_to_generate=args.num_tokens_to_generate[0],
-                gpu_monitoring=not args.no_gpu_monitoring,
-            )
-        else:
-            benchmark_configs = generate_main_configs(
-                warmup_iterations=args.warmup,
-                measurement_iterations=args.iterations,
-                batch_size=args.batch_size[0],
-                sequence_length=args.sequence_length[0],
-                num_tokens_to_generate=args.num_tokens_to_generate[0],
-            )
-
-    # Otherwise, we benchmark across all combinations of dimensions
-    else:
-        main_config = generate_main_configs(
-            warmup_iterations=args.warmup,
-            measurement_iterations=args.iterations,
-            batch_size=args.batch_size[0],
-            sequence_length=args.sequence_length[0],
-            num_tokens_to_generate=args.num_tokens_to_generate[0],
-        )[0]
-        benchmark_configs = []
-        for num_tokens_to_generate in args.num_tokens_to_generate:
-            for sequence_length in args.sequence_length:
-                for batch_size in args.batch_size:
-                    cfg_dict = main_config.to_dict()
-                    cfg_dict["batch_size"] = batch_size
-                    cfg_dict["sequence_length"] = sequence_length
-                    cfg_dict["num_tokens_to_generate"] = num_tokens_to_generate
-                    cfg_dict.pop("name")
-                    benchmark_configs.append(BenchmarkConfig.from_dict(cfg_dict))
-
-    runner = BenchmarkRunner(
-        logger,
-        args.output_dir,
-        args.branch_name,
-        args.commit_id,
-        args.commit_message,
+    # Get the configs for the given coverage level
+    configs = get_config_by_level(args.level)
+    # Adapt the configs to the given arguments
+    configs = adapt_configs(
+        configs,
+        args.warmup,
+        args.iterations,
+        args.batch_size,
+        args.sequence_length,
+        args.num_tokens_to_generate,
+        not args.no_gpu_monitoring,
     )
+
+    runner = BenchmarkRunner(logger, args.output_dir, args.branch_name, args.commit_id, args.commit_message)
     timestamp, results = runner.run_benchmarks(
-        args.model_id,
-        benchmark_configs,
-        args.num_tokens_to_profile,
-        pretty_print_summary=True,
+        args.model_id, configs, args.num_tokens_to_profile, pretty_print_summary=True
     )

     dataset_id = args.push_result_to_dataset
     if dataset_id is not None and len(results) > 0:
-        runner.push_results_to_hub(
-            dataset_id,
-            results,
-            timestamp,
-        )
+        runner.push_results_to_hub(dataset_id, results, timestamp)
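
For context, a minimal sketch of how the two new helpers compose, mirroring the updated run_benchmarks.py. It assumes the snippet runs from the benchmark_v2 directory so the framework package is importable; the batch sizes and sequence lengths are illustrative, not values used anywhere above.

from framework.benchmark_config import adapt_configs, get_config_by_level

# --level 2 selects the six single-option configs added above: flex_attention,
# flash_attention_2 and eager (levels 0-1), plus sdpa and the two kernelized variants (level 2).
configs = get_config_by_level(2)

# adapt_configs accepts scalars or lists; list-valued dimensions are expanded with
# itertools.product, so two batch sizes x two sequence lengths give four variants per base config.
configs = adapt_configs(
    configs,
    warmup_iterations=5,
    measurement_iterations=20,
    batch_size=[1, 8],            # illustrative values
    sequence_length=[128, 1024],  # illustrative values
    num_tokens_to_generate=128,
    gpu_monitoring=False,
)
print(len(configs))  # 6 base configs x 4 shape combinations = 24

Splitting config selection (get_config_by_level) from the shape/iteration fan-out (adapt_configs) replaces the old --cross-generate branching with a single itertools.product over whichever dimensions are passed as lists.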