G/benchmark #176
Changes from 4 commits
Diff of the first changed file, a new file that adds the PerfCounter utility:
@@ -0,0 +1,177 @@
import time

from hud.rl.logger import console
import torch
from torch import nn
from transformers import PretrainedConfig
from hud.rl.utils import get_world_size


class PerfCounter:
    """
    A class to count throughput (tokens/s) with a rolling window to obtain
    precise throughput and MFU estimates.
    Inspired from https://github.com/pytorch/torchtitan/blob/4b3f2e41a084bf79a8540068ed525539d1244edd/torchtitan/utils.py#L119
    """

    def __init__(self, model: nn.Module, seq_len: int, window_size: int):
        self.window_size = window_size
        self.tokens = []
        self.times = []
        self.model = model

        if torch.cuda.is_available():
            self.gpu_peak_flops = self._get_peak_flops(torch.cuda.get_device_name(torch.device("cuda")))
        else:
            self.gpu_peak_flops = 0
        # If not tie_word_embeddings, we exclude the embedding parameters from the total number of parameters
        # If tie_word_embeddings, the embedding parameters are already excluded (shared with the LM head)
        self.num_params = self._get_num_params(model, exclude_embedding=not model.config.tie_word_embeddings)
        self.num_flop_per_token = self._get_num_flop_per_token(self.num_params, model.config, seq_len=seq_len)

    def count_tokens(self, tokens: int):
        self.tokens.append(tokens)
        self.times.append(time.perf_counter())
        if len(self.tokens) > self.window_size:
            self.tokens.pop(0)
            self.times.pop(0)

    def get_tokens_per_second(self) -> float | None:
        if len(self.tokens) < 2:
            return None
        return sum(self.tokens[1:]) / (self.times[-1] - self.times[0])

    def get_mfu(self) -> float | None:
        tokens_per_second = self.get_tokens_per_second()
        if tokens_per_second is None:
            return None
        return 100 * self.num_flop_per_token * tokens_per_second / self.gpu_peak_flops / get_world_size()
Review comment on get_mfu, titled "Bug: Double Normalization in Distributed MFU Calculation"; the full comment appears at the end of this diff.

    def _get_peak_flops(self, device_name: str) -> float:
        """
        Peak BF16 FLOPs (without sparsity)
        From: https://github.com/pytorch/torchtitan/blob/05e47c38d99fdb1dd39aeba76f080e529a425c5c/torchtitan/tools/utils.py#L69
        """
        if "A100" in device_name:
            # https://www.nvidia.com/en-us/data-center/a100/
            return 312e12
        if "H100" in device_name or "H200" in device_name:
            # https://www.nvidia.com/en-us/data-center/h100/
            # https://resources.nvidia.com/en-us-data-center-overview-mc/en-us-data-center-overview/hpc-datasheet-sc23-h200
            if "NVL" in device_name:
                return 835e12
            elif "PCIe" in device_name:
                return 756e12
            else:  # For H100 SXM and other variants
                return 989e12
        if "B200" in device_name:
            # https://nvdam.widen.net/s/wwnsxrhm2w/blackwell-datasheet-3384703
            return 2.25e15  # This is half of the FLOPS reported in torchtitan
        else:
            console.warning_log(f"Peak FLOPS undefined for `{device_name}`. Falling back to A100 (312 TFLOPS)")
            return 312e12

    @staticmethod
    def get_active_mm_params(config: PretrainedConfig) -> float:
        """Get number of active parameters per token involved in matmuls"""
        vocab_size = config.vocab_size
        hidden_size = config.hidden_size
        intermediate_size = config.intermediate_size
        num_attention_heads = config.num_attention_heads
        head_dim = hidden_size // num_attention_heads
        num_hidden_layers = config.num_hidden_layers

        ## Attention
        if hasattr(config, "q_lora_rank") and hasattr(config, "kv_lora_rank"):
            # MLA
            q_params = num_hidden_layers * (
                hidden_size * config.q_lora_rank + config.q_lora_rank * num_attention_heads * config.qk_head_dim
            )
            kv_params = num_hidden_layers * (
                hidden_size * (config.kv_lora_rank + config.qk_rope_head_dim)
                + config.kv_lora_rank * num_attention_heads * (config.qk_nope_head_dim + config.v_head_dim)
            )
            o_params = num_hidden_layers * (num_attention_heads * config.v_head_dim * hidden_size)
        else:
            # GQA
            num_key_value_heads = config.num_key_value_heads
            q_params = num_hidden_layers * hidden_size * num_attention_heads * head_dim
            kv_params = 2 * num_hidden_layers * hidden_size * num_key_value_heads * head_dim
            o_params = num_hidden_layers * hidden_size * num_attention_heads * head_dim

        ## MLP
        if hasattr(config, "first_k_dense_replace"):
            num_dense_layers = config.first_k_dense_replace
            num_sparse_layers = config.num_hidden_layers - num_dense_layers
        elif hasattr(config, "num_experts_per_tok"):
            num_dense_layers = 0
            num_sparse_layers = config.num_hidden_layers
        else:
            num_dense_layers = config.num_hidden_layers
            num_sparse_layers = 0

        dense_mlp_params = num_dense_layers * 3 * intermediate_size * hidden_size
        sparse_mlp_params = 0
        if hasattr(config, "num_shared_experts"):  # Shared experts
            sparse_mlp_params += (
                num_sparse_layers * config.num_shared_experts * 3 * config.moe_intermediate_size * hidden_size
            )
        if hasattr(config, "num_experts_per_tok"):  # Routed experts
            sparse_mlp_params += (
                num_sparse_layers * config.num_experts_per_tok * 3 * config.moe_intermediate_size * hidden_size
            )
        if hasattr(config, "n_routed_experts"):  # DeepSeek Router
            sparse_mlp_params += num_sparse_layers * config.n_routed_experts * hidden_size
        elif hasattr(config, "num_experts"):  # Qwen Router
            sparse_mlp_params += num_sparse_layers * config.num_experts * hidden_size
        else:
            sparse_mlp_params = 0

        ## LM Head
        lm_head_params = vocab_size * hidden_size
        ## Total
        return q_params + kv_params + o_params + dense_mlp_params + sparse_mlp_params + lm_head_params

    def _get_num_flop_per_token(self, num_params: int, model_config: PretrainedConfig, seq_len: int) -> int:
        l, h, q, t = (
            model_config.num_hidden_layers,
            model_config.num_attention_heads,
            model_config.hidden_size // model_config.num_attention_heads,
            seq_len,
        )
        # Reasoning behind the factor of 12 for the self-attention part of the formula:
        # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6)
        # 2. the flash attention does 1 more matmul recomputation in the backward
        #    but recomputation should not be counted in calculating MFU (+0)
        # 3. each matmul performs 1 multiplication and 1 addition (*2)
        # 4. we follow the convention and do not account for sparsity in causal attention
        try:
            flop_per_token = 6 * self.get_active_mm_params(model_config) + 12 * l * h * q * t
        except Exception as e:
            console.warning_log(f"Error calculating flop_per_token using get_active_mm_params: {e}")
            flop_per_token = 6 * num_params + 12 * l * h * q * t

        return flop_per_token

    def _get_num_params(self, model: nn.Module, exclude_embedding: bool = False) -> int:
        num_params = sum(p.numel() for p in model.parameters())
        if exclude_embedding:
            if hasattr(model.lm_head, "weight"):
                num_params -= model.lm_head.weight.numel()
            elif hasattr(model.lm_head, "base_layer"):  # LoRALinear
                num_params -= model.lm_head.base_layer.weight.numel()
        return num_params


_PERF_COUNTER: PerfCounter | None = None


def get_perf_counter(model: nn.Module, seq_len: int, window_size: int = 10) -> PerfCounter:
    global _PERF_COUNTER
    if _PERF_COUNTER is None:
        _PERF_COUNTER = PerfCounter(model, seq_len, window_size)

    return _PERF_COUNTER
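For orientation, here is a small usage sketch of the counter added above; it is not part of this PR. The tiny Qwen2 config, the sleep standing in for a training step, and the token counts are all made up for illustration. Only get_perf_counter, count_tokens, get_tokens_per_second, and get_mfu come from the new file itself, and the hud.rl.perf module path is inferred from the import in the training-script diff below.

```python
# Illustrative only: exercise PerfCounter on a tiny randomly initialized model.
import time

import torch
from transformers import AutoConfig, AutoModelForCausalLM

from hud.rl.perf import get_perf_counter  # module path inferred from the train.py diff below

config = AutoConfig.for_model(
    "qwen2",
    hidden_size=64,
    intermediate_size=128,
    num_hidden_layers=2,
    num_attention_heads=4,
    num_key_value_heads=2,
    vocab_size=1024,
    tie_word_embeddings=False,
)
model = AutoModelForCausalLM.from_config(config)

counter = get_perf_counter(model, seq_len=512, window_size=10)
counter.count_tokens(0)  # the first entry only anchors the start of the time window

for _ in range(3):
    time.sleep(0.1)  # stand-in for a training step
    counter.count_tokens(2048)  # made-up number of tokens processed this step

# tokens after the first entry divided by elapsed window time: roughly 3 * 2048 / 0.3 s
print(counter.get_tokens_per_second())

# MFU needs a known GPU peak-FLOPs figure, so only query it when CUDA is present;
# this also assumes get_world_size() works outside torch.distributed (e.g. returns 1).
if torch.cuda.is_available():
    print(counter.get_mfu())
```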
Diff of the second changed file, the training script:
@@ -26,6 +26,8 @@
from hud.rl.checkpoint import CheckpointManager
from hud.rl.utils import save_step_metrics
from hud.rl.types import TrainingSample
from hud.rl.perf import PerfCounter
from rich.table import Table


def get_batch(step: int, root: str) -> list[TrainingSample]:

@@ -54,6 +56,12 @@ def train(
console.section_title("Initializing trainer")

if training_config.benchmark:
    if is_main_process():
        console.warning_log("Running in benchmark mode, overriding max_steps to 5")
    max_steps = min(max_steps, 5)

parallel_dims = ParallelDims(
    dp_replicate=training_config.dp_replicate,
    dp_shard=training_config.dp_shard,

@@ -67,6 +75,8 @@ def train(
model = build_model(training_config, parallel_dims)

benchmark_data = []

ref_model: torch.nn.Module | None = None
if training_config.loss.kl_beta > 0:
    console.info_log("Initializing reference model for KL regularization")

@@ -82,6 +92,8 @@ def train(
collector = MetricsCollector(distributed=(world_size > 1))

perf_counter: PerfCounter | None = None

for step in range(max_steps):
    collector.reset()
    # Save checkpoint from previous step (skip first step since no training yet)

@@ -107,7 +119,9 @@ def train(
del logits
progress.update(f"Computing reference log probabilities... {i + 1}/{len(batch)}")

if perf_counter is None:
    perf_counter = PerfCounter(model, batch[0].inputs["input_ids"].shape[1], 10)
    perf_counter.count_tokens(0)

with console.progress("Computing old log probabilities...") as progress, torch.no_grad():
    for i, minibatch in enumerate(batch):

@@ -193,18 +207,70 @@ def train(
step_duration = time.time() - training_start_time
console.info_log(f"Step {step} training took {step_duration:.2f} seconds")

# Collect performance data
num_tokens = sum(minibatch.inputs["input_ids"].shape[1] for minibatch in batch)
console.warning_log(f"num_tokens: {num_tokens}")
perf_counter.count_tokens(num_tokens)  # Add to rolling window
throughput = perf_counter.get_tokens_per_second() or 0
mfu = perf_counter.get_mfu() or 0
peak_memory = torch.cuda.max_memory_allocated() / 1024**3

dist_perf_output_list = [{}] * world_size
torch.distributed.all_gather_object(dist_perf_output_list, {
    "step_duration": step_duration,
    "throughput": throughput,
    "mfu": mfu,
    "peak_memory": peak_memory,
})

benchmark_data.append({
    "step": step,
    # max step duration across ranks
    "step_duration": max([x["step_duration"] for x in dist_perf_output_list]),
    # sum throughput across ranks
    "throughput": sum([x["throughput"] for x in dist_perf_output_list]),
    # average mfu across ranks
    "mfu": sum([x["mfu"] for x in dist_perf_output_list]) / world_size,
    # max peak memory across ranks
    "peak_memory": max([x["peak_memory"] for x in dist_perf_output_list]),
})

stats = collector.get_stats()
if is_main_process():
    save_step_metrics(training_config.output_dir, step, stats)

torch.cuda.empty_cache()

# Save final checkpoint after last training step
if max_steps > 0:
    console.info(f"Saving final checkpoint for step {max_steps - 1}...")
    checkpoint_manager.save(model, max_steps - 1)

if training_config.benchmark:
    # Create benchmark table
    table = Table(title="Training Performance Metrics")
    table.add_column("Step", justify="right", style="cyan")
    table.add_column("Duration (s)", justify="right", style="yellow")
    table.add_column("Throughput (tok/s)", justify="right", style="green")
    table.add_column("MFU (%)", justify="right", style="green")
    table.add_column("Peak Memory (GB)", justify="right", style="red")

    # Add rows
    for data in benchmark_data:
        table.add_row(
            str(data["step"]),
            f"{data['step_duration']:.2f}",
            f"{data['throughput']:.0f}",
            f"{data['mfu']:.2f}",
            f"{data['peak_memory']:.2f}",
        )

    if is_main_process():
        console.section_title("Benchmark Results")
        console.print(table)


def main() -> None:
    """Main entry point for training script."""
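To make the reduction in the benchmark_data.append block above concrete, here is a standalone illustration with invented numbers for a two-rank run; nothing in it is part of the PR.

```python
# Invented per-rank results, in the same shape as the dicts gathered via all_gather_object.
dist_perf_output_list = [
    {"step_duration": 12.4, "throughput": 5100.0, "mfu": 30.0, "peak_memory": 61.2},
    {"step_duration": 12.9, "throughput": 4900.0, "mfu": 28.0, "peak_memory": 60.8},
]
world_size = len(dist_perf_output_list)

row = {
    "step_duration": max(x["step_duration"] for x in dist_perf_output_list),  # slowest rank gates the step
    "throughput": sum(x["throughput"] for x in dist_perf_output_list),  # aggregate tokens/s across devices
    "mfu": sum(x["mfu"] for x in dist_perf_output_list) / world_size,  # mean of the per-rank MFU values
    "peak_memory": max(x["peak_memory"] for x in dist_perf_output_list),  # worst-case GB on any device
}
print(row)  # {'step_duration': 12.9, 'throughput': 10000.0, 'mfu': 29.0, 'peak_memory': 61.2}
```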
Review comment (Bug: Double Normalization in Distributed MFU Calculation):
Each rank already computes MFU in PerfCounter.get_mfu as 100 * flop_per_token * tokens_per_second / gpu_peak_flops / get_world_size(). When the results are gathered later they are averaged across ranks, so the world-size factor is applied twice. With two ranks, a device running at 60% MFU will be reported as only 30%, making the new benchmark output misleading. MFU should be computed per device (no division by world size) and then averaged or aggregated once.
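A minimal sketch of one way to follow that suggestion, assuming the aggregation in the training loop stays as written; the subclass name is hypothetical and this is not the PR's actual fix.

```python
from hud.rl.perf import PerfCounter  # module path inferred from the train.py import above


class PerDeviceMFUPerfCounter(PerfCounter):
    """Reports MFU for this device only; cross-rank normalization happens once, at aggregation."""

    def get_mfu(self) -> float | None:
        tokens_per_second = self.get_tokens_per_second()
        if tokens_per_second is None:
            return None
        # No division by get_world_size() here: the benchmark loop already averages
        # the gathered per-rank values, so that average is the single normalization.
        return 100 * self.num_flop_per_token * tokens_per_second / self.gpu_peak_flops
```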