G/benchmark #176
Closed
+326 −15
6 commits
New file, +177 lines (@@ -0,0 +1,177 @@):

```python
import time

from hud.rl.logger import console
import torch
from torch import nn
from transformers import PretrainedConfig
from hud.rl.utils import get_world_size


class PerfCounter:
    """
    A class to count throughput (tokens/s) with a rolling window to obtain
    precise throughput and MFU estimates.

    Inspired from https://github.com/pytorch/torchtitan/blob/4b3f2e41a084bf79a8540068ed525539d1244edd/torchtitan/utils.py#L119
    """

    def __init__(self, model: nn.Module, seq_len: int, window_size: int):
        self.window_size = window_size
        self.tokens = []
        self.times = []
        self.model = model

        if torch.cuda.is_available():
            self.gpu_peak_flops = self._get_peak_flops(torch.cuda.get_device_name(torch.device("cuda")))
        else:
            self.gpu_peak_flops = 0
        # If not tie_word_embeddings, we exclude the embedding parameters from the total number of parameters
        # If tie_word_embeddings, the embedding parameters are already excluded (shared with the LM head)
        self.num_params = self._get_num_params(model, exclude_embedding=not model.config.tie_word_embeddings)
        self.num_flop_per_token = self._get_num_flop_per_token(self.num_params, model.config, seq_len=seq_len)

    def count_tokens(self, tokens: int):
        self.tokens.append(tokens)
        self.times.append(time.perf_counter())
        if len(self.tokens) > self.window_size:
            self.tokens.pop(0)
            self.times.pop(0)

    def get_tokens_per_second(self) -> float | None:
        if len(self.tokens) < 2:
            return None
        return sum(self.tokens[1:]) / (self.times[-1] - self.times[0])

    def get_mfu(self) -> float | None:
        tokens_per_second = self.get_tokens_per_second()
        if tokens_per_second is None:
            return None
        return 100 * self.num_flop_per_token * tokens_per_second / self.gpu_peak_flops / get_world_size()

    def _get_peak_flops(self, device_name: str) -> float:
        """
        Peak BF16 FLOPs (without sparsity)

        From: https://github.com/pytorch/torchtitan/blob/05e47c38d99fdb1dd39aeba76f080e529a425c5c/torchtitan/tools/utils.py#L69
        """
        if "A100" in device_name:
            # https://www.nvidia.com/en-us/data-center/a100/
            return 312e12
        if "H100" in device_name or "H200" in device_name:
            # https://www.nvidia.com/en-us/data-center/h100/
            # https://resources.nvidia.com/en-us-data-center-overview-mc/en-us-data-center-overview/hpc-datasheet-sc23-h200
            if "NVL" in device_name:
                return 835e12
            elif "PCIe" in device_name:
                return 756e12
            else:  # For H100 SXM and other variants
                return 989e12
        if "B200" in device_name:
            # https://nvdam.widen.net/s/wwnsxrhm2w/blackwell-datasheet-3384703
            return 2.25e15  # This is half of the FLOPS reported in torchtitan
        else:
            console.warning_log(f"Peak FLOPS undefined for `{device_name}`. Falling back to A100 (312 TFLOPS)")
            return 312e12

    @staticmethod
    def get_active_mm_params(config: PretrainedConfig) -> float:
        """Get number of active parameters per token involved in matmuls"""
        vocab_size = config.vocab_size
        hidden_size = config.hidden_size
        intermediate_size = config.intermediate_size
        num_attention_heads = config.num_attention_heads
        head_dim = hidden_size // num_attention_heads
        num_hidden_layers = config.num_hidden_layers

        ## Attention
        if hasattr(config, "q_lora_rank") and hasattr(config, "kv_lora_rank"):
            # MLA
            q_params = num_hidden_layers * (
                hidden_size * config.q_lora_rank + config.q_lora_rank * num_attention_heads * config.qk_head_dim
            )
            kv_params = num_hidden_layers * (
                hidden_size * (config.kv_lora_rank + config.qk_rope_head_dim)
                + config.kv_lora_rank * num_attention_heads * (config.qk_nope_head_dim + config.v_head_dim)
            )
            o_params = num_hidden_layers * (num_attention_heads * config.v_head_dim * hidden_size)
        else:
            # GQA
            num_key_value_heads = config.num_key_value_heads
            q_params = num_hidden_layers * hidden_size * num_attention_heads * head_dim
            kv_params = 2 * num_hidden_layers * hidden_size * num_key_value_heads * head_dim
            o_params = num_hidden_layers * hidden_size * num_attention_heads * head_dim

        ## MLP
        if hasattr(config, "first_k_dense_replace"):
            num_dense_layers = config.first_k_dense_replace
            num_sparse_layers = config.num_hidden_layers - num_dense_layers
        elif hasattr(config, "num_experts_per_tok"):
            num_dense_layers = 0
            num_sparse_layers = config.num_hidden_layers
        else:
            num_dense_layers = config.num_hidden_layers
            num_sparse_layers = 0

        dense_mlp_params = num_dense_layers * 3 * intermediate_size * hidden_size
        sparse_mlp_params = 0
        if hasattr(config, "num_shared_experts"):  # Shared experts
            sparse_mlp_params += (
                num_sparse_layers * config.num_shared_experts * 3 * config.moe_intermediate_size * hidden_size
            )
        if hasattr(config, "num_experts_per_tok"):  # Routed experts
            sparse_mlp_params += (
                num_sparse_layers * config.num_experts_per_tok * 3 * config.moe_intermediate_size * hidden_size
            )
        if hasattr(config, "n_routed_experts"):  # DeepSeek Router
            sparse_mlp_params += num_sparse_layers * config.n_routed_experts * hidden_size
        elif hasattr(config, "num_experts"):  # Qwen Router
            sparse_mlp_params += num_sparse_layers * config.num_experts * hidden_size
        else:
            sparse_mlp_params = 0

        ## LM Head
        lm_head_params = vocab_size * hidden_size
        ## Total
        return q_params + kv_params + o_params + dense_mlp_params + sparse_mlp_params + lm_head_params

    def _get_num_flop_per_token(self, num_params: int, model_config: PretrainedConfig, seq_len: int) -> int:
        l, h, q, t = (
            model_config.num_hidden_layers,
            model_config.num_attention_heads,
            model_config.hidden_size // model_config.num_attention_heads,
            seq_len,
        )
        # Reasoning behind the factor of 12 for the self-attention part of the formula:
        # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6)
        # 2. the flash attention does 1 more matmul recomputation in the backward
        #    but recomputation should not be counted in calculating MFU (+0)
        # 3. each matmul performs 1 multiplication and 1 addition (*2)
        # 4. we follow the convention and do not account for sparsity in causal attention
        try:
            flop_per_token = 6 * self.get_active_mm_params(model_config) + 12 * l * h * q * t
        except Exception as e:
            console.warning_log(f"Error calculating flop_per_token using get_active_mm_params: {e}")
            flop_per_token = 6 * num_params + 12 * l * h * q * t

        return flop_per_token

    def _get_num_params(self, model: nn.Module, exclude_embedding: bool = False) -> int:
        num_params = sum(p.numel() for p in model.parameters())
        if exclude_embedding:
            if hasattr(model.lm_head, "weight"):
                num_params -= model.lm_head.weight.numel()
            elif hasattr(model.lm_head, "base_layer"):  # LoRALinear
                num_params -= model.lm_head.base_layer.weight.numel()
        return num_params


_PERF_COUNTER: PerfCounter | None = None


def get_perf_counter(model: nn.Module, seq_len: int, window_size: int = 10) -> PerfCounter:
    global _PERF_COUNTER
    if _PERF_COUNTER is None:
        _PERF_COUNTER = PerfCounter(model, seq_len, window_size)

    return _PERF_COUNTER
```
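For context, here is a minimal sketch of how the counter might be driven from a training loop. The import path, batch shape, and logging cadence are assumptions for illustration; only `get_perf_counter`, `count_tokens`, `get_tokens_per_second`, and `get_mfu` come from the file above.

```python
# Hypothetical usage sketch (not part of this PR).
# Assumes the new file is importable as `hud.rl.benchmark`; adjust to the real path.
from hud.rl.benchmark import get_perf_counter  # assumed import path


def train(model, batches, seq_len: int = 4096):
    counter = get_perf_counter(model, seq_len=seq_len, window_size=10)
    for step, batch in enumerate(batches):
        # ... forward/backward/optimizer step elided ...
        counter.count_tokens(batch["input_ids"].numel())  # tokens processed this step
        if step % 10 == 0:
            tps = counter.get_tokens_per_second()  # None until the window has 2+ samples
            if tps is not None:
                print(f"step {step}: {tps:,.0f} tok/s, MFU {counter.get_mfu():.1f}%")
```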
Review comment (on `get_mfu`) — Bug: Double Normalization in Distributed MFU Calculation

Each rank already computes MFU in `PerfCounter.get_mfu` as `100 * flop_per_token * tokens_per_second / gpu_peak_flops / get_world_size()`. When the results are gathered later they are averaged across ranks, so the world-size factor is applied twice. With two ranks, a device running at 60% MFU will be reported as only 30%, making the new benchmark output misleading. MFU should be computed per device (no division by world size) and then averaged or aggregated once.
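A minimal sketch of the change the reviewer describes, not the PR's code: `get_mfu` reports the local device's MFU with no world-size division, and the averaging across ranks happens exactly once when results are gathered. The `gather_mean_mfu` helper and the use of `torch.distributed` with an initialized NCCL process group are assumptions for illustration.

```python
import torch
import torch.distributed as dist


# Revised PerfCounter.get_mfu: per-device MFU, no division by world size.
def get_mfu(self) -> float | None:
    tokens_per_second = self.get_tokens_per_second()
    if tokens_per_second is None:
        return None
    return 100 * self.num_flop_per_token * tokens_per_second / self.gpu_peak_flops


def gather_mean_mfu(local_mfu: float) -> float:
    """Average per-rank MFU exactly once at gather time (hypothetical helper)."""
    t = torch.tensor(local_mfu, device="cuda")
    dist.all_reduce(t, op=dist.ReduceOp.AVG)  # requires an initialized NCCL process group
    return t.item()
```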