Skip to content
38 changes: 36 additions & 2 deletions src/megatron/bridge/training/utils/flop_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,17 @@

import torch.nn.functional as F

from megatron.bridge.peft.lora import LoRA
from megatron.bridge.training.config import ConfigContainer
from megatron.bridge.utils.vocab_utils import calculate_padded_vocab_size


def num_floating_point_operations(cfg: ConfigContainer, batch_size: int = 1):
"""Return the number of floating point operations"""
# If the model provider has a custom TFLOPS calculation method, use it.
if hasattr(cfg.model, "_get_num_floating_point_operations"):
peft = getattr(cfg, "peft", None)
is_lora = isinstance(peft, LoRA)
# If the model provider has a custom TFLOPS calculation method, use it (non-LoRA only).
if not is_lora and hasattr(cfg.model, "_get_num_floating_point_operations"):
return cfg.model._get_num_floating_point_operations(batch_size)

def calculate_layer_counts():
Expand Down Expand Up @@ -183,6 +186,37 @@ def transformer_flops():
num_query_groups = (
cfg.model.num_attention_heads if cfg.model.num_query_groups is None else cfg.model.num_query_groups
)

if is_lora:
_LORA_SEQ_STATS = {
4096: (842603, 4096),
2048: (488991, 2030),
}
seq_len = cfg.model.seq_length
if seq_len not in _LORA_SEQ_STATS:
raise ValueError(f"No LoRA stats for seq_length={seq_len}. Add it to _LORA_SEQ_STATS.")
avg_seqlen2, avg_tokens = _LORA_SEQ_STATS[seq_len]

hs = cfg.model.hidden_size
n_layers = cfg.model.num_layers
n_heads = cfg.model.num_attention_heads
ffn_hs = cfg.model.ffn_hidden_size
vocab_size = cfg.model.vocab_size
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

The LoRA path uses the raw vocab_size while the non-LoRA path uses the padded vocab size — this inconsistency may skew FLOP estimates.

Line 208 reads cfg.model.vocab_size directly, while the non-LoRA transformer_flops path calls calculate_padded_vocab_size(...) (line 358), which accounts for tensor-parallel padding. As a result, the embedding/logit term of the LoRA FLOPs estimate will be slightly lower than its padded-vocab equivalent, making LoRA FLOPs non-comparable to the non-LoRA baseline. If this is intentional, add a comment explaining why; otherwise, align the LoRA path with the padded-vocab calculation.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/megatron/bridge/training/utils/flop_utils.py` at line 208, The LoRA
branch sets vocab_size from cfg.model.vocab_size while the non‑LoRA
transformer_flops path uses calculate_padded_vocab_size(...), causing
inconsistent embedding/logit FLOP estimates; update the LoRA path to call
calculate_padded_vocab_size(cfg.model.vocab_size, tp_size, ...) (same args used
by transformer_flops) and use that padded value for embedding/logit FLOP
calculation (or, if intentional, add a clear comment in the LoRA branch
explaining why raw vocab_size is used). Locate the vocab_size assignment in the
LoRA code path and align it with the calculate_padded_vocab_size usage
referenced in transformer_flops.


model_flops_frozen = (
avg_tokens
* n_layers
* hs**2
* (
12
+ 12 * num_query_groups / n_heads
+ 18 * ffn_hs / hs
+ 6 * vocab_size / (n_layers * hs)
)
)
model_flops_unfrozen = n_layers * hs**2 * (12 * avg_seqlen2 / hs)

return batch_size * (model_flops_frozen * (2.0 / 3.0) + model_flops_unfrozen)
# MoE.
if cfg.model.num_moe_experts is None:
# Every Transformer MLP is dense.
Expand Down