diff --git a/nemo/collections/common/metrics/perf_metrics.py b/nemo/collections/common/metrics/perf_metrics.py
index efc974d07f35..8bb79bb43f28 100644
--- a/nemo/collections/common/metrics/perf_metrics.py
+++ b/nemo/collections/common/metrics/perf_metrics.py
@@ -142,6 +142,7 @@ def eval_model_flops(self):
             "gpt3": flops_formulas.gpt3,
             "llama2": flops_formulas.llama2,
             "llama3": flops_formulas.llama3,
+            "llama4": flops_formulas.llama4,
             "nemotron": flops_formulas.nemotron,
             "mixtral": flops_formulas.mixtral,
             "bert": flops_formulas.bert,
diff --git a/nemo/collections/common/parts/perf_metrics_utils.py b/nemo/collections/common/parts/perf_metrics_utils.py
index 989e6b341f3a..15768ae3cab3 100644
--- a/nemo/collections/common/parts/perf_metrics_utils.py
+++ b/nemo/collections/common/parts/perf_metrics_utils.py
@@ -23,6 +23,7 @@
     "gpt3": 51200,
     "llama2": 32000,
     "llama3": 128256,
+    "llama4": 202048,
     "nemotron": 256000,
     "bert": 29000,
     "mixtral": 32000,
@@ -45,7 +46,7 @@ def read_tb_log(path: str, summary_name: str) -> List:
     files = glob.glob(f"{path}/events*tfevents*")
     files.sort(key=lambda x: os.path.getmtime(os.path.join(path, x)))
     if len(files) == 0 or not os.path.isfile(files[0]):
-        raise FileNotFoundError(f"Missing TensorBoard log file.")
+        raise FileNotFoundError("Missing TensorBoard log file.")
     events_file = files[0]
 
     try:
diff --git a/nemo/lightning/pytorch/callbacks/flops_callback.py b/nemo/lightning/pytorch/callbacks/flops_callback.py
index 86185b87f8fc..d58864c103c5 100644
--- a/nemo/lightning/pytorch/callbacks/flops_callback.py
+++ b/nemo/lightning/pytorch/callbacks/flops_callback.py
@@ -31,7 +31,7 @@
     "gpt3": flops_formulas.gpt3,
     "llama2": flops_formulas.llama2,
     "llama3": flops_formulas.llama3,
-    "llama4": flops_formulas.llama3,  # TODO: add llama4 flops formulas
+    "llama4": flops_formulas.llama4,
     "nemotron3": flops_formulas.nemotron,
     "nemotron4": flops_formulas.nemotron,
     "mixtral": flops_formulas.mixtral,
diff --git a/nemo/utils/flops_formulas.py b/nemo/utils/flops_formulas.py
index 74b88b0ad9f0..d8ac275e7e65 100644
--- a/nemo/utils/flops_formulas.py
+++ b/nemo/utils/flops_formulas.py
@@ -117,6 +117,89 @@ def llama3(config: FLOPSConfig):
     )
 
 
+def llama4(config: FLOPSConfig):
+    """Model FLOPs for Llama4 family (MoE architecture)
+
+    Llama4 models:
+    - Scout (16E): All 48 layers use MoE
+    - Maverick (128E): Alternating dense/MoE pattern via moe_layer_freq
+
+    The formula accounts for:
+    1. Attention computation (GQA) - same across all layers
+    2. Mixed FFN layers - dense and MoE based on moe_layer_freq pattern
+    3. Shared expert computation (for MoE layers)
+    4. Embedding/vocabulary projection
+    """
+    vocab_size = LLM_VOCAB_SIZE_MAP["llama4"]
+    causal_self_attn = True
+    seq_len = config.enc_seq_len
+    hidden_size = config.hs
+
+    # Attention FLOPs (same for all layers, using GQA like Llama 3)
+    # QKV projections + attention computation + output projection
+    attention_flops = (
+        config.gbs
+        * seq_len
+        * config.layers
+        * hidden_size
+        * hidden_size
+        * (
+            12  # Q and output projections
+            + (12 * config.query_groups / config.attention_heads)  # KV projections (GQA)
+            + (12 * seq_len / hidden_size) * (0.5 if causal_self_attn else 1)  # Attention computation
+        )
+    )
+
+    # FFN FLOPs - need to account for both dense and MoE layers
+    # Create moe_layer_pattern: 0=dense, 1=MoE
+    if config.moe_layer_freq is None:
+        # If no pattern specified (e.g., 16E model), all layers are MoE
+        moe_layer_pattern = [1] * config.layers
+    elif isinstance(config.moe_layer_freq, int):
+        # If integer, create pattern (e.g., freq=2 means every 2nd layer is MoE)
+        moe_layer_pattern = [1 if (i % config.moe_layer_freq == 0) else 0 for i in range(config.layers)]
+    else:
+        # If list, use it directly (e.g., [0,1]*24 for 128E model)
+        moe_layer_pattern = config.moe_layer_freq
+
+    # Count dense and MoE layers
+    num_dense_layers = sum(1 for x in moe_layer_pattern if x == 0)
+    num_moe_layers = sum(1 for x in moe_layer_pattern if x == 1)
+
+    # Dense layer FFN FLOPs (standard gated FFN)
+    # Factor of 18 = 6 (fwd+bwd) * 3 (up+gate+down projections)
+    dense_ffn_flops = 6 * config.gbs * seq_len * num_dense_layers * hidden_size * config.ffn_hs * 3
+
+    # MoE layer FFN FLOPs
+    # Shared experts (always active) + routed experts (top-k)
+    # Each expert has gated FFN: up_proj, gate_proj, down_proj
+    moe_ffn_flops = 0
+    if num_moe_layers > 0:
+        # Shared expert FLOPs (always computed)
+        shared_expert_flops = (
+            6 * config.gbs * seq_len * num_moe_layers * hidden_size * config.moe_shared_expert_intermediate_size * 3
+        )
+
+        # Routed expert FLOPs (only top-k experts per token)
+        routed_expert_flops = (
+            6
+            * config.gbs
+            * seq_len
+            * num_moe_layers
+            * hidden_size
+            * config.moe_ffn_hidden_size
+            * config.moe_router_topk
+            * 3
+        )
+
+        moe_ffn_flops = shared_expert_flops + routed_expert_flops
+
+    # Vocabulary/Embedding FLOPs
+    vocab_flops = 6 * config.gbs * seq_len * hidden_size * vocab_size
+
+    return attention_flops + dense_ffn_flops + moe_ffn_flops + vocab_flops
+
+
 def nemotron(config: FLOPSConfig):
     """Model FLOPs for nemotron family"""
     vocab_size = LLM_VOCAB_SIZE_MAP["nemotron"]
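
A minimal usage sketch of the new formula, not part of this diff: it assumes FLOPSConfig can be constructed with the fields that llama4() reads above, and the Scout-like settings (48 MoE layers, hidden size 5120, 40 heads, 8 query groups, top-1 routing) are illustrative placeholders rather than values taken from this PR.

# Hypothetical example: exercising flops_formulas.llama4 directly.
# FLOPSConfig keyword arguments and defaults are assumed, not confirmed by this diff.
from nemo.utils.flops_formulas import FLOPSConfig, llama4

cfg = FLOPSConfig(
    gbs=1,                                     # global batch size
    enc_seq_len=8192,                          # sequence length
    hs=5120,                                   # hidden size
    layers=48,                                 # Scout: all layers are MoE
    attention_heads=40,
    query_groups=8,                            # GQA
    ffn_hs=16384,                              # dense FFN hidden size (contributes 0 here, no dense layers)
    moe_layer_freq=None,                       # None -> every layer treated as MoE
    moe_ffn_hidden_size=8192,                  # per routed expert FFN hidden size
    moe_shared_expert_intermediate_size=8192,  # shared expert FFN hidden size
    moe_router_topk=1,                         # routed experts activated per token
)

print(f"Model TFLOPs per step at these settings: {llama4(cfg) / 1e12:.1f}")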