NVIDIA-NeMo · malay-nagda · Feb 23, 2026 · Feb 17, 2026 · Feb 17, 2026 · Feb 18, 2026
diff --git a/scripts/performance/configs/llama/llama3_workload_base_configs.py b/scripts/performance/configs/llama/llama3_workload_base_configs.py
@@ -500,8 +500,8 @@
     num_gpus=32,
     peft="none",
     tensor_model_parallel_size=1,
-    pipeline_model_parallel_size=4,
-    virtual_pipeline_model_parallel_size=5,
+    pipeline_model_parallel_size=2,
+    virtual_pipeline_model_parallel_size=20,
     micro_batch_size=1,
     global_batch_size=32,
     cuda_graph_impl="transformer_engine",
@@ -517,17 +517,20 @@
     BASE_LLAMA3_70B_CONFIG,
     num_gpus=32,
     peft="none",
-    tensor_model_parallel_size=2,
-    pipeline_model_parallel_size=4,
-    virtual_pipeline_model_parallel_size=5,
+    tensor_model_parallel_size=1,
+    pipeline_model_parallel_size=8,
+    virtual_pipeline_model_parallel_size=10,
     micro_batch_size=1,
     global_batch_size=32,
     cuda_graph_impl="transformer_engine",
     cuda_graph_scope="mlp",
 )
 
 LLAMA3_70B_SFT_CONFIG_GB200_BF16_V1 = _LLAMA3_70B_SFT_CONFIG_GB200
-LLAMA3_70B_SFT_CONFIG_GB200_FP8_CS_V1 = _LLAMA3_70B_SFT_CONFIG_GB200
+LLAMA3_70B_SFT_CONFIG_GB200_FP8_CS_V1 = replace(
+    _LLAMA3_70B_SFT_CONFIG_GB200,
+    pipeline_model_parallel_size=4,
+)
 LLAMA3_70B_SFT_CONFIG_GB200_FP8_MX_V1 = _LLAMA3_70B_SFT_CONFIG_GB200
 
 
@@ -543,8 +546,11 @@
 )
 
 LLAMA3_70B_SFT_CONFIG_H100_BF16_V1 = _LLAMA3_70B_SFT_CONFIG_H100
-LLAMA3_70B_SFT_CONFIG_H100_FP8_CS_V1 = _LLAMA3_70B_SFT_CONFIG_H100
-
+LLAMA3_70B_SFT_CONFIG_H100_FP8_CS_V1 = replace(
+    _LLAMA3_70B_SFT_CONFIG_H100,
+    cuda_graph_impl="transformer_engine",
+    cuda_graph_scope="mlp",
+)
 
 # =============================================================================
 # Llama3 70B finetune (LoRA) presets - V1 (only version)

diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py
@@ -292,6 +292,12 @@ def _set_model_specific_environment_variables(
             if model_family_name == "deepseek":
                 if compute_dtype == "fp8_mx":
                     del_cudnn_ln = False
+        if model_family_name == "llama" and train_task == "sft":
+            # Llama SFT: skip the del_cudnn_ln cleanup below. TODO: Verify for H100 and 8b
+            del_cudnn_ln = False
+            if gpu == "h100" and model_recipe_name == "llama3_70b" and compute_dtype == "fp8_cs":
+                executor.env_vars["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+                executor.env_vars["NCCL_GRAPH_REGISTER"] = "0"
         if del_cudnn_ln:
             if "NVTE_NORM_FWD_USE_CUDNN" in executor.env_vars:
                 executor.env_vars.pop("NVTE_NORM_FWD_USE_CUDNN")
@@ -448,6 +454,9 @@ def setup(self, task: Union["run.Partial", "run.Script"], executor: "run.Executo
             if self.model_recipe_name in ["llama3_70b", "llama31_405b"] and self.train_task == "pretrain"
             else None
         )
+        if self.model_family_name == "llama" and self.train_task == "sft":
+            # Override only for Llama SFT; reassigning unconditionally would reset the value chosen above to None.
+            nccl_pp_comm_chunksize = 2097152
         self._set_nccl_pp_comm_chunksize(task, executor, nccl_pp_comm_chunksize, pp_size)
 
         # Configure manual garbage collection