Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions scripts/performance/configs/llama/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .llama3_llm_finetune import (
llama3_8b_sft_config_gb200,
llama3_8b_sft_config_h100,
llama3_70b_lora_config_b200,
llama3_70b_lora_config_gb200,
llama3_70b_lora_config_gb300,
llama3_70b_lora_config_h100,
Expand Down Expand Up @@ -63,6 +64,12 @@
LLAMA3_70B_LORA_CONFIG_GB200_BF16_V1,
LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1,
LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1,
LLAMA3_70B_LORA_CONFIG_B200_BF16_V1,
LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1,
LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1,
LLAMA3_70B_LORA_CONFIG_B300_BF16_V1,
LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1,
LLAMA3_70B_LORA_CONFIG_B300_FP8_MX_V1,
LLAMA3_70B_LORA_CONFIG_GB300_BF16_V1,
LLAMA3_70B_LORA_CONFIG_GB300_FP8_CS_V1,
LLAMA3_70B_LORA_CONFIG_GB300_FP8_MX_V1,
Expand Down Expand Up @@ -205,6 +212,12 @@
"LLAMA3_70B_LORA_CONFIG_GB200_BF16_V1",
"LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1",
"LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1",
"LLAMA3_70B_LORA_CONFIG_B200_BF16_V1",
"LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1",
"LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1",
"LLAMA3_70B_LORA_CONFIG_B300_BF16_V1",
"LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1",
"LLAMA3_70B_LORA_CONFIG_B300_FP8_MX_V1",
"LLAMA3_70B_LORA_CONFIG_GB300_BF16_V1",
"LLAMA3_70B_LORA_CONFIG_GB300_FP8_CS_V1",
"LLAMA3_70B_LORA_CONFIG_GB300_FP8_MX_V1",
Expand Down Expand Up @@ -276,6 +289,7 @@
"llama3_8b_sft_config_h100",
"llama3_70b_sft_config_gb200",
"llama3_70b_sft_config_h100",
"llama3_70b_lora_config_b200",
"llama3_70b_lora_config_gb200",
"llama3_70b_lora_config_gb300",
"llama3_70b_lora_config_h100",
Expand Down
66 changes: 66 additions & 0 deletions scripts/performance/configs/llama/llama3_llm_finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,72 @@ def llama3_70b_lora_config_gb200(precision: str = "bf16", config_variant: str =
return cfg


def llama3_70b_lora_config_b300(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer:
    """Build the B300 LoRA fine-tuning config for Llama3-70B.

    Args:
        precision: Compute dtype name (e.g. "bf16", "fp8_cs"); forwarded
            upper-cased to the workload-base lookup and used to pick the
            precision config.
        config_variant: Workload-base config variant selector.

    Returns:
        A fully populated ``ConfigContainer`` for the LoRA run.
    """
    precision_config = get_precision_config(precision)
    workload_defaults = get_workload_base_config(
        model_family_name="llama",
        model_recipe_name="llama3_70b",
        task="lora",
        gpu="b300",
        compute_dtype=precision.upper(),
        config_variant=config_variant,
    )

    cfg = llama3_70b_finetune_config(
        peft="lora",
        precision_config=precision_config,
        packed_sequence=True,
        seq_length=4096,
    )
    set_llama3_common_peft_configs(cfg)
    set_workload_base_configs(cfg, workload_defaults)

    # Packed sequences + CUDA graphs require fixed-shape cu_seqlens tensors
    # across batches; padding them avoids NaNs in the attention kernels.
    cfg.dataset.packed_sequence_specs.pad_cu_seqlens = True
    cfg.dataset.dataset_kwargs["pad_to_max_length"] = True

    # Only overlap TP communication when TP is actually sharded.
    uses_tp = cfg.model.tensor_model_parallel_size > 1
    cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=uses_tp)

    # Restrict LoRA adapters to the fused QKV projection.
    cfg.peft.target_modules = ["linear_qkv"]

    return cfg


def llama3_70b_lora_config_b200(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer:
    """Build the B200 LoRA fine-tuning config for Llama3-70B.

    Args:
        precision: Compute dtype name (e.g. "bf16", "fp8_cs"); forwarded
            upper-cased to the workload-base lookup and used to pick the
            precision config.
        config_variant: Workload-base config variant selector.

    Returns:
        A fully populated ``ConfigContainer`` for the LoRA run.
    """
    cfg = llama3_70b_finetune_config(
        peft="lora",
        precision_config=get_precision_config(precision),
        packed_sequence=True,
        seq_length=4096,
    )
    set_llama3_common_peft_configs(cfg)
    set_workload_base_configs(
        cfg,
        get_workload_base_config(
            model_family_name="llama",
            model_recipe_name="llama3_70b",
            task="lora",
            gpu="b200",
            compute_dtype=precision.upper(),
            config_variant=config_variant,
        ),
    )

    # CUDA graphs with packed sequences need consistent cu_seqlens shapes
    # across batches; padding them also avoids NaNs in attention kernels.
    cfg.dataset.packed_sequence_specs.pad_cu_seqlens = True
    cfg.dataset.dataset_kwargs["pad_to_max_length"] = True

    # TP communication overlap is only meaningful when TP size exceeds 1.
    cfg.comm_overlap = CommOverlapConfig(
        tp_comm_overlap=cfg.model.tensor_model_parallel_size > 1
    )

    # Apply LoRA adapters to the fused QKV projection only.
    cfg.peft.target_modules = ["linear_qkv"]

    return cfg


def llama3_70b_lora_config_h100(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer:
"""H100, LORA config."""
base_cfg = get_workload_base_config(
Expand Down
45 changes: 45 additions & 0 deletions scripts/performance/configs/llama/llama3_workload_base_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,45 @@
LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1 = LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1


# Base workload config for Llama3-70B LoRA on B300 (single node, 8 GPUs).
# Derived from the shared 70B base via dataclasses.replace; only the fields
# listed here differ from BASE_LLAMA3_70B_CONFIG.
_LLAMA3_70B_LORA_CONFIG_B300 = replace(
    BASE_LLAMA3_70B_CONFIG,
    num_gpus=8,
    peft="lora",
    tensor_model_parallel_size=1,
    pipeline_model_parallel_size=1,
    context_parallel_size=1,
    micro_batch_size=1,
    global_batch_size=32,
    cuda_graph_impl="transformer_engine",
    cuda_graph_scope="mlp",
)

# BF16 uses the base as-is (same object, not a copy).
LLAMA3_70B_LORA_CONFIG_B300_BF16_V1 = _LLAMA3_70B_LORA_CONFIG_B300
# FP8 variants bump pipeline parallelism to 2 — presumably a memory/perf
# tuning choice for FP8 on B300; TODO confirm against benchmark results.
LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1 = replace(
    _LLAMA3_70B_LORA_CONFIG_B300,
    pipeline_model_parallel_size=2,
)
# FP8-MX aliases the FP8-CS config (shares the same object).
LLAMA3_70B_LORA_CONFIG_B300_FP8_MX_V1 = LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1


# Base workload config for Llama3-70B LoRA on B200 (single node, 8 GPUs).
# Unlike the B300 variant, pipeline parallelism is 2 for all precisions —
# presumably to fit the 70B model in B200 memory; TODO confirm.
_LLAMA3_70B_LORA_CONFIG_B200 = replace(
    BASE_LLAMA3_70B_CONFIG,
    num_gpus=8,
    peft="lora",
    tensor_model_parallel_size=1,
    pipeline_model_parallel_size=2,
    context_parallel_size=1,
    micro_batch_size=1,
    global_batch_size=32,
    cuda_graph_impl="transformer_engine",
    cuda_graph_scope="mlp",
)

# All three precision variants alias the same base object — no per-precision
# overrides on B200.
LLAMA3_70B_LORA_CONFIG_B200_BF16_V1 = _LLAMA3_70B_LORA_CONFIG_B200
LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1 = _LLAMA3_70B_LORA_CONFIG_B200
LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1 = _LLAMA3_70B_LORA_CONFIG_B200


_LLAMA3_70B_LORA_CONFIG_H100 = replace(
BASE_LLAMA3_70B_CONFIG,
num_gpus=8,
Expand Down Expand Up @@ -699,6 +738,12 @@
"LLAMA3_70B_LORA_CONFIG_GB200_BF16_V1",
"LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1",
"LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1",
"LLAMA3_70B_LORA_CONFIG_B300_BF16_V1",
"LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1",
"LLAMA3_70B_LORA_CONFIG_B300_FP8_MX_V1",
"LLAMA3_70B_LORA_CONFIG_B200_BF16_V1",
"LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1",
"LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1",
"LLAMA3_70B_LORA_CONFIG_H100_BF16_V1",
"LLAMA3_70B_LORA_CONFIG_H100_FP8_CS_V1",
"LLAMA3_70B_LORA_CONFIG_GB300_BF16_V1",
Expand Down