From 65949ed002e0729381ff3720d86b72f1d965371f Mon Sep 17 00:00:00 2001 From: Raghav Hrishikeshan Mukundan Date: Mon, 16 Feb 2026 14:17:30 -0800 Subject: [PATCH 1/4] Onboarding LLAMA3 70B LoRA to B300 and B200 chips Signed-off-by: Raghav Hrishikeshan Mukundan --- scripts/performance/configs/llama/__init__.py | 8 +++ .../configs/llama/llama3_llm_finetune.py | 66 +++++++++++++++++++ .../llama/llama3_workload_base_configs.py | 45 +++++++++++++ 3 files changed, 119 insertions(+) diff --git a/scripts/performance/configs/llama/__init__.py b/scripts/performance/configs/llama/__init__.py index 61f6c6bc93..dd9f49e832 100644 --- a/scripts/performance/configs/llama/__init__.py +++ b/scripts/performance/configs/llama/__init__.py @@ -9,6 +9,7 @@ from .llama3_llm_finetune import ( llama3_8b_sft_config_gb200, llama3_8b_sft_config_h100, + llama3_70b_lora_config_b200, llama3_70b_lora_config_gb200, llama3_70b_lora_config_gb300, llama3_70b_lora_config_h100, @@ -63,6 +64,9 @@ LLAMA3_70B_LORA_CONFIG_GB200_BF16_V1, LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1, LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1, + LLAMA3_70B_LORA_CONFIG_B200_BF16_V1, + LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1, + LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1, LLAMA3_70B_LORA_CONFIG_GB300_BF16_V1, LLAMA3_70B_LORA_CONFIG_GB300_FP8_CS_V1, LLAMA3_70B_LORA_CONFIG_GB300_FP8_MX_V1, @@ -205,6 +209,9 @@ "LLAMA3_70B_LORA_CONFIG_GB200_BF16_V1", "LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1", "LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1", + "LLAMA3_70B_LORA_CONFIG_B200_BF16_V1", + "LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1", + "LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1", "LLAMA3_70B_LORA_CONFIG_GB300_BF16_V1", "LLAMA3_70B_LORA_CONFIG_GB300_FP8_CS_V1", "LLAMA3_70B_LORA_CONFIG_GB300_FP8_MX_V1", @@ -276,6 +283,7 @@ "llama3_8b_sft_config_h100", "llama3_70b_sft_config_gb200", "llama3_70b_sft_config_h100", + "llama3_70b_lora_config_b200", "llama3_70b_lora_config_gb200", "llama3_70b_lora_config_gb300", "llama3_70b_lora_config_h100", diff --git 
a/scripts/performance/configs/llama/llama3_llm_finetune.py b/scripts/performance/configs/llama/llama3_llm_finetune.py index eb8038e366..114df86a50 100644 --- a/scripts/performance/configs/llama/llama3_llm_finetune.py +++ b/scripts/performance/configs/llama/llama3_llm_finetune.py @@ -259,6 +259,72 @@ def llama3_70b_lora_config_gb200(precision: str = "bf16", config_variant: str = return cfg +def llama3_70b_lora_config_b300(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer: + """B300, LORA config.""" + base_cfg = get_workload_base_config( + model_family_name="llama", + model_recipe_name="llama3_70b", + task="lora", + gpu="b300", + compute_dtype=precision.upper(), + config_variant=config_variant, + ) + precision_config = get_precision_config(precision) + + cfg = llama3_70b_finetune_config( + peft="lora", + precision_config=precision_config, + packed_sequence=True, + seq_length=4096, + ) + set_llama3_common_peft_configs(cfg) + set_workload_base_configs(cfg, base_cfg) + # Enable pad_cu_seqlens for CUDA graphs compatibility with packed sequences. + # This ensures consistent cu_seqlens tensor shapes across batches, which is required + # for CUDA graphs and avoids NaN issues in attention kernels. 
+ cfg.dataset.packed_sequence_specs.pad_cu_seqlens = True + cfg.dataset.dataset_kwargs["pad_to_max_length"] = True + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=bool(cfg.model.tensor_model_parallel_size > 1)) + + # Override target_modules to only apply LoRA to QKV + cfg.peft.target_modules = ["linear_qkv"] + + return cfg + + +def llama3_70b_lora_config_b200(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer: + """B200, LORA config.""" + base_cfg = get_workload_base_config( + model_family_name="llama", + model_recipe_name="llama3_70b", + task="lora", + gpu="b200", + compute_dtype=precision.upper(), + config_variant=config_variant, + ) + precision_config = get_precision_config(precision) + + cfg = llama3_70b_finetune_config( + peft="lora", + precision_config=precision_config, + packed_sequence=True, + seq_length=4096, + ) + set_llama3_common_peft_configs(cfg) + set_workload_base_configs(cfg, base_cfg) + # Enable pad_cu_seqlens for CUDA graphs compatibility with packed sequences. + # This ensures consistent cu_seqlens tensor shapes across batches, which is required + # for CUDA graphs and avoids NaN issues in attention kernels. 
+ cfg.dataset.packed_sequence_specs.pad_cu_seqlens = True + cfg.dataset.dataset_kwargs["pad_to_max_length"] = True + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=bool(cfg.model.tensor_model_parallel_size > 1)) + + # Override target_modules to only apply LoRA to QKV + cfg.peft.target_modules = ["linear_qkv"] + + return cfg + + def llama3_70b_lora_config_h100(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer: """H100, LORA config.""" base_cfg = get_workload_base_config( diff --git a/scripts/performance/configs/llama/llama3_workload_base_configs.py b/scripts/performance/configs/llama/llama3_workload_base_configs.py index affd813dba..e5129f9069 100644 --- a/scripts/performance/configs/llama/llama3_workload_base_configs.py +++ b/scripts/performance/configs/llama/llama3_workload_base_configs.py @@ -600,6 +600,45 @@ LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1 = LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1 +_LLAMA3_70B_LORA_CONFIG_B300 = replace( + BASE_LLAMA3_70B_CONFIG, + num_gpus=8, + peft="lora", + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + context_parallel_size=1, + micro_batch_size=1, + global_batch_size=32, + cuda_graph_impl="transformer_engine", + cuda_graph_scope="mlp", +) + +LLAMA3_70B_LORA_CONFIG_B300_BF16_V1 = _LLAMA3_70B_LORA_CONFIG_B300 +LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1 = replace( + _LLAMA3_70B_LORA_CONFIG_B300, + pipeline_model_parallel_size=2, +) +LLAMA3_70B_LORA_CONFIG_B300_FP8_MX_V1 = LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1 + + +_LLAMA3_70B_LORA_CONFIG_B200 = replace( + BASE_LLAMA3_70B_CONFIG, + num_gpus=8, + peft="lora", + tensor_model_parallel_size=1, + pipeline_model_parallel_size=2, + context_parallel_size=1, + micro_batch_size=1, + global_batch_size=32, + cuda_graph_impl="transformer_engine", + cuda_graph_scope="mlp", +) + +LLAMA3_70B_LORA_CONFIG_B200_BF16_V1 = _LLAMA3_70B_LORA_CONFIG_B200 +LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1 = _LLAMA3_70B_LORA_CONFIG_B200 +LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1 = 
_LLAMA3_70B_LORA_CONFIG_B200 + + _LLAMA3_70B_LORA_CONFIG_H100 = replace( BASE_LLAMA3_70B_CONFIG, num_gpus=8, @@ -699,6 +738,12 @@ "LLAMA3_70B_LORA_CONFIG_GB200_BF16_V1", "LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1", "LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1", + "LLAMA3_70B_LORA_CONFIG_B300_BF16_V1", + "LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1", + "LLAMA3_70B_LORA_CONFIG_B300_FP8_MX_V1", + "LLAMA3_70B_LORA_CONFIG_B200_BF16_V1", + "LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1", + "LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1", "LLAMA3_70B_LORA_CONFIG_H100_BF16_V1", "LLAMA3_70B_LORA_CONFIG_H100_FP8_CS_V1", "LLAMA3_70B_LORA_CONFIG_GB300_BF16_V1", From d17a146f8c7a4cf29ade968115be1c4cd407661e Mon Sep 17 00:00:00 2001 From: Raghav Hrishikeshan Mukundan Date: Mon, 16 Feb 2026 18:18:30 -0800 Subject: [PATCH 2/4] Fix b300 import Signed-off-by: Raghav Hrishikeshan Mukundan --- scripts/performance/configs/llama/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/performance/configs/llama/__init__.py b/scripts/performance/configs/llama/__init__.py index dd9f49e832..99c6c3e2f7 100644 --- a/scripts/performance/configs/llama/__init__.py +++ b/scripts/performance/configs/llama/__init__.py @@ -67,6 +67,9 @@ LLAMA3_70B_LORA_CONFIG_B200_BF16_V1, LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1, LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1, + LLAMA3_70B_LORA_CONFIG_B300_BF16_V1, + LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1, + LLAMA3_70B_LORA_CONFIG_B300_FP8_MX_V1, LLAMA3_70B_LORA_CONFIG_GB300_BF16_V1, LLAMA3_70B_LORA_CONFIG_GB300_FP8_CS_V1, LLAMA3_70B_LORA_CONFIG_GB300_FP8_MX_V1, @@ -212,6 +215,9 @@ "LLAMA3_70B_LORA_CONFIG_B200_BF16_V1", "LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1", "LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1", + "LLAMA3_70B_LORA_CONFIG_B300_BF16_V1", + "LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1", + "LLAMA3_70B_LORA_CONFIG_B300_FP8_MX_V1", "LLAMA3_70B_LORA_CONFIG_GB300_BF16_V1", "LLAMA3_70B_LORA_CONFIG_GB300_FP8_CS_V1", "LLAMA3_70B_LORA_CONFIG_GB300_FP8_MX_V1", From 
7fa8131eeccdf49bb7ea08703883e2d2875081f9 Mon Sep 17 00:00:00 2001 From: Raghav Hrishikeshan Mukundan Date: Tue, 17 Feb 2026 05:43:05 -0800 Subject: [PATCH 3/4] Fix B300 Imports Signed-off-by: Raghav Hrishikeshan Mukundan --- scripts/performance/configs/llama/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/performance/configs/llama/__init__.py b/scripts/performance/configs/llama/__init__.py index 99c6c3e2f7..33f390b3ba 100644 --- a/scripts/performance/configs/llama/__init__.py +++ b/scripts/performance/configs/llama/__init__.py @@ -10,6 +10,7 @@ llama3_8b_sft_config_gb200, llama3_8b_sft_config_h100, llama3_70b_lora_config_b200, + llama3_70b_lora_config_b300, llama3_70b_lora_config_gb200, llama3_70b_lora_config_gb300, llama3_70b_lora_config_h100, @@ -290,6 +291,7 @@ "llama3_70b_sft_config_gb200", "llama3_70b_sft_config_h100", "llama3_70b_lora_config_b200", + "llama3_70b_lora_config_b300", "llama3_70b_lora_config_gb200", "llama3_70b_lora_config_gb300", "llama3_70b_lora_config_h100", From dd660c470961a6aa61b7390ae292574240157919 Mon Sep 17 00:00:00 2001 From: Raghav Hrishikeshan Mukundan Date: Thu, 26 Feb 2026 22:20:48 -0800 Subject: [PATCH 4/4] Fix linter errors Signed-off-by: Raghav Hrishikeshan Mukundan --- scripts/performance/configs/llama/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/performance/configs/llama/__init__.py b/scripts/performance/configs/llama/__init__.py index 33f390b3ba..c70555dd4e 100644 --- a/scripts/performance/configs/llama/__init__.py +++ b/scripts/performance/configs/llama/__init__.py @@ -62,15 +62,15 @@ LLAMA3_8B_SFT_CONFIG_GB200_FP8_MX_V1, LLAMA3_8B_SFT_CONFIG_H100_BF16_V1, LLAMA3_8B_SFT_CONFIG_H100_FP8_CS_V1, - LLAMA3_70B_LORA_CONFIG_GB200_BF16_V1, - LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1, - LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1, LLAMA3_70B_LORA_CONFIG_B200_BF16_V1, LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1, LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1, 
LLAMA3_70B_LORA_CONFIG_B300_BF16_V1, LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1, LLAMA3_70B_LORA_CONFIG_B300_FP8_MX_V1, + LLAMA3_70B_LORA_CONFIG_GB200_BF16_V1, + LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1, + LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1, LLAMA3_70B_LORA_CONFIG_GB300_BF16_V1, LLAMA3_70B_LORA_CONFIG_GB300_FP8_CS_V1, LLAMA3_70B_LORA_CONFIG_GB300_FP8_MX_V1,