From 65949ed002e0729381ff3720d86b72f1d965371f Mon Sep 17 00:00:00 2001 From: Raghav Hrishikeshan Mukundan Date: Mon, 16 Feb 2026 14:17:30 -0800 Subject: [PATCH 1/4] Onboarding LLAMA3 70B LoRA to B300 and B200 chips Signed-off-by: Raghav Hrishikeshan Mukundan --- scripts/performance/configs/llama/__init__.py | 8 +++ .../configs/llama/llama3_llm_finetune.py | 66 +++++++++++++++++++ .../llama/llama3_workload_base_configs.py | 45 +++++++++++++ 3 files changed, 119 insertions(+) diff --git a/scripts/performance/configs/llama/__init__.py b/scripts/performance/configs/llama/__init__.py index 61f6c6bc93..dd9f49e832 100644 --- a/scripts/performance/configs/llama/__init__.py +++ b/scripts/performance/configs/llama/__init__.py @@ -9,6 +9,7 @@ from .llama3_llm_finetune import ( llama3_8b_sft_config_gb200, llama3_8b_sft_config_h100, + llama3_70b_lora_config_b200, llama3_70b_lora_config_gb200, llama3_70b_lora_config_gb300, llama3_70b_lora_config_h100, @@ -63,6 +64,9 @@ LLAMA3_70B_LORA_CONFIG_GB200_BF16_V1, LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1, LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1, + LLAMA3_70B_LORA_CONFIG_B200_BF16_V1, + LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1, + LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1, LLAMA3_70B_LORA_CONFIG_GB300_BF16_V1, LLAMA3_70B_LORA_CONFIG_GB300_FP8_CS_V1, LLAMA3_70B_LORA_CONFIG_GB300_FP8_MX_V1, @@ -205,6 +209,9 @@ "LLAMA3_70B_LORA_CONFIG_GB200_BF16_V1", "LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1", "LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1", + "LLAMA3_70B_LORA_CONFIG_B200_BF16_V1", + "LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1", + "LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1", "LLAMA3_70B_LORA_CONFIG_GB300_BF16_V1", "LLAMA3_70B_LORA_CONFIG_GB300_FP8_CS_V1", "LLAMA3_70B_LORA_CONFIG_GB300_FP8_MX_V1", @@ -276,6 +283,7 @@ "llama3_8b_sft_config_h100", "llama3_70b_sft_config_gb200", "llama3_70b_sft_config_h100", + "llama3_70b_lora_config_b200", "llama3_70b_lora_config_gb200", "llama3_70b_lora_config_gb300", "llama3_70b_lora_config_h100", diff --git 
a/scripts/performance/configs/llama/llama3_llm_finetune.py b/scripts/performance/configs/llama/llama3_llm_finetune.py index eb8038e366..114df86a50 100644 --- a/scripts/performance/configs/llama/llama3_llm_finetune.py +++ b/scripts/performance/configs/llama/llama3_llm_finetune.py @@ -259,6 +259,72 @@ def llama3_70b_lora_config_gb200(precision: str = "bf16", config_variant: str = return cfg +def llama3_70b_lora_config_b300(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer: + """B300, LORA config.""" + base_cfg = get_workload_base_config( + model_family_name="llama", + model_recipe_name="llama3_70b", + task="lora", + gpu="b300", + compute_dtype=precision.upper(), + config_variant=config_variant, + ) + precision_config = get_precision_config(precision) + + cfg = llama3_70b_finetune_config( + peft="lora", + precision_config=precision_config, + packed_sequence=True, + seq_length=4096, + ) + set_llama3_common_peft_configs(cfg) + set_workload_base_configs(cfg, base_cfg) + # Enable pad_cu_seqlens for CUDA graphs compatibility with packed sequences. + # This ensures consistent cu_seqlens tensor shapes across batches, which is required + # for CUDA graphs and avoids NaN issues in attention kernels. 
+ cfg.dataset.packed_sequence_specs.pad_cu_seqlens = True + cfg.dataset.dataset_kwargs["pad_to_max_length"] = True + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=bool(cfg.model.tensor_model_parallel_size > 1)) + + # Override target_modules to only apply LoRA to QKV + cfg.peft.target_modules = ["linear_qkv"] + + return cfg + + +def llama3_70b_lora_config_b200(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer: + """B200, LORA config.""" + base_cfg = get_workload_base_config( + model_family_name="llama", + model_recipe_name="llama3_70b", + task="lora", + gpu="b200", + compute_dtype=precision.upper(), + config_variant=config_variant, + ) + precision_config = get_precision_config(precision) + + cfg = llama3_70b_finetune_config( + peft="lora", + precision_config=precision_config, + packed_sequence=True, + seq_length=4096, + ) + set_llama3_common_peft_configs(cfg) + set_workload_base_configs(cfg, base_cfg) + # Enable pad_cu_seqlens for CUDA graphs compatibility with packed sequences. + # This ensures consistent cu_seqlens tensor shapes across batches, which is required + # for CUDA graphs and avoids NaN issues in attention kernels. 
+ cfg.dataset.packed_sequence_specs.pad_cu_seqlens = True + cfg.dataset.dataset_kwargs["pad_to_max_length"] = True + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=bool(cfg.model.tensor_model_parallel_size > 1)) + + # Override target_modules to only apply LoRA to QKV + cfg.peft.target_modules = ["linear_qkv"] + + return cfg + + def llama3_70b_lora_config_h100(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer: """H100, LORA config.""" base_cfg = get_workload_base_config( diff --git a/scripts/performance/configs/llama/llama3_workload_base_configs.py b/scripts/performance/configs/llama/llama3_workload_base_configs.py index affd813dba..e5129f9069 100644 --- a/scripts/performance/configs/llama/llama3_workload_base_configs.py +++ b/scripts/performance/configs/llama/llama3_workload_base_configs.py @@ -600,6 +600,45 @@ LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1 = LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1 +_LLAMA3_70B_LORA_CONFIG_B300 = replace( + BASE_LLAMA3_70B_CONFIG, + num_gpus=8, + peft="lora", + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + context_parallel_size=1, + micro_batch_size=1, + global_batch_size=32, + cuda_graph_impl="transformer_engine", + cuda_graph_scope="mlp", +) + +LLAMA3_70B_LORA_CONFIG_B300_BF16_V1 = _LLAMA3_70B_LORA_CONFIG_B300 +LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1 = replace( + _LLAMA3_70B_LORA_CONFIG_B300, + pipeline_model_parallel_size=2, +) +LLAMA3_70B_LORA_CONFIG_B300_FP8_MX_V1 = LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1 + + +_LLAMA3_70B_LORA_CONFIG_B200 = replace( + BASE_LLAMA3_70B_CONFIG, + num_gpus=8, + peft="lora", + tensor_model_parallel_size=1, + pipeline_model_parallel_size=2, + context_parallel_size=1, + micro_batch_size=1, + global_batch_size=32, + cuda_graph_impl="transformer_engine", + cuda_graph_scope="mlp", +) + +LLAMA3_70B_LORA_CONFIG_B200_BF16_V1 = _LLAMA3_70B_LORA_CONFIG_B200 +LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1 = _LLAMA3_70B_LORA_CONFIG_B200 +LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1 = 
_LLAMA3_70B_LORA_CONFIG_B200 + + _LLAMA3_70B_LORA_CONFIG_H100 = replace( BASE_LLAMA3_70B_CONFIG, num_gpus=8, @@ -699,6 +738,12 @@ "LLAMA3_70B_LORA_CONFIG_GB200_BF16_V1", "LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1", "LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1", + "LLAMA3_70B_LORA_CONFIG_B300_BF16_V1", + "LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1", + "LLAMA3_70B_LORA_CONFIG_B300_FP8_MX_V1", + "LLAMA3_70B_LORA_CONFIG_B200_BF16_V1", + "LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1", + "LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1", "LLAMA3_70B_LORA_CONFIG_H100_BF16_V1", "LLAMA3_70B_LORA_CONFIG_H100_FP8_CS_V1", "LLAMA3_70B_LORA_CONFIG_GB300_BF16_V1", From d17a146f8c7a4cf29ade968115be1c4cd407661e Mon Sep 17 00:00:00 2001 From: Raghav Hrishikeshan Mukundan Date: Mon, 16 Feb 2026 18:18:30 -0800 Subject: [PATCH 2/4] Fix b300 import Signed-off-by: Raghav Hrishikeshan Mukundan --- scripts/performance/configs/llama/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/performance/configs/llama/__init__.py b/scripts/performance/configs/llama/__init__.py index dd9f49e832..99c6c3e2f7 100644 --- a/scripts/performance/configs/llama/__init__.py +++ b/scripts/performance/configs/llama/__init__.py @@ -67,6 +67,9 @@ LLAMA3_70B_LORA_CONFIG_B200_BF16_V1, LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1, LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1, + LLAMA3_70B_LORA_CONFIG_B300_BF16_V1, + LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1, + LLAMA3_70B_LORA_CONFIG_B300_FP8_MX_V1, LLAMA3_70B_LORA_CONFIG_GB300_BF16_V1, LLAMA3_70B_LORA_CONFIG_GB300_FP8_CS_V1, LLAMA3_70B_LORA_CONFIG_GB300_FP8_MX_V1, @@ -212,6 +215,9 @@ "LLAMA3_70B_LORA_CONFIG_B200_BF16_V1", "LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1", "LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1", + "LLAMA3_70B_LORA_CONFIG_B300_BF16_V1", + "LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1", + "LLAMA3_70B_LORA_CONFIG_B300_FP8_MX_V1", "LLAMA3_70B_LORA_CONFIG_GB300_BF16_V1", "LLAMA3_70B_LORA_CONFIG_GB300_FP8_CS_V1", "LLAMA3_70B_LORA_CONFIG_GB300_FP8_MX_V1", From 
7fa8131eeccdf49bb7ea08703883e2d2875081f9 Mon Sep 17 00:00:00 2001 From: Raghav Hrishikeshan Mukundan Date: Tue, 17 Feb 2026 05:43:05 -0800 Subject: [PATCH 3/4] Fix B300 Imports Signed-off-by: Raghav Hrishikeshan Mukundan --- scripts/performance/configs/llama/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/performance/configs/llama/__init__.py b/scripts/performance/configs/llama/__init__.py index 99c6c3e2f7..33f390b3ba 100644 --- a/scripts/performance/configs/llama/__init__.py +++ b/scripts/performance/configs/llama/__init__.py @@ -10,6 +10,7 @@ llama3_8b_sft_config_gb200, llama3_8b_sft_config_h100, llama3_70b_lora_config_b200, + llama3_70b_lora_config_b300, llama3_70b_lora_config_gb200, llama3_70b_lora_config_gb300, llama3_70b_lora_config_h100, @@ -290,6 +291,7 @@ "llama3_70b_sft_config_gb200", "llama3_70b_sft_config_h100", "llama3_70b_lora_config_b200", + "llama3_70b_lora_config_b300", "llama3_70b_lora_config_gb200", "llama3_70b_lora_config_gb300", "llama3_70b_lora_config_h100", From dd660c470961a6aa61b7390ae292574240157919 Mon Sep 17 00:00:00 2001 From: Raghav Hrishikeshan Mukundan Date: Thu, 26 Feb 2026 22:20:48 -0800 Subject: [PATCH 4/4] Fix linter errors Signed-off-by: Raghav Hrishikeshan Mukundan --- scripts/performance/configs/llama/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/performance/configs/llama/__init__.py b/scripts/performance/configs/llama/__init__.py index 33f390b3ba..c70555dd4e 100644 --- a/scripts/performance/configs/llama/__init__.py +++ b/scripts/performance/configs/llama/__init__.py @@ -62,15 +62,15 @@ LLAMA3_8B_SFT_CONFIG_GB200_FP8_MX_V1, LLAMA3_8B_SFT_CONFIG_H100_BF16_V1, LLAMA3_8B_SFT_CONFIG_H100_FP8_CS_V1, - LLAMA3_70B_LORA_CONFIG_GB200_BF16_V1, - LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1, - LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1, LLAMA3_70B_LORA_CONFIG_B200_BF16_V1, LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1, LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1, 
LLAMA3_70B_LORA_CONFIG_B300_BF16_V1, LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1, LLAMA3_70B_LORA_CONFIG_B300_FP8_MX_V1, + LLAMA3_70B_LORA_CONFIG_GB200_BF16_V1, + LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1, + LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1, LLAMA3_70B_LORA_CONFIG_GB300_BF16_V1, LLAMA3_70B_LORA_CONFIG_GB300_FP8_CS_V1, LLAMA3_70B_LORA_CONFIG_GB300_FP8_MX_V1,