Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions scripts/performance/configs/llama/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .llama3_llm_finetune import (
llama3_8b_sft_config_gb200,
llama3_8b_sft_config_h100,
llama3_70b_lora_config_b200,
llama3_70b_lora_config_gb200,
llama3_70b_lora_config_gb300,
llama3_70b_lora_config_h100,
Expand Down Expand Up @@ -63,6 +64,12 @@
LLAMA3_70B_LORA_CONFIG_GB200_BF16_V1,
LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1,
LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1,
LLAMA3_70B_LORA_CONFIG_B200_BF16_V1,
LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1,
LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1,
LLAMA3_70B_LORA_CONFIG_B300_BF16_V1,
LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1,
LLAMA3_70B_LORA_CONFIG_B300_FP8_MX_V1,
LLAMA3_70B_LORA_CONFIG_GB300_BF16_V1,
LLAMA3_70B_LORA_CONFIG_GB300_FP8_CS_V1,
LLAMA3_70B_LORA_CONFIG_GB300_FP8_MX_V1,
Expand Down Expand Up @@ -205,6 +212,12 @@
"LLAMA3_70B_LORA_CONFIG_GB200_BF16_V1",
"LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1",
"LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1",
"LLAMA3_70B_LORA_CONFIG_B200_BF16_V1",
"LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1",
"LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1",
"LLAMA3_70B_LORA_CONFIG_B300_BF16_V1",
"LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1",
"LLAMA3_70B_LORA_CONFIG_B300_FP8_MX_V1",
"LLAMA3_70B_LORA_CONFIG_GB300_BF16_V1",
"LLAMA3_70B_LORA_CONFIG_GB300_FP8_CS_V1",
"LLAMA3_70B_LORA_CONFIG_GB300_FP8_MX_V1",
Expand Down Expand Up @@ -276,6 +289,7 @@
"llama3_8b_sft_config_h100",
"llama3_70b_sft_config_gb200",
"llama3_70b_sft_config_h100",
"llama3_70b_lora_config_b200",
"llama3_70b_lora_config_gb200",
"llama3_70b_lora_config_gb300",
"llama3_70b_lora_config_h100",
Expand Down
66 changes: 66 additions & 0 deletions scripts/performance/configs/llama/llama3_llm_finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,72 @@ def llama3_70b_lora_config_gb200(precision: str = "bf16", config_variant: str =
return cfg


def llama3_70b_lora_config_b300(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer:
    """Build the B300 LoRA fine-tuning config for Llama3-70B.

    Args:
        precision: Compute dtype name (e.g. "bf16", "fp8_cs"); forwarded
            upper-cased to the workload-base lookup and used to pick the
            precision config.
        config_variant: Workload-base config variant selector.

    Returns:
        A fully populated ``ConfigContainer`` for the LoRA run.
    """
    precision_config = get_precision_config(precision)
    workload_defaults = get_workload_base_config(
        model_family_name="llama",
        model_recipe_name="llama3_70b",
        task="lora",
        gpu="b300",
        compute_dtype=precision.upper(),
        config_variant=config_variant,
    )

    cfg = llama3_70b_finetune_config(
        peft="lora",
        precision_config=precision_config,
        packed_sequence=True,
        seq_length=4096,
    )
    set_llama3_common_peft_configs(cfg)
    set_workload_base_configs(cfg, workload_defaults)

    # Packed sequences + CUDA graphs require fixed-shape cu_seqlens tensors
    # across batches; padding them avoids NaNs in the attention kernels.
    cfg.dataset.packed_sequence_specs.pad_cu_seqlens = True
    cfg.dataset.dataset_kwargs["pad_to_max_length"] = True

    # Only overlap TP communication when TP is actually sharded.
    uses_tp = cfg.model.tensor_model_parallel_size > 1
    cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=uses_tp)

    # Restrict LoRA adapters to the fused QKV projection.
    cfg.peft.target_modules = ["linear_qkv"]

    return cfg


def llama3_70b_lora_config_b200(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer:
    """Build the B200 LoRA fine-tuning config for Llama3-70B.

    Args:
        precision: Compute dtype name (e.g. "bf16", "fp8_cs"); forwarded
            upper-cased to the workload-base lookup and used to pick the
            precision config.
        config_variant: Workload-base config variant selector.

    Returns:
        A fully populated ``ConfigContainer`` for the LoRA run.
    """
    cfg = llama3_70b_finetune_config(
        peft="lora",
        precision_config=get_precision_config(precision),
        packed_sequence=True,
        seq_length=4096,
    )
    set_llama3_common_peft_configs(cfg)
    set_workload_base_configs(
        cfg,
        get_workload_base_config(
            model_family_name="llama",
            model_recipe_name="llama3_70b",
            task="lora",
            gpu="b200",
            compute_dtype=precision.upper(),
            config_variant=config_variant,
        ),
    )

    # CUDA graphs with packed sequences need consistent cu_seqlens shapes
    # across batches; padding them also avoids NaNs in attention kernels.
    cfg.dataset.packed_sequence_specs.pad_cu_seqlens = True
    cfg.dataset.dataset_kwargs["pad_to_max_length"] = True

    # TP communication overlap is only meaningful when TP size exceeds 1.
    cfg.comm_overlap = CommOverlapConfig(
        tp_comm_overlap=cfg.model.tensor_model_parallel_size > 1
    )

    # Apply LoRA adapters to the fused QKV projection only.
    cfg.peft.target_modules = ["linear_qkv"]

    return cfg


def llama3_70b_lora_config_h100(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer:
"""H100, LORA config."""
base_cfg = get_workload_base_config(
Expand Down
45 changes: 45 additions & 0 deletions scripts/performance/configs/llama/llama3_workload_base_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,45 @@
LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1 = LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1


# Base workload config for Llama3-70B LoRA on B300 (single node, 8 GPUs).
# Derived from the shared 70B base via dataclasses.replace; only the fields
# listed here differ from BASE_LLAMA3_70B_CONFIG.
_LLAMA3_70B_LORA_CONFIG_B300 = replace(
    BASE_LLAMA3_70B_CONFIG,
    num_gpus=8,
    peft="lora",
    tensor_model_parallel_size=1,
    pipeline_model_parallel_size=1,
    context_parallel_size=1,
    micro_batch_size=1,
    global_batch_size=32,
    cuda_graph_impl="transformer_engine",
    cuda_graph_scope="mlp",
)

# BF16 uses the base as-is (same object, not a copy).
LLAMA3_70B_LORA_CONFIG_B300_BF16_V1 = _LLAMA3_70B_LORA_CONFIG_B300
# FP8 variants bump pipeline parallelism to 2 — presumably a memory/perf
# tuning choice for FP8 on B300; TODO confirm against benchmark results.
LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1 = replace(
    _LLAMA3_70B_LORA_CONFIG_B300,
    pipeline_model_parallel_size=2,
)
# FP8-MX aliases the FP8-CS config (shares the same object).
LLAMA3_70B_LORA_CONFIG_B300_FP8_MX_V1 = LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1


# Base workload config for Llama3-70B LoRA on B200 (single node, 8 GPUs).
# Unlike the B300 variant, pipeline parallelism is 2 for all precisions —
# presumably to fit the 70B model in B200 memory; TODO confirm.
_LLAMA3_70B_LORA_CONFIG_B200 = replace(
    BASE_LLAMA3_70B_CONFIG,
    num_gpus=8,
    peft="lora",
    tensor_model_parallel_size=1,
    pipeline_model_parallel_size=2,
    context_parallel_size=1,
    micro_batch_size=1,
    global_batch_size=32,
    cuda_graph_impl="transformer_engine",
    cuda_graph_scope="mlp",
)

# All three precision variants alias the same base object — no per-precision
# overrides on B200.
LLAMA3_70B_LORA_CONFIG_B200_BF16_V1 = _LLAMA3_70B_LORA_CONFIG_B200
LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1 = _LLAMA3_70B_LORA_CONFIG_B200
LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1 = _LLAMA3_70B_LORA_CONFIG_B200


_LLAMA3_70B_LORA_CONFIG_H100 = replace(
BASE_LLAMA3_70B_CONFIG,
num_gpus=8,
Expand Down Expand Up @@ -699,6 +738,12 @@
"LLAMA3_70B_LORA_CONFIG_GB200_BF16_V1",
"LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1",
"LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1",
"LLAMA3_70B_LORA_CONFIG_B300_BF16_V1",
"LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1",
"LLAMA3_70B_LORA_CONFIG_B300_FP8_MX_V1",
"LLAMA3_70B_LORA_CONFIG_B200_BF16_V1",
"LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1",
"LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1",
"LLAMA3_70B_LORA_CONFIG_H100_BF16_V1",
"LLAMA3_70B_LORA_CONFIG_H100_FP8_CS_V1",
"LLAMA3_70B_LORA_CONFIG_GB300_BF16_V1",
Expand Down