diff --git a/scripts/performance/configs/llama/__init__.py b/scripts/performance/configs/llama/__init__.py
index 61f6c6bc93..c70555dd4e 100644
--- a/scripts/performance/configs/llama/__init__.py
+++ b/scripts/performance/configs/llama/__init__.py
@@ -9,6 +9,8 @@
     from .llama3_llm_finetune import (
         llama3_8b_sft_config_gb200,
         llama3_8b_sft_config_h100,
+        llama3_70b_lora_config_b200,
+        llama3_70b_lora_config_b300,
         llama3_70b_lora_config_gb200,
         llama3_70b_lora_config_gb300,
         llama3_70b_lora_config_h100,
@@ -60,6 +62,12 @@
     LLAMA3_8B_SFT_CONFIG_GB200_FP8_MX_V1,
     LLAMA3_8B_SFT_CONFIG_H100_BF16_V1,
     LLAMA3_8B_SFT_CONFIG_H100_FP8_CS_V1,
+    LLAMA3_70B_LORA_CONFIG_B200_BF16_V1,
+    LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1,
+    LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1,
+    LLAMA3_70B_LORA_CONFIG_B300_BF16_V1,
+    LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1,
+    LLAMA3_70B_LORA_CONFIG_B300_FP8_MX_V1,
     LLAMA3_70B_LORA_CONFIG_GB200_BF16_V1,
     LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1,
     LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1,
@@ -205,6 +213,12 @@
     "LLAMA3_70B_LORA_CONFIG_GB200_BF16_V1",
     "LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1",
     "LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1",
+    "LLAMA3_70B_LORA_CONFIG_B200_BF16_V1",
+    "LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1",
+    "LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1",
+    "LLAMA3_70B_LORA_CONFIG_B300_BF16_V1",
+    "LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1",
+    "LLAMA3_70B_LORA_CONFIG_B300_FP8_MX_V1",
     "LLAMA3_70B_LORA_CONFIG_GB300_BF16_V1",
     "LLAMA3_70B_LORA_CONFIG_GB300_FP8_CS_V1",
     "LLAMA3_70B_LORA_CONFIG_GB300_FP8_MX_V1",
@@ -276,6 +290,8 @@
             "llama3_8b_sft_config_h100",
             "llama3_70b_sft_config_gb200",
             "llama3_70b_sft_config_h100",
+            "llama3_70b_lora_config_b200",
+            "llama3_70b_lora_config_b300",
             "llama3_70b_lora_config_gb200",
             "llama3_70b_lora_config_gb300",
             "llama3_70b_lora_config_h100",
diff --git a/scripts/performance/configs/llama/llama3_llm_finetune.py b/scripts/performance/configs/llama/llama3_llm_finetune.py
index 0464d7b3bc..39f458ad70 100644
--- a/scripts/performance/configs/llama/llama3_llm_finetune.py
+++ b/scripts/performance/configs/llama/llama3_llm_finetune.py
@@ -262,6 +262,72 @@ def llama3_70b_lora_config_gb200(precision: str = "bf16", config_variant: str =
     return cfg
 
 
+def llama3_70b_lora_config_b300(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer:
+    """Return the Llama3 70B LoRA config for B300, with the matching workload base config applied."""
+    base_cfg = get_workload_base_config(
+        model_family_name="llama",
+        model_recipe_name="llama3_70b",
+        task="lora",
+        gpu="b300",
+        compute_dtype=precision.upper(),
+        config_variant=config_variant,
+    )
+    precision_config = get_precision_config(precision)
+
+    cfg = llama3_70b_finetune_config(
+        peft="lora",
+        precision_config=precision_config,
+        packed_sequence=True,
+        seq_length=4096,
+    )
+    set_llama3_common_peft_configs(cfg)
+    set_workload_base_configs(cfg, base_cfg)
+    # Enable pad_cu_seqlens for CUDA graphs compatibility with packed sequences.
+    # This ensures consistent cu_seqlens tensor shapes across batches, which is required
+    # for CUDA graphs and avoids NaN issues in attention kernels.
+    cfg.dataset.packed_sequence_specs.pad_cu_seqlens = True
+    cfg.dataset.dataset_kwargs["pad_to_max_length"] = True
+    cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=bool(cfg.model.tensor_model_parallel_size > 1))
+
+    # Override target_modules so that LoRA is applied only to linear_qkv.
+    cfg.peft.target_modules = ["linear_qkv"]
+
+    return cfg
+
+
+def llama3_70b_lora_config_b200(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer:
+    """Return the Llama3 70B LoRA config for B200, with the matching workload base config applied."""
+    base_cfg = get_workload_base_config(
+        model_family_name="llama",
+        model_recipe_name="llama3_70b",
+        task="lora",
+        gpu="b200",
+        compute_dtype=precision.upper(),
+        config_variant=config_variant,
+    )
+    precision_config = get_precision_config(precision)
+
+    cfg = llama3_70b_finetune_config(
+        peft="lora",
+        precision_config=precision_config,
+        packed_sequence=True,
+        seq_length=4096,
+    )
+    set_llama3_common_peft_configs(cfg)
+    set_workload_base_configs(cfg, base_cfg)
+    # Enable pad_cu_seqlens for CUDA graphs compatibility with packed sequences.
+    # This ensures consistent cu_seqlens tensor shapes across batches, which is required
+    # for CUDA graphs and avoids NaN issues in attention kernels.
+    cfg.dataset.packed_sequence_specs.pad_cu_seqlens = True
+    cfg.dataset.dataset_kwargs["pad_to_max_length"] = True
+    cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=bool(cfg.model.tensor_model_parallel_size > 1))
+
+    # Override target_modules so that LoRA is applied only to linear_qkv.
+    cfg.peft.target_modules = ["linear_qkv"]
+
+    return cfg
+
+
 def llama3_70b_lora_config_h100(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer:
     """H100, LORA config."""
     base_cfg = get_workload_base_config(
diff --git a/scripts/performance/configs/llama/llama3_workload_base_configs.py b/scripts/performance/configs/llama/llama3_workload_base_configs.py
index 0f60d5d057..4f8a7e0fb4 100644
--- a/scripts/performance/configs/llama/llama3_workload_base_configs.py
+++ b/scripts/performance/configs/llama/llama3_workload_base_configs.py
@@ -606,6 +606,45 @@
 LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1 = LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1
 
 
+_LLAMA3_70B_LORA_CONFIG_B300 = replace(
+    BASE_LLAMA3_70B_CONFIG,
+    num_gpus=8,
+    peft="lora",
+    tensor_model_parallel_size=1,
+    pipeline_model_parallel_size=1,
+    context_parallel_size=1,
+    micro_batch_size=1,
+    global_batch_size=32,
+    cuda_graph_impl="transformer_engine",
+    cuda_graph_scope="mlp",
+)
+
+LLAMA3_70B_LORA_CONFIG_B300_BF16_V1 = _LLAMA3_70B_LORA_CONFIG_B300  # BF16 uses the shared B300 settings as-is
+LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1 = replace(
+    _LLAMA3_70B_LORA_CONFIG_B300,
+    pipeline_model_parallel_size=2,  # the only override relative to the shared B300 settings (which use 1)
+)
+LLAMA3_70B_LORA_CONFIG_B300_FP8_MX_V1 = LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1  # FP8_MX reuses the FP8_CS config
+
+
+_LLAMA3_70B_LORA_CONFIG_B200 = replace(
+    BASE_LLAMA3_70B_CONFIG,
+    num_gpus=8,
+    peft="lora",
+    tensor_model_parallel_size=1,
+    pipeline_model_parallel_size=2,
+    context_parallel_size=1,
+    micro_batch_size=1,
+    global_batch_size=32,
+    cuda_graph_impl="transformer_engine",
+    cuda_graph_scope="mlp",
+)
+
+LLAMA3_70B_LORA_CONFIG_B200_BF16_V1 = _LLAMA3_70B_LORA_CONFIG_B200  # all three B200 precisions share one config
+LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1 = _LLAMA3_70B_LORA_CONFIG_B200
+LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1 = _LLAMA3_70B_LORA_CONFIG_B200
+
+
 _LLAMA3_70B_LORA_CONFIG_H100 = replace(
     BASE_LLAMA3_70B_CONFIG,
     num_gpus=8,
@@ -705,6 +744,12 @@
     "LLAMA3_70B_LORA_CONFIG_GB200_BF16_V1",
     "LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1",
     "LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1",
+    "LLAMA3_70B_LORA_CONFIG_B300_BF16_V1",
+    "LLAMA3_70B_LORA_CONFIG_B300_FP8_CS_V1",
+    "LLAMA3_70B_LORA_CONFIG_B300_FP8_MX_V1",
+    "LLAMA3_70B_LORA_CONFIG_B200_BF16_V1",
+    "LLAMA3_70B_LORA_CONFIG_B200_FP8_CS_V1",
+    "LLAMA3_70B_LORA_CONFIG_B200_FP8_MX_V1",
     "LLAMA3_70B_LORA_CONFIG_H100_BF16_V1",
     "LLAMA3_70B_LORA_CONFIG_H100_FP8_CS_V1",
     "LLAMA3_70B_LORA_CONFIG_GB300_BF16_V1",