Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 106 additions & 0 deletions llm/llama-4-finetuning/configs/scout_17B_16E_full.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Config for multi-device full finetuning in full_finetune_distributed.py
# using a Llama4 17Bx16E MoE model
#
# This config assumes that you've run the following command before launching:
# tune download meta-llama/Llama-4-Scout-17B-16E-Instruct
#
# To launch on 8 devices, run the following command from root:
# tune run --nproc_per_node 8 full_finetune_distributed --config llama4/scout_17B_16E_full
#
# You can add specific overrides through the command line. For example, to use a larger bsz:
# tune run --nproc_per_node 8 full_finetune_distributed --config llama4/scout_17B_16E_full batch_size=8
#
# This config has only been tested on an 8xA100 machine and on 16xH100 machines.

# Root directory for run artifacts; referenced below via ${output_dir}.
output_dir: /tmp/torchtune/llama4_17Bx16E/full

# Modeling arguments
model:
  _component_: torchtune.models.llama4.llama4_scout_17b_16e

tensor_parallel_dim: 2  # For multi-node training we recommend tensor_parallel_dim: 8
tensor_parallel_plan:
  _component_: torchtune.models.llama4.decoder_only_tp_plan
data_parallel_shard_dim: -1  # Will infer based on TP dim, effectively controls FSDP
data_parallel_replicate_dim: 1

tokenizer:
  _component_: torchtune.models.llama4.llama4_transform
  path: /tmp/Llama-4-Scout-17B-16E-Instruct/tokenizer.model
  max_seq_len: null
  max_num_tiles: 16

checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Llama-4-Scout-17B-16E-Instruct
  checkpoint_files:
    filename_format: model-{}-of-{}.safetensors
    # Quoted so the leading zeros survive YAML parsing.
    max_filename: "00050"
  recipe_checkpoint: null
  output_dir: ${output_dir}
  model_type: LLAMA4
resume_from_checkpoint: False

# Dataset and Sampler
dataset:
  _component_: torchtune.datasets.alpaca_cleaned_dataset
  packed: False  # True increases speed
  split: train[:95%]
seed: null
shuffle: True

# Validation
# The validation split is the 5% of `train` left out of the training split above.
run_val_every_n_steps: null  # Change to an integer to enable validation every N steps
dataset_val:
  _component_: torchtune.datasets.alpaca_cleaned_dataset
  split: train[95%:]
batch_size_val: ${batch_size}

# Training arguments
epochs: 1
max_steps_per_epoch: null
batch_size: 1
gradient_accumulation_steps: 1  # Use to increase effective batch size
optimizer:
  _component_: torch.optim.AdamW
  lr: 2e-5
  fused: False
optimizer_in_bwd: False
loss:
  _component_: torchtune.modules.loss.LinearCrossEntropyLoss
clip_grad_norm: null

# cuda, cpu, rocm, xpu...
device: cuda

# Memory management / performance
enable_activation_checkpointing: True
enable_activation_offloading: False
fsdp_cpu_offload: True
# compile: True  -> use torch.compile for all components
# compile: False -> no torch.compile
# compile: <dictionary with keys "model", "loss", "optimizer_step">
#   -> enable torch.compile only for the specified components, e.g.:
compile: False
# model: True
# loss: True
# optimizer_step: False
# scale_grads: True

# Reduced precision
dtype: bf16

# Log metrics during training
# NOTE(review): presumably WandBLogger requires the `wandb` package and W&B
# credentials at runtime; confirm the launch task always provides both, or
# switch to the DiskLogger line below.
metric_logger:
  _component_: torchtune.training.metric_logging.WandBLogger
  # to log to disk:
  # _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True
log_level: INFO  # DEBUG, WARN, etc.

# Useful for understanding how to optimize memory and performance
profiler:
  _component_: torchtune.training.setup_torch_profiler
  enabled: False
40 changes: 25 additions & 15 deletions llm/llama-4-finetuning/llama-4-maverick-lora.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,17 @@
#
# Usage:
#
# HF_TOKEN=xxx sky launch llama-4-maverick-lora.yaml -c maverick --env HF_TOKEN
# HF_TOKEN=xxx sky launch llama-4-maverick-lora.yaml -c maverick --env HF_TOKEN
#
# This config requires at least 2 nodes with 8x H100 GPUs each.

envs:
HF_TOKEN:

# Required if `report_to: wandb` is set in `configs/llama4_lora_sft.yaml`
# secrets:
# WANDB_API_KEY:

resources:
infra: k8s
cpus: 100+
Expand Down Expand Up @@ -37,30 +41,36 @@ setup: |
conda create -n training python=3.10 -y
conda activate training

# Download the repository configuration package
# Install CUDA toolkit
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb

# Install the keyring package
sudo dpkg -i cuda-keyring_1.1-1_all.deb

# Update package list
sudo apt-get update

#sudo apt-get install cuda-minimal-build-12-6 -y
sudo apt-get install cuda-toolkit-12-6 -y

git clone -b v0.9.3 --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
# Install LLaMA Factory and dependencies
git clone https://github.com/hiyouga/LLaMA-Factory.git
cd LLaMA-Factory
git checkout 767b344
pip install -e ".[torch,metrics,deepspeed]" --no-build-isolation
pip install "transformers>=4.51.1"

pip install "transformers>=4.51.1" "huggingface_hub>=0.34.0,<1.0"
hf download meta-llama/Llama-4-Maverick-17B-128E-Instruct

run: |
conda activate training

MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
# Configure W&B if API key is set
if [ -n "$WANDB_API_KEY" ]; then
export WANDB_PROJECT=llama4-finetuning
export WANDB_NAME=llama4-run
export WANDB_RUN_ID=$SKYPILOT_TASK_ID
echo "W&B tracking enabled"
pip install wandb
fi
export FORCE_TORCHRUN=1
export MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
export MASTER_PORT=29500
export NNODES=$SKYPILOT_NUM_NODES
export NODE_RANK=$SKYPILOT_NODE_RANK
echo "Starting distributed finetuning, head node: $MASTER_ADDR"

cd LLaMA-Factory

HF_TOKEN=$HF_TOKEN FORCE_TORCHRUN=1 NNODES=$SKYPILOT_NUM_NODES NODE_RANK=$SKYPILOT_NODE_RANK MASTER_ADDR=$MASTER_ADDR MASTER_PORT=29500 llamafactory-cli train /configs/llama4_lora_sft.yaml
llamafactory-cli train /configs/llama4_lora_sft.yaml
17 changes: 14 additions & 3 deletions llm/llama-4-finetuning/llama-4-maverick-sft.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,17 @@
#
# Usage:
#
# HF_TOKEN=xxx sky launch llama-4-maverick-sft.yaml -c maverick --env HF_TOKEN
# HF_TOKEN=xxx sky launch llama-4-maverick-sft.yaml -c maverick --env HF_TOKEN
#
# This config requires at least 4 nodes with 8x H200 GPUs each.

envs:
HF_TOKEN:

# Required if using W&B for experiment tracking
# secrets:
# WANDB_API_KEY:

resources:
cpus: 100+
memory: 1000+
Expand Down Expand Up @@ -43,8 +47,15 @@ setup: |

run: |
conda activate training

MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
# Configure W&B if API key is set
if [ -n "$WANDB_API_KEY" ]; then
export WANDB_PROJECT=llama4-finetuning
export WANDB_NAME=llama4-maverick-sft-run
export WANDB_RUN_ID=$SKYPILOT_TASK_ID
echo "W&B tracking enabled"
pip install wandb
fi
export MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
echo "Starting distributed finetuning, head node: $MASTER_ADDR"

tune run \
Expand Down
17 changes: 14 additions & 3 deletions llm/llama-4-finetuning/llama-4-maverick.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,17 @@
#
# Usage:
#
# HF_TOKEN=xxx sky launch llama-4-maverick.yaml -c maverick --env HF_TOKEN
# HF_TOKEN=xxx sky launch llama-4-maverick.yaml -c maverick --env HF_TOKEN
#
# This config requires at least 2 nodes with 8x H200 GPUs each.

envs:
HF_TOKEN:

# Required if using W&B for experiment tracking
# secrets:
# WANDB_API_KEY:

resources:
cpus: 100+
memory: 1000+
Expand Down Expand Up @@ -43,8 +47,15 @@ setup: |

run: |
conda activate training

MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
# Configure W&B if API key is set
if [ -n "$WANDB_API_KEY" ]; then
export WANDB_PROJECT=llama4-finetuning
export WANDB_NAME=llama4-maverick-run
export WANDB_RUN_ID=$SKYPILOT_TASK_ID
echo "W&B tracking enabled"
pip install wandb
fi
export MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
echo "Starting distributed finetuning, head node: $MASTER_ADDR"

tune run \
Expand Down
36 changes: 24 additions & 12 deletions llm/llama-4-finetuning/llama-4-scout-sft.yaml
Original file line number Diff line number Diff line change
@@ -1,22 +1,30 @@
# Full finetuning of Llama-4 Maverick 17B MoE model with 128 experts.
# Full finetuning of Llama-4 Scout 17B MoE model with 16 experts.
#
# Usage:
#
# HF_TOKEN=xxx sky launch llama-4-maverick.yaml -c maverick --env HF_TOKEN
# HF_TOKEN=xxx sky launch llama-4-scout-sft.yaml -c scout --env HF_TOKEN
#
# This config requires at least 2 nodes with 8x H100 GPUs each.

envs:
HF_TOKEN:
WANDB_NAME: ""

# Required if using W&B for experiment tracking
# secrets:
# WANDB_API_KEY:

resources:
cpus: 100+
memory: 1000+
accelerators: H100:8
disk_size: 1024
disk_tier: best

num_nodes: 2

workdir: .

# Optional: configure buckets for dataset and checkpoints. You can then use the
# /checkpoints directory to write checkpoints, which writes to local disk first
# and asynchronously uploads to the cloud bucket. Pass /checkpoints to the main
Expand All @@ -30,21 +38,25 @@ num_nodes: 2
# mode: MOUNT_CACHED # MOUNT_CACHED mode will intelligently cache the checkpoint for faster writes

setup: |
conda create -n training python=3.10 -y
conda activate training

# Install torch and torchtune nightly builds
pip install --pre --upgrade torch==2.8.0.dev20250610+cu126 torchvision==0.23.0.dev20250610+cu126 torchao==0.12.0.dev20250611+cu126 --index-url https://download.pytorch.org/whl/nightly/cu126 # full options are cpu/cu118/cu124/cu126/xpu/rocm6.2/rocm6.3/rocm6.4
pip install --pre --upgrade torchtune==0.7.0.dev20250610+cpu --extra-index-url https://download.pytorch.org/whl/nightly/cpu
uv venv .venv --python 3.10
source .venv/bin/activate
uv pip install torch==2.9.0 torchvision==0.24.0 torchao==0.14.1
uv pip install git+https://github.com/meta-pytorch/torchtune.git@67ab86b94de9e7ac7dd9850113ebe69e2bbd307c

# Download the model (~200 GB, may take time to download)
tune download meta-llama/Llama-4-Scout-17B-16E-Instruct \
--hf-token $HF_TOKEN

run: |
conda activate training

MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
source .venv/bin/activate
# Configure W&B if API key is set
if [ -n "$WANDB_API_KEY" ]; then
export WANDB_NAME=${WANDB_NAME:-llama4-scout-sft-run}
export WANDB_RUN_ID=$SKYPILOT_TASK_ID
echo "W&B tracking enabled"
uv pip install wandb
fi
export MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
echo "Starting distributed finetuning, head node: $MASTER_ADDR"

tune run \
Expand All @@ -54,7 +66,7 @@ run: |
--rdzv_backend c10d \
--rdzv_endpoint=$MASTER_ADDR:29500 \
full_finetune_distributed \
--config llama4/scout_17B_16E_full \
--config configs/scout_17B_16E_full.yaml \
model_dir=/tmp/Llama-4-Scout-17B-16E-Instruct \
max_steps_per_epoch=10 \
epochs=1
Loading