skypilot-org · alex000kim · Oct 30, 2025 · Oct 31, 2025 · Oct 31, 2025 · Nov 2, 2025
diff --git a/llm/llama-4-finetuning/configs/scout_17B_16E_full.yaml b/llm/llama-4-finetuning/configs/scout_17B_16E_full.yaml
@@ -0,0 +1,106 @@
+# Config for multi-device full finetuning in full_finetune_distributed.py
+# using a Llama4 17Bx16E MoE model
+#
+# This config assumes that you've run the following command before launching:
+#   tune download meta-llama/Llama-4-Scout-17B-16E-Instruct
+#
+# To launch on 8 devices, run the following command from root:
+#   tune run --nproc_per_node 8 full_finetune_distributed --config llama4/scout_17B_16E_full
+#
+# You can add specific overrides through the command line. For example, to use a larger bsz:
+#   tune run --nproc_per_node 8 full_finetune_distributed --config llama4/scout_17B_16E_full batch_size=8
+#
+# This config was only tested on an 8xA100 machine and 16xH100 machines.
+
+output_dir: /tmp/torchtune/llama4_17Bx16E/full  # reused below via ${output_dir} (checkpointer.output_dir, metric_logger.log_dir)
+
+# Modeling arguments
+model:
+  _component_: torchtune.models.llama4.llama4_scout_17b_16e
+
+tensor_parallel_dim: 2 # For multi-node training we recommend tensor_parallel_dim: 8. NOTE(review): llama-4-scout-sft.yaml runs this config on 2 nodes — confirm 2 is intended
+tensor_parallel_plan:
+  _component_: torchtune.models.llama4.decoder_only_tp_plan
+data_parallel_shard_dim: -1 # Will infer based on TP dim, effectively controls FSDP
+data_parallel_replicate_dim: 1  # NOTE(review): presumably 1 means no replication across data-parallel groups — confirm
+
+tokenizer:
+  _component_: torchtune.models.llama4.llama4_transform
+  path: /tmp/Llama-4-Scout-17B-16E-Instruct/tokenizer.model  # lives inside checkpointer.checkpoint_dir
+  max_seq_len: null  # NOTE(review): presumably null disables truncation — confirm
+  max_num_tiles: 16
+
+checkpointer:
+  _component_: torchtune.training.FullModelHFCheckpointer
+  checkpoint_dir: /tmp/Llama-4-Scout-17B-16E-Instruct
+  checkpoint_files:
+    filename_format: model-{}-of-{}.safetensors
+    max_filename: "00050"  # quoted so the value stays a string and keeps its leading zeros
+  recipe_checkpoint: null
+  output_dir: ${output_dir}  # interpolated from the top-level output_dir
+  model_type: LLAMA4
+resume_from_checkpoint: False
+
+# Dataset and Sampler
+dataset:
+  _component_: torchtune.datasets.alpaca_cleaned_dataset
+  packed: False  # True increases speed
+  split: train[:95%]  # first 95% of the train split; the remaining 5% is used by dataset_val
+seed: null
+shuffle: True
+
+# Validation
+run_val_every_n_steps: null  # Change to an integer to enable validation every N steps
+dataset_val:
+  _component_: torchtune.datasets.alpaca_cleaned_dataset
+  split: train[95%:]  # last 5% of the train split, disjoint from the training slice above
+batch_size_val: ${batch_size}  # interpolated from batch_size under "Training arguments"
+
+# Training arguments
+epochs: 1  # llama-4-scout-sft.yaml also passes epochs=1 on the command line
+max_steps_per_epoch: null  # llama-4-scout-sft.yaml overrides this with max_steps_per_epoch=10
+batch_size: 1  # also reused by batch_size_val through ${batch_size}
+gradient_accumulation_steps: 1 # Use to increase effective batch size
+optimizer:
+  _component_: torch.optim.AdamW
+  lr: 2e-5
+  fused: False
+optimizer_in_bwd: False
+loss:
+  _component_: torchtune.modules.loss.LinearCrossEntropyLoss
+clip_grad_norm: null  # NOTE(review): presumably null disables gradient clipping — confirm
+
+# cuda, cpu, rocm, xpu...
+device: cuda
+
+# Memory management / performance
+enable_activation_checkpointing: True
+enable_activation_offloading: False
+fsdp_cpu_offload: True
+# compile: True  -> use torch.compile for all components
+# compile: False -> no torch.compile
+# compile: <dictionary> with keys such as "model", "loss", "optimizer_step"
+# (the example below also lists "scale_grads") enables torch.compile only for the specified components.
+compile: False
+#    model: True
+#    loss: True
+#    optimizer_step: False
+#    scale_grads: True
+
+# Reduced precision
+dtype: bf16
+
+# Log metrics during training
+metric_logger:
+  _component_: torchtune.training.metric_logging.WandBLogger  # NOTE(review): presumably needs the `wandb` package and W&B credentials at runtime — confirm
+  # to log to disk instead, use:
+  # _component_: torchtune.training.metric_logging.DiskLogger
+  log_dir: ${output_dir}/logs  # logs are written under the top-level output_dir
+log_every_n_steps: 1
+log_peak_memory_stats: True
+log_level: INFO  # DEBUG, WARN, etc.
+
+# Useful for understanding how to optimize memory and performance
+profiler:
+  _component_: torchtune.training.setup_torch_profiler
+  enabled: False
diff --git a/llm/llama-4-finetuning/llama-4-maverick-lora.yaml b/llm/llama-4-finetuning/llama-4-maverick-lora.yaml
@@ -2,13 +2,17 @@
 #
 # Usage:
 #
-#  HF_TOKEN=xxx sky launch llama-4-maverick-lora.yaml -c maverick --env HF_TOKEN
+# HF_TOKEN=xxx sky launch llama-4-maverick-lora.yaml -c maverick --env HF_TOKEN
 #
 # This config requires at least 2 nodes with 8x H100 GPUs each.
 
 envs:
   HF_TOKEN:
 
+# Required if `report_to: wandb` is set in `configs/llama4_lora_sft.yaml`
+# secrets:
+#   WANDB_API_KEY:
+
 resources:
   infra: k8s
   cpus: 100+
@@ -37,30 +41,36 @@ setup: |
   conda create -n training python=3.10 -y
   conda activate training
 
-  # Download the repository configuration package
+  # Install CUDA toolkit
   wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
-
-  # Install the keyring package
   sudo dpkg -i cuda-keyring_1.1-1_all.deb
-
-  # Update package list
   sudo apt-get update
-
-  #sudo apt-get install cuda-minimal-build-12-6 -y
   sudo apt-get install cuda-toolkit-12-6 -y
 
-  git clone -b v0.9.3 --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
+  # Install LLaMA Factory and dependencies
+  git clone https://github.com/hiyouga/LLaMA-Factory.git
   cd LLaMA-Factory
+  git checkout 767b344
   pip install -e ".[torch,metrics,deepspeed]" --no-build-isolation
-  pip install "transformers>=4.51.1"
-
+  pip install "transformers>=4.51.1" "huggingface_hub>=0.34.0,<1.0"
+  hf download meta-llama/Llama-4-Maverick-17B-128E-Instruct
 
 run: |
   conda activate training
-
-  MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
+  # Configure W&B only when WANDB_API_KEY is non-empty (e.g. supplied through the `secrets` block)
+  if [ -n "$WANDB_API_KEY" ]; then
+    export WANDB_PROJECT=llama4-finetuning
+    export WANDB_NAME=llama4-run
+    export WANDB_RUN_ID=$SKYPILOT_TASK_ID  # W&B run ID is taken from the SkyPilot task ID
+    echo "W&B tracking enabled"
+    pip install wandb  # installed only when tracking is enabled
+  fi
+  export FORCE_TORCHRUN=1  # exported (with the variables below) instead of being passed inline to llamafactory-cli
+  export MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)  # first line of SKYPILOT_NODE_IPS acts as the head node
+  export MASTER_PORT=29500
+  export NNODES=$SKYPILOT_NUM_NODES
+  export NODE_RANK=$SKYPILOT_NODE_RANK
   echo "Starting distributed finetuning, head node: $MASTER_ADDR"
 
   cd LLaMA-Factory
-
-  HF_TOKEN=$HF_TOKEN FORCE_TORCHRUN=1 NNODES=$SKYPILOT_NUM_NODES NODE_RANK=$SKYPILOT_NODE_RANK MASTER_ADDR=$MASTER_ADDR MASTER_PORT=29500 llamafactory-cli train /configs/llama4_lora_sft.yaml
+  llamafactory-cli train /configs/llama4_lora_sft.yaml  # presumably reads the distributed settings exported above from the environment
diff --git a/llm/llama-4-finetuning/llama-4-maverick-sft.yaml b/llm/llama-4-finetuning/llama-4-maverick-sft.yaml
@@ -2,13 +2,17 @@
 #
 # Usage:
 #
-#  HF_TOKEN=xxx sky launch llama-4-maverick-sft.yaml -c maverick --env HF_TOKEN
+# HF_TOKEN=xxx sky launch llama-4-maverick-sft.yaml -c maverick --env HF_TOKEN
 #
 # This config requires at least 4 nodes with 8x H200 GPUs each.
 
 envs:
   HF_TOKEN:
 
+# Required if using W&B for experiment tracking
+# secrets:
+#   WANDB_API_KEY:
+
 resources:
   cpus: 100+
   memory: 1000+
@@ -43,8 +47,15 @@ setup: |
 
 run: |
   conda activate training
-
-  MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
+  # Configure W&B only when WANDB_API_KEY is non-empty (e.g. supplied through the `secrets` block)
+  if [ -n "$WANDB_API_KEY" ]; then
+    export WANDB_PROJECT=llama4-finetuning
+    export WANDB_NAME=llama4-maverick-sft-run
+    export WANDB_RUN_ID=$SKYPILOT_TASK_ID  # W&B run ID is taken from the SkyPilot task ID
+    echo "W&B tracking enabled"
+    pip install wandb  # NOTE(review): installed only on this branch — confirm the tune config does not need wandb otherwise
+  fi
+  export MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)  # first line of SKYPILOT_NODE_IPS acts as the head node
   echo "Starting distributed finetuning, head node: $MASTER_ADDR"
 
   tune run \

diff --git a/llm/llama-4-finetuning/llama-4-maverick.yaml b/llm/llama-4-finetuning/llama-4-maverick.yaml
@@ -2,13 +2,17 @@
 #
 # Usage:
 #
-#  HF_TOKEN=xxx sky launch llama-4-maverick.yaml -c maverick --env HF_TOKEN
+# HF_TOKEN=xxx sky launch llama-4-maverick.yaml -c maverick --env HF_TOKEN
 #
 # This config requires at least 2 nodes with 8x H200 GPUs each.
 
 envs:
   HF_TOKEN:
 
+# Required if using W&B for experiment tracking
+# secrets:
+#   WANDB_API_KEY:
+
 resources:
   cpus: 100+
   memory: 1000+
@@ -43,8 +47,15 @@ setup: |
 
 run: |
   conda activate training
-
-  MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
+  # Configure W&B only when WANDB_API_KEY is non-empty (e.g. supplied through the `secrets` block)
+  if [ -n "$WANDB_API_KEY" ]; then
+    export WANDB_PROJECT=llama4-finetuning
+    export WANDB_NAME=llama4-maverick-run
+    export WANDB_RUN_ID=$SKYPILOT_TASK_ID  # W&B run ID is taken from the SkyPilot task ID
+    echo "W&B tracking enabled"
+    pip install wandb  # NOTE(review): installed only on this branch — confirm the tune config does not need wandb otherwise
+  fi
+  export MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)  # first line of SKYPILOT_NODE_IPS acts as the head node
   echo "Starting distributed finetuning, head node: $MASTER_ADDR"
 
   tune run \

diff --git a/llm/llama-4-finetuning/llama-4-scout-sft.yaml b/llm/llama-4-finetuning/llama-4-scout-sft.yaml
@@ -1,22 +1,30 @@
-# Full finetuning of Llama-4 Maverick 17B MoE model with 128 experts.
+# Full finetuning of Llama-4 Scout 17B MoE model with 16 experts.
 #
 # Usage:
 #
-#  HF_TOKEN=xxx sky launch llama-4-maverick.yaml -c maverick --env HF_TOKEN
+# HF_TOKEN=xxx sky launch llama-4-scout-sft.yaml -c scout --env HF_TOKEN
 #
-# This config requires at least 2 nodes with 8x H200 GPUs each.
+# This config requires at least 2 nodes with 8x H100 GPUs each.
 
 envs:
   HF_TOKEN:
+  WANDB_NAME: ""  # empty by default; the run script then falls back to llama4-scout-sft-run
+
+# Required if using W&B for experiment tracking (the run script checks WANDB_API_KEY)
+# secrets:
+#   WANDB_API_KEY:
 
 resources:
   cpus: 100+
   memory: 1000+
   accelerators: H100:8
+  disk_size: 1024  # the model downloaded in setup is ~200 GB
   disk_tier: best
 
 num_nodes: 2
 
+workdir: .  # presumably synced so configs/scout_17B_16E_full.yaml resolves for `tune run --config` — confirm
+
 # Optional: configure buckets for dataset and checkpoints. You can then use the
 # /checkpoints directory to write checkpoints, which writes to local disk first
 # and asynchronously uploads to the cloud bucket. Pass /checkpoints to the main
@@ -30,21 +38,25 @@ num_nodes: 2
 #    mode: MOUNT_CACHED  # MOUNT_CACHED mode will intelligently cache the checkpoint for faster writes
 
 setup: |
-  conda create -n training python=3.10 -y
-  conda activate training
-
-  # Install torch and torchtune nightly builds
-  pip install --pre --upgrade torch==2.8.0.dev20250610+cu126 torchvision==0.23.0.dev20250610+cu126 torchao==0.12.0.dev20250611+cu126 --index-url https://download.pytorch.org/whl/nightly/cu126 # full options are cpu/cu118/cu124/cu126/xpu/rocm6.2/rocm6.3/rocm6.4
-  pip install --pre --upgrade torchtune==0.7.0.dev20250610+cpu --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+  uv venv .venv --python 3.10  # replaces the conda env; the run section activates this same .venv
+  source .venv/bin/activate
+  uv pip install torch==2.9.0 torchvision==0.24.0 torchao==0.14.1  # pinned versions instead of the previous nightly builds
+  uv pip install git+https://github.com/meta-pytorch/torchtune.git@67ab86b94de9e7ac7dd9850113ebe69e2bbd307c  # torchtune pinned to a specific commit
 
   # Download the model (~200 GB, may take time to download)
   tune download meta-llama/Llama-4-Scout-17B-16E-Instruct \
     --hf-token $HF_TOKEN
 
 run: |
-  conda activate training
-
-  MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
+  source .venv/bin/activate
+  uv pip install wandb  # always needed: configs/scout_17B_16E_full.yaml logs through WandBLogger
+  if [ -n "$WANDB_API_KEY" ]; then
+    export WANDB_NAME="${WANDB_NAME:-llama4-scout-sft-run}" WANDB_RUN_ID="$SKYPILOT_TASK_ID"
+    echo "W&B tracking enabled"
+  else
+    export WANDB_MODE=disabled  # no API key: turn wandb into a no-op instead of failing at import/login
+  fi
+  export MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
   echo "Starting distributed finetuning, head node: $MASTER_ADDR"
 
   tune run \
@@ -54,7 +66,7 @@ run: |
   --rdzv_backend c10d \
   --rdzv_endpoint=$MASTER_ADDR:29500 \
   full_finetune_distributed \
-  --config llama4/scout_17B_16E_full \
+  --config configs/scout_17B_16E_full.yaml \
   model_dir=/tmp/Llama-4-Scout-17B-16E-Instruct \
   max_steps_per_epoch=10 \
   epochs=1