diff --git a/llm/llama-4-finetuning/configs/scout_17B_16E_full.yaml b/llm/llama-4-finetuning/configs/scout_17B_16E_full.yaml
new file mode 100644
index 00000000000..54fdb56032e
--- /dev/null
+++ b/llm/llama-4-finetuning/configs/scout_17B_16E_full.yaml
@@ -0,0 +1,106 @@
+# Config for multi-device full finetuning in full_finetune_distributed.py
+# using a Llama4 17Bx16E MoE model
+#
+# This config assumes that you've run the following command before launching:
+# tune download meta-llama/Llama-4-Scout-17B-16E-Instruct
+#
+# To launch on 8 devices, run the following command from root:
+# tune run --nproc_per_node 8 full_finetune_distributed --config llama4/scout_17B_16E_full
+#
+# You can add specific overrides through the command line. For example, to use a larger bsz:
+# tune run --nproc_per_node 8 full_finetune_distributed --config llama4/scout_17B_16E_full batch_size=8
+#
+# This config was only tested on 8xA100 machine and 16xH100 machines.
+
+output_dir: /tmp/torchtune/llama4_17Bx16E/full
+
+# Modeling arguments
+model:
+  _component_: torchtune.models.llama4.llama4_scout_17b_16e
+
+tensor_parallel_dim: 2 # For multi-node training we recommend tensor_parallel_dim: 8
+tensor_parallel_plan:
+  _component_: torchtune.models.llama4.decoder_only_tp_plan
+data_parallel_shard_dim: -1 # Will infer based on TP dim, effectively controls FSDP
+data_parallel_replicate_dim: 1
+
+tokenizer:
+  _component_: torchtune.models.llama4.llama4_transform
+  path: /tmp/Llama-4-Scout-17B-16E-Instruct/tokenizer.model
+  max_seq_len: null
+  max_num_tiles: 16
+
+checkpointer:
+  _component_: torchtune.training.FullModelHFCheckpointer
+  checkpoint_dir: /tmp/Llama-4-Scout-17B-16E-Instruct
+  checkpoint_files:
+    filename_format: model-{}-of-{}.safetensors
+    max_filename: "00050"
+  recipe_checkpoint: null
+  output_dir: ${output_dir}
+  model_type: LLAMA4
+resume_from_checkpoint: False
+
+# Dataset and Sampler
+dataset:
+  _component_: torchtune.datasets.alpaca_cleaned_dataset
+  packed: False # True increases speed
+  split: train[:95%]
+seed: null
+shuffle: True
+
+# Validation
+run_val_every_n_steps: null # Change to an integer to enable validation every N steps
+dataset_val:
+  _component_: torchtune.datasets.alpaca_cleaned_dataset
+  split: train[95%:]
+batch_size_val: ${batch_size}
+
+# Training arguments
+epochs: 1
+max_steps_per_epoch: null
+batch_size: 1
+gradient_accumulation_steps: 1 # Use to increase effective batch size
+optimizer:
+  _component_: torch.optim.AdamW
+  lr: 2e-5
+  fused: False
+optimizer_in_bwd: False
+loss:
+  _component_: torchtune.modules.loss.LinearCrossEntropyLoss
+clip_grad_norm: null
+
+# cuda, cpu, rocm, xpu...
+device: cuda
+
+# Memory management / performance
+enable_activation_checkpointing: True
+enable_activation_offloading: False
+fsdp_cpu_offload: True
+# compile True means use torch.compile for all components
+# compile False means no torch.compile
+# compile Dictionary with keys: "model", "loss", "optimizer_step"
+# enables torch.compile only for specified components.
+compile: False
+#   model: True
+#   loss: True
+#   optimizer_step: False
+#   scale_grads: True
+
+# Reduced precision
+dtype: bf16
+
+# Log metrics during training
+metric_logger:
+  _component_: torchtune.training.metric_logging.WandBLogger
+  # to log to disk:
+  # _component_: torchtune.training.metric_logging.DiskLogger
+  log_dir: ${output_dir}/logs
+log_every_n_steps: 1
+log_peak_memory_stats: True
+log_level: INFO # DEBUG, WARN, etc.
+
+# Useful for understanding how to optimize memory and performance
+profiler:
+  _component_: torchtune.training.setup_torch_profiler
+  enabled: False
diff --git a/llm/llama-4-finetuning/llama-4-maverick-lora.yaml b/llm/llama-4-finetuning/llama-4-maverick-lora.yaml
index b2bbd60e7dd..8d4d96e28e4 100644
--- a/llm/llama-4-finetuning/llama-4-maverick-lora.yaml
+++ b/llm/llama-4-finetuning/llama-4-maverick-lora.yaml
@@ -2,13 +2,17 @@
 #
 # Usage:
 #
-# HF_TOKEN=xxx sky launch llama-4-maverick-lora.yaml -c maverick --env HF_TOKEN
+# HF_TOKEN=xxx sky launch llama-4-maverick-lora.yaml -c maverick --env HF_TOKEN
 #
 # This config requires at least 2 nodes with 8x H100 GPUs each.
 
 envs:
   HF_TOKEN:
 
+# Required if `report_to: wandb` in `configs/llama4_lora_sft.yaml`
+# secrets:
+#   WANDB_API_KEY:
+
 resources:
   infra: k8s
   cpus: 100+
@@ -37,30 +41,36 @@
 setup: |
   conda create -n training python=3.10 -y
   conda activate training
 
-  # Download the repository configuration package
+  # Install CUDA toolkit
   wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
-
-  # Install the keyring package
   sudo dpkg -i cuda-keyring_1.1-1_all.deb
-
-  # Update package list
   sudo apt-get update
-
-  #sudo apt-get install cuda-minimal-build-12-6 -y
   sudo apt-get install cuda-toolkit-12-6 -y
 
-  git clone -b v0.9.3 --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
+  # Install LLaMA Factory and dependencies
+  git clone https://github.com/hiyouga/LLaMA-Factory.git
   cd LLaMA-Factory
+  git checkout 767b344
   pip install -e ".[torch,metrics,deepspeed]" --no-build-isolation
-  pip install "transformers>=4.51.1"
-
+  pip install "transformers>=4.51.1" "huggingface_hub>=0.34.0,<1.0"
+  hf download meta-llama/Llama-4-Maverick-17B-128E-Instruct
 run: |
   conda activate training
-
-  MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
+  # Configure W&B if API key is set
+  if [ -n "$WANDB_API_KEY" ]; then
+    export WANDB_PROJECT=llama4-finetuning
+    export WANDB_NAME=llama4-run
+    export WANDB_RUN_ID=$SKYPILOT_TASK_ID
+    echo "W&B tracking enabled"
+    pip install wandb
+  fi
+  export FORCE_TORCHRUN=1
+  export MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
+  export MASTER_PORT=29500
+  export NNODES=$SKYPILOT_NUM_NODES
+  export NODE_RANK=$SKYPILOT_NODE_RANK
   echo "Starting distributed finetuning, head node: $MASTER_ADDR"
   cd LLaMA-Factory
-
-  HF_TOKEN=$HF_TOKEN FORCE_TORCHRUN=1 NNODES=$SKYPILOT_NUM_NODES NODE_RANK=$SKYPILOT_NODE_RANK MASTER_ADDR=$MASTER_ADDR MASTER_PORT=29500 llamafactory-cli train /configs/llama4_lora_sft.yaml
+  llamafactory-cli train /configs/llama4_lora_sft.yaml
\ No newline at end of file
diff --git a/llm/llama-4-finetuning/llama-4-maverick-sft.yaml b/llm/llama-4-finetuning/llama-4-maverick-sft.yaml
index 6828d232ef4..92b36f3ef5b 100644
--- a/llm/llama-4-finetuning/llama-4-maverick-sft.yaml
+++ b/llm/llama-4-finetuning/llama-4-maverick-sft.yaml
@@ -2,13 +2,17 @@
 #
 # Usage:
 #
-# HF_TOKEN=xxx sky launch llama-4-maverick-sft.yaml -c maverick --env HF_TOKEN
+# HF_TOKEN=xxx sky launch llama-4-maverick-sft.yaml -c maverick --env HF_TOKEN
 #
 # This config requires at least 4 nodes with 8x H200 GPUs each.
 
 envs:
   HF_TOKEN:
 
+# Required if using W&B for experiment tracking
+# secrets:
+#   WANDB_API_KEY:
+
 resources:
   cpus: 100+
   memory: 1000+
@@ -43,8 +47,15 @@ setup: |
 
 run: |
   conda activate training
-
-  MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
+  # Configure W&B if API key is set
+  if [ -n "$WANDB_API_KEY" ]; then
+    export WANDB_PROJECT=llama4-finetuning
+    export WANDB_NAME=llama4-maverick-sft-run
+    export WANDB_RUN_ID=$SKYPILOT_TASK_ID
+    echo "W&B tracking enabled"
+    pip install wandb
+  fi
+  export MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
   echo "Starting distributed finetuning, head node: $MASTER_ADDR"
 
   tune run \
diff --git a/llm/llama-4-finetuning/llama-4-maverick.yaml b/llm/llama-4-finetuning/llama-4-maverick.yaml
index 301af16b728..81f65840477 100644
--- a/llm/llama-4-finetuning/llama-4-maverick.yaml
+++ b/llm/llama-4-finetuning/llama-4-maverick.yaml
@@ -2,13 +2,17 @@
 #
 # Usage:
 #
-# HF_TOKEN=xxx sky launch llama-4-maverick.yaml -c maverick --env HF_TOKEN
+# HF_TOKEN=xxx sky launch llama-4-maverick.yaml -c maverick --env HF_TOKEN
 #
 # This config requires at least 2 nodes with 8x H200 GPUs each.
 
 envs:
   HF_TOKEN:
 
+# Required if using W&B for experiment tracking
+# secrets:
+#   WANDB_API_KEY:
+
 resources:
   cpus: 100+
   memory: 1000+
@@ -43,8 +47,15 @@ setup: |
 
 run: |
   conda activate training
-
-  MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
+  # Configure W&B if API key is set
+  if [ -n "$WANDB_API_KEY" ]; then
+    export WANDB_PROJECT=llama4-finetuning
+    export WANDB_NAME=llama4-maverick-run
+    export WANDB_RUN_ID=$SKYPILOT_TASK_ID
+    echo "W&B tracking enabled"
+    pip install wandb
+  fi
+  export MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
   echo "Starting distributed finetuning, head node: $MASTER_ADDR"
 
   tune run \
diff --git a/llm/llama-4-finetuning/llama-4-scout-sft.yaml b/llm/llama-4-finetuning/llama-4-scout-sft.yaml
index 301ca54095e..8bdc08e8a13 100644
--- a/llm/llama-4-finetuning/llama-4-scout-sft.yaml
+++ b/llm/llama-4-finetuning/llama-4-scout-sft.yaml
@@ -1,22 +1,30 @@
-# Full finetuning of Llama-4 Maverick 17B MoE model with 128 experts.
+# Full finetuning of Llama-4 Scout 17B MoE model with 16 experts.
 #
 # Usage:
 #
-# HF_TOKEN=xxx sky launch llama-4-maverick.yaml -c maverick --env HF_TOKEN
+# HF_TOKEN=xxx sky launch llama-4-scout-sft.yaml -c scout --env HF_TOKEN
 #
 # This config requires at least 2 nodes with 8x H200 GPUs each.
 
 envs:
   HF_TOKEN:
+  WANDB_NAME: ""
+
+# Required if using W&B for experiment tracking
+# secrets:
+#   WANDB_API_KEY:
 
 resources:
   cpus: 100+
   memory: 1000+
   accelerators: H100:8
+  disk_size: 1024
   disk_tier: best
 
 num_nodes: 2
 
+workdir: .
+
 # Optional: configure buckets for dataset and checkpoints. You can then use the
 # /checkpoints directory to write checkpoints, which writes to local disk first
 # and asynchronously uploads to the cloud bucket. Pass /checkpoints to the main
@@ -30,21 +38,25 @@ num_nodes: 2
 # mode: MOUNT_CACHED # MOUNT_CACHED mode will intelligently cache the checkpoint for faster writes
 
 setup: |
-  conda create -n training python=3.10 -y
-  conda activate training
-
-  # Install torch and torchtune nightly builds
-  pip install --pre --upgrade torch==2.8.0.dev20250610+cu126 torchvision==0.23.0.dev20250610+cu126 torchao==0.12.0.dev20250611+cu126 --index-url https://download.pytorch.org/whl/nightly/cu126 # full options are cpu/cu118/cu124/cu126/xpu/rocm6.2/rocm6.3/rocm6.4
-  pip install --pre --upgrade torchtune==0.7.0.dev20250610+cpu --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+  uv venv .venv --python 3.10
+  source .venv/bin/activate
+  uv pip install torch==2.9.0 torchvision==0.24.0 torchao==0.14.1
+  uv pip install git+https://github.com/meta-pytorch/torchtune.git@67ab86b94de9e7ac7dd9850113ebe69e2bbd307c
 
   # Download the model (~200 GB, may take time to download)
   tune download meta-llama/Llama-4-Scout-17B-16E-Instruct \
     --hf-token $HF_TOKEN
 
 run: |
-  conda activate training
-
-  MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
+  source .venv/bin/activate
+  # Configure W&B if API key is set
+  if [ -n "$WANDB_API_KEY" ]; then
+    export WANDB_NAME=${WANDB_NAME:-llama4-scout-sft-run}
+    export WANDB_RUN_ID=$SKYPILOT_TASK_ID
+    echo "W&B tracking enabled"
+    uv pip install wandb
+  fi
+  export MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
   echo "Starting distributed finetuning, head node: $MASTER_ADDR"
 
   tune run \
@@ -54,7 +66,7 @@ run: |
     --rdzv_backend c10d \
     --rdzv_endpoint=$MASTER_ADDR:29500 \
     full_finetune_distributed \
-    --config llama4/scout_17B_16E_full \
+    --config configs/scout_17B_16E_full.yaml \
     model_dir=/tmp/Llama-4-Scout-17B-16E-Instruct \
     max_steps_per_epoch=10 \
    epochs=1
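
With these changes applied, the Scout run is launched exactly as the usage comment in llama-4-scout-sft.yaml describes. A minimal workflow sketch follows; the cluster name "scout" and running from the llm/llama-4-finetuning/ directory are assumptions for illustration, not part of the diff:

  # Launch the 2-node Scout SFT task; HF_TOKEN is forwarded to the task via --env.
  HF_TOKEN=xxx sky launch llama-4-scout-sft.yaml -c scout --env HF_TOKEN

  # Stream the training logs from the head node, then tear the cluster down when finished.
  sky logs scout
  sky down scout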