diff --git a/llm/gpt-oss-finetuning/README.md b/llm/gpt-oss-finetuning/README.md index 30e52468494..8cd995821a2 100644 --- a/llm/gpt-oss-finetuning/README.md +++ b/llm/gpt-oss-finetuning/README.md @@ -29,6 +29,18 @@ For more details on how to setup your cloud credentials see [SkyPilot docs](http sky check ``` +### Configure checkpoint storage (Optional) + +Checkpoint storage is optional and only needed if you want to resume training from interruptions. By default, checkpoints are saved locally on the cluster. + +To enable checkpoint persistence across cluster restarts, uncomment and configure the S3 bucket in the YAML files: + +```yaml +file_mounts: + /checkpoints: + source: s3://my-skypilot-bucket # change this to your bucket +``` + ## Step 1: Run gpt-oss models ### Full finetuning @@ -53,6 +65,13 @@ resources: file_mounts: /sft: ./sft + /checkpoints: + source: s3://my-skypilot-bucket # change this to your bucket + +envs: + WANDB_PROJECT: gpt-oss-120b-sft + WANDB_RESUME: allow + WANDB_API_KEY: "" # optionally, enable WandB tracking by providing the API key num_nodes: 4 @@ -64,10 +83,13 @@ setup: | uv pip install "trl>=0.20.0" "peft>=0.17.0" "transformers>=4.55.0" uv pip install deepspeed uv pip install git+https://github.com/huggingface/accelerate.git@c0a3aefea8aa5008a0fbf55b049bd3f0efa9cbf2 + uv pip install wandb uv pip install nvitop run: | + export WANDB_RUN_ID=$SKYPILOT_TASK_ID + export WANDB_NAME=run-$SKYPILOT_TASK_ID source ~/training/bin/activate MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1) @@ -80,7 +102,7 @@ run: | --machine_rank $SKYPILOT_NODE_RANK \ --main_process_ip $MASTER_ADDR \ --main_process_port 29500 \ - /sft/train.py --model_id openai/gpt-oss-120b + /sft/train.py --model_id openai/gpt-oss-120b --resume_from_checkpoint ``` ### LoRA finetuning @@ -112,6 +134,17 @@ sky logs sky down ``` +### Optional: WandB tracking + +To enable experiment tracking with Weights & Biases, set your API key in the YAML configuration: + +```yaml +envs: + WANDB_API_KEY: "your-wandb-api-key" +``` + +Each training run will automatically use a unique run ID based on the SkyPilot task ID for easy tracking and resuming. 
+ ### Example full finetuning progress Here's what you can expect to see during training - the loss should decrease and token accuracy should improve over time: diff --git a/llm/gpt-oss-finetuning/gpt-oss-120b-lora.yaml b/llm/gpt-oss-finetuning/gpt-oss-120b-lora.yaml index 92c82e25e80..c67b6d1dbdb 100644 --- a/llm/gpt-oss-finetuning/gpt-oss-120b-lora.yaml +++ b/llm/gpt-oss-finetuning/gpt-oss-120b-lora.yaml @@ -1,9 +1,18 @@ resources: accelerators: H100:8 + disk_size: 1024 network_tier: best file_mounts: /sft: ./sft + # Uncomment to enable checkpoint persistence across cluster restarts by saving them to S3 + # /checkpoints: + # source: s3://my-skypilot-bucket # change this to your bucket + +envs: + WANDB_PROJECT: gpt-oss-120b-lora + WANDB_RESUME: allow + WANDB_API_KEY: "" # optionally, enable WandB tracking by providing the API key num_nodes: 1 @@ -15,15 +24,18 @@ setup: | uv pip install "trl>=0.20.0" "peft>=0.17.0" "transformers>=4.55.0" uv pip install deepspeed uv pip install git+https://github.com/huggingface/accelerate.git@c0a3aefea8aa5008a0fbf55b049bd3f0efa9cbf2 + uv pip install wandb uv pip install nvitop run: | + export WANDB_RUN_ID=$SKYPILOT_TASK_ID + export WANDB_NAME=run-$SKYPILOT_TASK_ID source ~/training/bin/activate MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1) NP=$(($SKYPILOT_NUM_GPUS_PER_NODE * $SKYPILOT_NUM_NODES)) - python /sft/train.py --model_id openai/gpt-oss-120b --enable_lora + python /sft/train.py --model_id openai/gpt-oss-120b --enable_lora --resume_from_checkpoint \ No newline at end of file diff --git a/llm/gpt-oss-finetuning/gpt-oss-120b-sft.yaml b/llm/gpt-oss-finetuning/gpt-oss-120b-sft.yaml index 51dbaad2f59..bdaf114b616 100644 --- a/llm/gpt-oss-finetuning/gpt-oss-120b-sft.yaml +++ b/llm/gpt-oss-finetuning/gpt-oss-120b-sft.yaml @@ -1,9 +1,18 @@ resources: accelerators: H200:8 + disk_size: 1024 network_tier: best file_mounts: /sft: ./sft + # Uncomment to enable checkpoint persistence across cluster restarts by saving them to S3 + # /checkpoints: + # source: s3://my-skypilot-bucket # change this to your bucket + +envs: + WANDB_PROJECT: gpt-oss-120b-sft + WANDB_RESUME: allow + WANDB_API_KEY: "" # optionally, enable WandB tracking by providing the API key num_nodes: 4 @@ -15,14 +24,17 @@ setup: | uv pip install "trl>=0.20.0" "peft>=0.17.0" "transformers>=4.55.0" uv pip install deepspeed uv pip install git+https://github.com/huggingface/accelerate.git@c0a3aefea8aa5008a0fbf55b049bd3f0efa9cbf2 + uv pip install wandb uv pip install nvitop run: | + export WANDB_RUN_ID=$SKYPILOT_TASK_ID + export WANDB_NAME=run-$SKYPILOT_TASK_ID source ~/training/bin/activate MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1) NP=$(($SKYPILOT_NUM_GPUS_PER_NODE * $SKYPILOT_NUM_NODES)) - accelerate launch --config_file /sft/fsdp2_120b.yaml --num_machines $SKYPILOT_NUM_NODES --num_processes $NP --machine_rank $SKYPILOT_NODE_RANK --main_process_ip $MASTER_ADDR --main_process_port 29500 /sft/train.py --model_id openai/gpt-oss-120b + accelerate launch --config_file /sft/fsdp2_120b.yaml --num_machines $SKYPILOT_NUM_NODES --num_processes $NP --machine_rank $SKYPILOT_NODE_RANK --main_process_ip $MASTER_ADDR --main_process_port 29500 /sft/train.py --model_id openai/gpt-oss-120b --resume_from_checkpoint diff --git a/llm/gpt-oss-finetuning/gpt-oss-20b-lora.yaml b/llm/gpt-oss-finetuning/gpt-oss-20b-lora.yaml index b497add7a83..23447ee93f0 100644 --- a/llm/gpt-oss-finetuning/gpt-oss-20b-lora.yaml +++ b/llm/gpt-oss-finetuning/gpt-oss-20b-lora.yaml @@ -1,9 +1,18 @@ resources: accelerators: H100:2 + 
disk_size: 512 network_tier: best file_mounts: /sft: ./sft + # Uncomment to enable checkpoint persistence across cluster restarts by saving them to S3 + # /checkpoints: + # source: s3://my-skypilot-bucket # change this to your bucket + +envs: + WANDB_PROJECT: gpt-oss-20b-lora + WANDB_RESUME: allow + WANDB_API_KEY: "" # optionally, enable WandB tracking by providing the API key num_nodes: 1 @@ -15,15 +24,18 @@ setup: | uv pip install "trl>=0.20.0" "peft>=0.17.0" "transformers>=4.55.0" uv pip install deepspeed uv pip install git+https://github.com/huggingface/accelerate.git@c0a3aefea8aa5008a0fbf55b049bd3f0efa9cbf2 + uv pip install wandb uv pip install nvitop run: | + export WANDB_RUN_ID=$SKYPILOT_TASK_ID + export WANDB_NAME=run-$SKYPILOT_TASK_ID source ~/training/bin/activate MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1) NP=$(($SKYPILOT_NUM_GPUS_PER_NODE * $SKYPILOT_NUM_NODES)) - python /sft/train.py --model_id openai/gpt-oss-20b --enable_lora + python /sft/train.py --model_id openai/gpt-oss-20b --enable_lora --resume_from_checkpoint \ No newline at end of file diff --git a/llm/gpt-oss-finetuning/gpt-oss-20b-sft.yaml b/llm/gpt-oss-finetuning/gpt-oss-20b-sft.yaml index d326c1db21a..ac154203a55 100644 --- a/llm/gpt-oss-finetuning/gpt-oss-20b-sft.yaml +++ b/llm/gpt-oss-finetuning/gpt-oss-20b-sft.yaml @@ -1,9 +1,20 @@ +name: gpt-oss-20b-sft-finetuning + resources: accelerators: H100:8 + disk_size: 512 network_tier: best file_mounts: /sft: ./sft + # Uncomment to enable checkpoint persistence across cluster restarts by saving them to S3 + # /checkpoints: + # source: s3://my-skypilot-bucket # change this to your bucket + +envs: + WANDB_PROJECT: gpt-oss-20b-sft + WANDB_RESUME: allow + WANDB_API_KEY: "" # optionally, enable WandB tracking by providing the API key num_nodes: 1 @@ -15,15 +26,16 @@ setup: | uv pip install "trl>=0.20.0" "peft>=0.17.0" "transformers>=4.55.0" uv pip install deepspeed uv pip install git+https://github.com/huggingface/accelerate.git@c0a3aefea8aa5008a0fbf55b049bd3f0efa9cbf2 + uv pip install wandb uv pip install nvitop run: | + export WANDB_RUN_ID=$SKYPILOT_TASK_ID + export WANDB_NAME=run-$SKYPILOT_TASK_ID source ~/training/bin/activate MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1) NP=$(($SKYPILOT_NUM_GPUS_PER_NODE * $SKYPILOT_NUM_NODES)) - accelerate launch --config_file /sft/fsdp2.yaml --num_machines $SKYPILOT_NUM_NODES --num_processes $NP --machine_rank $SKYPILOT_NODE_RANK --main_process_ip $MASTER_ADDR --main_process_port 29500 /sft/train.py --model_id openai/gpt-oss-20b - - \ No newline at end of file + accelerate launch --config_file /sft/fsdp2.yaml --num_machines $SKYPILOT_NUM_NODES --num_processes $NP --machine_rank $SKYPILOT_NODE_RANK --main_process_ip $MASTER_ADDR --main_process_port 29500 /sft/train.py --model_id openai/gpt-oss-20b --resume_from_checkpoint diff --git a/llm/gpt-oss-finetuning/sft/train.py b/llm/gpt-oss-finetuning/sft/train.py index 9f5f744e88b..abe21a3875f 100644 --- a/llm/gpt-oss-finetuning/sft/train.py +++ b/llm/gpt-oss-finetuning/sft/train.py @@ -25,6 +25,29 @@ def training_step(self, *args, **kwargs): self.accelerator_profiler.step() return result + def train(self, resume_from_checkpoint=None, *args, **kwargs): + if resume_from_checkpoint or (self.args.resume_from_checkpoint and + os.path.exists(self.args.output_dir)): + checkpoint_path = resume_from_checkpoint + if not checkpoint_path: + # Find the latest checkpoint + checkpoint_dirs = [ + d for d in os.listdir(self.args.output_dir) + if d.startswith("checkpoint-") and + 
os.path.isdir(os.path.join(self.args.output_dir, d))
+            ]
+            if checkpoint_dirs:
+                checkpoint_dirs.sort(key=lambda x: int(x.split("-")[1]))
+                checkpoint_path = os.path.join(self.args.output_dir,
+                                               checkpoint_dirs[-1])
+
+            if checkpoint_path:
+                print(f"Resuming from checkpoint: {checkpoint_path}")
+                resume_from_checkpoint = checkpoint_path
+
+        return super().train(resume_from_checkpoint=resume_from_checkpoint,
+                             *args,
+                             **kwargs)
+
 
 def main():
     # Parse command line arguments
@@ -53,6 +76,16 @@ def main():
         type=int,
         default=1,
         help="Training batch size per device (default: 1)")
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        action="store_true",
+        default=False,
+        help="Enable resuming from the latest checkpoint (default: False)")
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="/checkpoints",
+        help="Directory to save checkpoints (default: /checkpoints)")
     args = parser.parse_args()
 
     # Setup profiling if enabled
@@ -121,9 +154,15 @@ def trace_handler(p):
         model = get_peft_model(model, peft_config)
         model.print_trainable_parameters()
 
+    report_to = "wandb" if os.environ.get("WANDB_API_KEY") else "none"
+
+    # Set up the output directory for checkpoints
+    output_dir = os.path.join(args.output_dir, model_id.replace('/', '-'))
+    os.makedirs(output_dir, exist_ok=True)
+
     # Train model
     training_args = SFTConfig(
-        output_dir=f"{model_id}-checkpoint",
+        output_dir=output_dir,
         learning_rate=2e-4,
         num_train_epochs=1,
         logging_steps=1,
@@ -134,6 +173,13 @@ def trace_handler(p):
         lr_scheduler_type="cosine_with_min_lr",
         lr_scheduler_kwargs={"min_lr_rate": 0.1},
         dataset_num_proc=num_proc,
+        # Disable gradient_checkpointing: FSDP activation_checkpointing is used instead
+        gradient_checkpointing=False,
+        report_to=report_to,
+        save_strategy="steps",
+        save_steps=100,
+        save_total_limit=3,
+        resume_from_checkpoint=args.resume_from_checkpoint,
     )
 
     # Train model with optional profiling
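
For readers skimming the diff, the resume behaviour added to `train.py` boils down to the checkpoint-discovery step sketched below. This is a standalone illustration, not part of the PR: the helper name `find_latest_checkpoint` and the example path are made up for clarity. It scans the output directory for `checkpoint-<step>` subdirectories, picks the highest step, and returns `None` when no checkpoint exists yet, in which case training starts from scratch.

```python
import os


def find_latest_checkpoint(output_dir: str):
    """Return the newest checkpoint-<step> directory under output_dir, or None."""
    if not os.path.isdir(output_dir):
        return None
    checkpoint_dirs = [
        d for d in os.listdir(output_dir)
        if d.startswith("checkpoint-") and
        os.path.isdir(os.path.join(output_dir, d))
    ]
    if not checkpoint_dirs:
        return None
    # Sort numerically by the trailing step count, so checkpoint-900 < checkpoint-1000
    checkpoint_dirs.sort(key=lambda d: int(d.split("-")[1]))
    return os.path.join(output_dir, checkpoint_dirs[-1])


# Example (hypothetical path): the 20b run writes to /checkpoints/openai-gpt-oss-20b
latest = find_latest_checkpoint("/checkpoints/openai-gpt-oss-20b")
print(latest or "no checkpoint found; training starts from scratch")
```

WandB reporting stays off unless `WANDB_API_KEY` is non-empty; besides editing the YAML, the key can typically be supplied at launch time with something like `sky launch gpt-oss-20b-sft.yaml --env WANDB_API_KEY=...`.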