35 changes: 34 additions & 1 deletion llm/gpt-oss-finetuning/README.md
@@ -29,6 +29,18 @@ For more details on how to setup your cloud credentials see [SkyPilot docs](http
sky check
```

### Configure checkpoint storage (Optional)

Checkpoint storage is optional and only needed if you want to resume training from interruptions. By default, checkpoints are saved locally on the cluster.

To enable checkpoint persistence across cluster restarts, uncomment and configure the S3 bucket in the YAML files:

```yaml
file_mounts:
/checkpoints:
source: s3://my-skypilot-bucket # change this to your bucket
```
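
With a bucket mounted at `/checkpoints`, re-launching the same task after an interruption picks up from the latest saved checkpoint, because the training script is started with `--resume_from_checkpoint`. A minimal sketch (the cluster name and YAML file are placeholders for whichever config you use):

```bash
# First launch, or re-launch after an interruption; training resumes from the
# most recent checkpoint found under /checkpoints, if any.
sky launch -c gpt-oss-sft gpt-oss-120b-sft.yaml
```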

## Step 1: Run gpt-oss models

### Full finetuning
@@ -53,6 +65,13 @@ resources:

file_mounts:
/sft: ./sft
/checkpoints:
source: s3://my-skypilot-bucket # change this to your bucket

envs:
WANDB_PROJECT: gpt-oss-120b-sft
WANDB_RESUME: allow
WANDB_API_KEY: "" # optionally, enable WandB tracking by providing the API key

num_nodes: 4

@@ -64,10 +83,13 @@ setup: |
uv pip install "trl>=0.20.0" "peft>=0.17.0" "transformers>=4.55.0"
uv pip install deepspeed
uv pip install git+https://github.com/huggingface/accelerate.git@c0a3aefea8aa5008a0fbf55b049bd3f0efa9cbf2
uv pip install wandb

uv pip install nvitop

run: |
export WANDB_RUN_ID=$SKYPILOT_TASK_ID
export WANDB_NAME=run-$SKYPILOT_TASK_ID
source ~/training/bin/activate

MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
Expand All @@ -80,7 +102,7 @@ run: |
--machine_rank $SKYPILOT_NODE_RANK \
--main_process_ip $MASTER_ADDR \
--main_process_port 29500 \
/sft/train.py --model_id openai/gpt-oss-120b
/sft/train.py --model_id openai/gpt-oss-120b --resume_from_checkpoint
```

### LoRA finetuning
@@ -112,6 +134,17 @@ sky logs <cluster-name>
sky down <cluster-name>
```

### Optional: WandB tracking

To enable experiment tracking with Weights & Biases, set your API key in the YAML configuration:

```yaml
envs:
WANDB_API_KEY: "your-wandb-api-key"
```

Each training run will automatically use a unique run ID based on the SkyPilot task ID for easy tracking and resuming.
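
For reference, the `run` sections of the YAMLs in this PR derive the WandB run identity from SkyPilot's task ID:

```bash
export WANDB_RUN_ID=$SKYPILOT_TASK_ID
export WANDB_NAME=run-$SKYPILOT_TASK_ID
```

Combined with `WANDB_RESUME: allow`, re-running with the same run ID reports to the existing WandB run rather than creating a new one.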

### Example full finetuning progress

Here's what you can expect to see during training - the loss should decrease and token accuracy should improve over time:
14 changes: 13 additions & 1 deletion llm/gpt-oss-finetuning/gpt-oss-120b-lora.yaml
@@ -1,9 +1,18 @@
resources:
accelerators: H100:8
disk_size: 1024
network_tier: best

file_mounts:
/sft: ./sft
# Uncomment to enable checkpoint persistence across cluster restarts by saving them to S3
# /checkpoints:
# source: s3://my-skypilot-bucket # change this to your bucket

envs:
WANDB_PROJECT: gpt-oss-120b-lora
WANDB_RESUME: allow
WANDB_API_KEY: "" # optionally, enable WandB tracking by providing the API key

num_nodes: 1

@@ -15,15 +24,18 @@ setup: |
uv pip install "trl>=0.20.0" "peft>=0.17.0" "transformers>=4.55.0"
uv pip install deepspeed
uv pip install git+https://github.com/huggingface/accelerate.git@c0a3aefea8aa5008a0fbf55b049bd3f0efa9cbf2
uv pip install wandb

uv pip install nvitop

run: |
export WANDB_RUN_ID=$SKYPILOT_TASK_ID
export WANDB_NAME=run-$SKYPILOT_TASK_ID
source ~/training/bin/activate

MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
NP=$(($SKYPILOT_NUM_GPUS_PER_NODE * $SKYPILOT_NUM_NODES))

python /sft/train.py --model_id openai/gpt-oss-120b --enable_lora
python /sft/train.py --model_id openai/gpt-oss-120b --enable_lora --resume_from_checkpoint
Collaborator: just to confirm - if the bucket file_mounts above is not specified, this line still works, right?

Contributor (author): Yes. If there's no bucket mounted to /checkpoints then this folder will be created on a local disk: https://github.com/alex000kim/skypilot/blob/6c988f4f63d4573e4fe9f02753025a0d4fee9333/llm/gpt-oss-finetuning/sft/train.py#L161
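
For illustration, the relevant lines in `sft/train.py` (also part of this diff) create the directory regardless of whether a bucket is mounted at `/checkpoints`:

```python
# /checkpoints is just a path: if no bucket is mounted there, the directory
# is created on the cluster's local disk.
output_dir = os.path.join(args.output_dir, model_id.replace('/', '-'))
os.makedirs(output_dir, exist_ok=True)
```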



14 changes: 13 additions & 1 deletion llm/gpt-oss-finetuning/gpt-oss-120b-sft.yaml
@@ -1,9 +1,18 @@
resources:
accelerators: H200:8
disk_size: 1024
network_tier: best

file_mounts:
/sft: ./sft
# Uncomment to enable checkpoint persistence across cluster restarts by saving them to S3
# /checkpoints:
# source: s3://my-skypilot-bucket # change this to your bucket

envs:
WANDB_PROJECT: gpt-oss-120b-sft
WANDB_RESUME: allow
WANDB_API_KEY: "" # optionally, enable WandB tracking by providing the API key

num_nodes: 4

@@ -15,14 +24,17 @@ setup: |
uv pip install "trl>=0.20.0" "peft>=0.17.0" "transformers>=4.55.0"
uv pip install deepspeed
uv pip install git+https://github.com/huggingface/accelerate.git@c0a3aefea8aa5008a0fbf55b049bd3f0efa9cbf2
uv pip install wandb

uv pip install nvitop

run: |
export WANDB_RUN_ID=$SKYPILOT_TASK_ID
export WANDB_NAME=run-$SKYPILOT_TASK_ID
source ~/training/bin/activate

MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
NP=$(($SKYPILOT_NUM_GPUS_PER_NODE * $SKYPILOT_NUM_NODES))

accelerate launch --config_file /sft/fsdp2_120b.yaml --num_machines $SKYPILOT_NUM_NODES --num_processes $NP --machine_rank $SKYPILOT_NODE_RANK --main_process_ip $MASTER_ADDR --main_process_port 29500 /sft/train.py --model_id openai/gpt-oss-120b
accelerate launch --config_file /sft/fsdp2_120b.yaml --num_machines $SKYPILOT_NUM_NODES --num_processes $NP --machine_rank $SKYPILOT_NODE_RANK --main_process_ip $MASTER_ADDR --main_process_port 29500 /sft/train.py --model_id openai/gpt-oss-120b --resume_from_checkpoint

14 changes: 13 additions & 1 deletion llm/gpt-oss-finetuning/gpt-oss-20b-lora.yaml
@@ -1,9 +1,18 @@
resources:
accelerators: H100:2
disk_size: 512
network_tier: best

file_mounts:
/sft: ./sft
# Uncomment to enable checkpoint persistence across cluster restarts by saving them to S3
# /checkpoints:
# source: s3://my-skypilot-bucket # change this to your bucket

envs:
WANDB_PROJECT: gpt-oss-20b-lora
WANDB_RESUME: allow
WANDB_API_KEY: "" # optionally, enable WandB tracking by providing the API key

num_nodes: 1

@@ -15,15 +24,18 @@ setup: |
uv pip install "trl>=0.20.0" "peft>=0.17.0" "transformers>=4.55.0"
uv pip install deepspeed
uv pip install git+https://github.com/huggingface/accelerate.git@c0a3aefea8aa5008a0fbf55b049bd3f0efa9cbf2
uv pip install wandb

uv pip install nvitop

run: |
export WANDB_RUN_ID=$SKYPILOT_TASK_ID
export WANDB_NAME=run-$SKYPILOT_TASK_ID
source ~/training/bin/activate

MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
NP=$(($SKYPILOT_NUM_GPUS_PER_NODE * $SKYPILOT_NUM_NODES))

python /sft/train.py --model_id openai/gpt-oss-20b --enable_lora
python /sft/train.py --model_id openai/gpt-oss-20b --enable_lora --resume_from_checkpoint


18 changes: 15 additions & 3 deletions llm/gpt-oss-finetuning/gpt-oss-20b-sft.yaml
@@ -1,9 +1,20 @@
name: gpt-oss-20b-sft-finetuning

resources:
accelerators: H100:8
disk_size: 512
network_tier: best

file_mounts:
/sft: ./sft
# Uncomment to enable checkpoint persistence across cluster restarts by saving them to S3
# /checkpoints:
# source: s3://my-skypilot-bucket # change this to your bucket

envs:
WANDB_PROJECT: gpt-oss-20b-sft
WANDB_RESUME: allow
WANDB_API_KEY: "" # optionally, enable WandB tracking by providing the API key

num_nodes: 1

@@ -15,15 +26,16 @@ setup: |
uv pip install "trl>=0.20.0" "peft>=0.17.0" "transformers>=4.55.0"
uv pip install deepspeed
uv pip install git+https://github.com/huggingface/accelerate.git@c0a3aefea8aa5008a0fbf55b049bd3f0efa9cbf2
uv pip install wandb

uv pip install nvitop

run: |
export WANDB_RUN_ID=$SKYPILOT_TASK_ID
export WANDB_NAME=run-$SKYPILOT_TASK_ID
source ~/training/bin/activate

MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
NP=$(($SKYPILOT_NUM_GPUS_PER_NODE * $SKYPILOT_NUM_NODES))

accelerate launch --config_file /sft/fsdp2.yaml --num_machines $SKYPILOT_NUM_NODES --num_processes $NP --machine_rank $SKYPILOT_NODE_RANK --main_process_ip $MASTER_ADDR --main_process_port 29500 /sft/train.py --model_id openai/gpt-oss-20b


accelerate launch --config_file /sft/fsdp2.yaml --num_machines $SKYPILOT_NUM_NODES --num_processes $NP --machine_rank $SKYPILOT_NODE_RANK --main_process_ip $MASTER_ADDR --main_process_port 29500 /sft/train.py --model_id openai/gpt-oss-20b --resume_from_checkpoint
48 changes: 47 additions & 1 deletion llm/gpt-oss-finetuning/sft/train.py
@@ -25,6 +25,29 @@ def training_step(self, *args, **kwargs):
self.accelerator_profiler.step()
return result

def train(self, resume_from_checkpoint=None, *args, **kwargs):
if resume_from_checkpoint or (self.args.resume_from_checkpoint and
os.path.exists(self.args.output_dir)):
checkpoint_path = resume_from_checkpoint
if not checkpoint_path:
# Find the latest checkpoint
checkpoint_dirs = [
d for d in os.listdir(self.args.output_dir)
if d.startswith("checkpoint-") and
os.path.isdir(os.path.join(self.args.output_dir, d))
]
if checkpoint_dirs:
checkpoint_dirs.sort(key=lambda x: int(x.split("-")[1]))
checkpoint_path = os.path.join(self.args.output_dir,
checkpoint_dirs[-1])

            if checkpoint_path:
                print(f"Resuming from checkpoint: {checkpoint_path}")
                # Pass the resolved path to the parent class so training
                # actually resumes from the discovered checkpoint
                resume_from_checkpoint = checkpoint_path

        return super().train(resume_from_checkpoint=resume_from_checkpoint,
                             *args,
                             **kwargs)


def main():
# Parse command line arguments
@@ -53,6 +76,16 @@ def main():
type=int,
default=1,
help="Training batch size per device (default: 1)")
parser.add_argument(
"--resume_from_checkpoint",
action="store_true",
default=False,
help="Enable resuming from the latest checkpoint (default: False)")
parser.add_argument(
"--output_dir",
type=str,
default="/checkpoints",
help="Directory to save checkpoints (default: /checkpoints)")
args = parser.parse_args()

# Setup profiling if enabled
@@ -121,9 +154,15 @@ def trace_handler(p):
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

report_to = "wandb" if os.environ.get("WANDB_API_KEY") else "none"

# Setup output directory for checkpoints
output_dir = os.path.join(args.output_dir, model_id.replace('/', '-'))
os.makedirs(output_dir, exist_ok=True)

# Train model
training_args = SFTConfig(
output_dir=f"{model_id}-checkpoint",
output_dir=output_dir,
learning_rate=2e-4,
num_train_epochs=1,
logging_steps=1,
Expand All @@ -134,6 +173,13 @@ def trace_handler(p):
lr_scheduler_type="cosine_with_min_lr",
lr_scheduler_kwargs={"min_lr_rate": 0.1},
dataset_num_proc=num_proc,
        gradient_checkpointing=False,  # Disable gradient_checkpointing as we use FSDP activation_checkpointing
report_to=report_to,
save_strategy="steps",
save_steps=100,
save_total_limit=3,
resume_from_checkpoint=args.resume_from_checkpoint,
)

# Train model with optional profiling