#!/usr/bin/env python3
"""
NeMo Fine-tuning with Lepton Executor

This comprehensive example demonstrates how to use the LeptonExecutor for distributed
NeMo model fine-tuning with various advanced features.

Prerequisites:
- Lepton workspace with proper node groups and GPU resources
- Secrets configured in your Lepton workspace (optional but recommended)
- Shared storage accessible to your Lepton cluster
- NeMo container image available

This example serves as a template for production ML workflows on Lepton clusters.
"""

from nemo.collections import llm
import nemo_run as run
from nemo import lightning as nl


def nemo_lepton_executor(nodes: int, devices: int, container_image: str):
    """
    Create a LeptonExecutor with secret handling capabilities.

    Args:
        nodes: Number of nodes for distributed training
        devices: Number of GPUs per node
        container_image: Docker container image to use

    Returns:
        Configured LeptonExecutor with secret support
    """
    return run.LeptonExecutor(
        # Required parameters
        container_image=container_image,
        nemo_run_dir="/nemo-workspace",  # Directory for NeMo Run files on remote storage
        # Lepton compute configuration
        nodes=nodes,
        gpus_per_node=devices,
        nprocs_per_node=devices,  # Processes per node (usually equals gpus_per_node)
        # Lepton workspace configuration - REQUIRED for actual usage
        resource_shape="gpu.1xh200",  # GPU type/count per replica - adjust as needed
        node_group="your-node-group-name",  # Must already exist in your workspace
        # Remote storage mounts
        mounts=[
            {
                "from": "node-nfs:your-shared-storage",
                "path": "/path/to/your/remote/storage",  # Remote storage path
                "mount_path": "/nemo-workspace",  # Mount path inside the container
            }
        ],
        # Environment variables - SECURE SECRET HANDLING
        env_vars={
            # SECRET REFERENCES (recommended for sensitive data)
            # These reference secrets stored securely in your Lepton workspace
            "HF_TOKEN": {"value_from": {"secret_name_ref": "HUGGING_FACE_HUB_TOKEN_read"}},
            "WANDB_API_KEY": {
                "value_from": {"secret_name_ref": "WANDB_API_KEY_secret"}
            },  # Optional
            # 📋 REGULAR ENVIRONMENT VARIABLES
            # Non-sensitive configuration can be set directly
            "NCCL_DEBUG": "INFO",
            "TORCH_DISTRIBUTED_DEBUG": "INFO",
            "CUDA_LAUNCH_BLOCKING": "1",
            "TOKENIZERS_PARALLELISM": "false",
        },
        # Shared memory size for inter-process communication
        shared_memory_size=65536,
        # Custom commands to run before launching the training
        pre_launch_commands=[
            "echo '🚀 Starting NeMo fine-tuning with Lepton secrets...'",
            "nvidia-smi",
            "df -h",
            "python3 -m pip install 'datasets>=4.0.0'",
            "python3 -m pip install 'transformers>=4.40.0'",
        ],
    )
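

# The executor is swappable. For a quick smoke test on a local GPU machine
# (no Lepton cluster involved), NeMo-Run's LocalExecutor with the torchrun
# launcher can stand in for the LeptonExecutor. This is a minimal sketch
# following the NeMo-Run quickstart pattern, not part of the Lepton flow itself.
def local_debug_executor(devices: int):
    """Return a LocalExecutor for single-node smoke tests (sketch)."""
    return run.LocalExecutor(
        ntasks_per_node=devices,  # one process per local GPU
        launcher="torchrun",  # spawn the distributed processes via torchrun
    )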


def create_finetune_recipe(nodes: int, gpus_per_node: int):
    """
    Create a NeMo fine-tuning recipe with LoRA.

    Args:
        nodes: Number of nodes for distributed training
        gpus_per_node: Number of GPUs per node

    Returns:
        Configured NeMo recipe for fine-tuning
    """
    recipe = llm.hf_auto_model_for_causal_lm.finetune_recipe(
        model_name="meta-llama/Llama-3.2-3B",  # Model to fine-tune
        dir="/nemo-workspace/llama3.2_3b_lepton",  # Use the /nemo-workspace mount path
        name="llama3_lora_lepton",
        num_nodes=nodes,
        num_gpus_per_node=gpus_per_node,
        peft_scheme="lora",  # Parameter-Efficient Fine-Tuning with LoRA
        max_steps=100,  # Adjust based on your needs
    )

    # LoRA configuration
    recipe.peft.target_modules = ["linear_qkv", "linear_proj", "linear_fc1", "*_proj"]
    recipe.peft.dim = 16  # LoRA rank
    recipe.peft.alpha = 32  # LoRA scaling factor (updates scaled by alpha / dim)

    # Strategy configuration for distributed training
    if nodes == 1:
        recipe.trainer.strategy = "auto"  # Let Lightning choose the best strategy
    else:
        recipe.trainer.strategy = run.Config(
            nl.FSDP2Strategy, data_parallel_size=nodes * gpus_per_node, tensor_parallel_size=1
        )

    return recipe
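

# The recipe returned above is a regular NeMo-Run config object, so its fields
# can be overridden after creation and before the experiment launches. A hedged
# sketch (recipe.peft.dim is set in this file; treat recipe.trainer.max_steps as
# an assumption and inspect the recipe, e.g. print(recipe), before relying on it):
#
#   recipe = create_finetune_recipe(nodes=1, gpus_per_node=1)
#   recipe.peft.dim = 8               # smaller LoRA rank than the default above
#   recipe.trainer.max_steps = 500    # assumed field: train longer than 100 steps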


if __name__ == "__main__":
    # Configuration
    nodes = 1  # Start with single node for testing
    gpus_per_node = 1

    # Create the fine-tuning recipe
    recipe = create_finetune_recipe(nodes, gpus_per_node)

    # Create the executor with secret handling
    executor = nemo_lepton_executor(
        nodes=nodes,
        devices=gpus_per_node,
        container_image="nvcr.io/nvidia/nemo:25.04",  # Use an appropriate NeMo container
    )

    # Optional: Check executor capabilities
    print("🔍 Executor Information:")
    print(f"📋 Nodes: {executor.nnodes()}")
    print(f"📋 Processes per node: {executor.nproc_per_node()}")

    # Check macro support
    macro_values = executor.macro_values()
    print(f"📋 Macro values support: {macro_values is not None}")

    try:
        # Create and run the experiment
        with run.Experiment(
            "lepton-nemo-secrets-demo", executor=executor, log_level="DEBUG"
        ) as exp:
            # Add the fine-tuning task
            task_id = exp.add(recipe, tail_logs=True, name="llama3_lora_with_secrets")

            # Execute the experiment
            print("Starting fine-tuning experiment with secure secret handling...")
            exp.run(detach=False, tail_logs=True, sequential=True)

        print("Experiment completed successfully!")

    except Exception as e:
        print(f"\nError occurred: {type(e).__name__}")
        print(f"Message: {str(e)}")
        print("\n💡 Common issues to check:")
        print("   - Ensure your Lepton workspace has the required secrets configured")
        print("   - Verify node_group and resource_shape match your workspace")
        print("   - Check that mount paths are correct and accessible")
        print("   - Confirm container image is available and compatible")
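
    # For long-running jobs you may prefer to submit without blocking on logs.
    # A hedged alternative (inspection helpers vary by nemo_run version; check
    # the docs):
    #   exp.run(detach=True)   # submit and return immediately instead of tailing
    # and inspect progress later, e.g. with exp.status().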