 # This config is only tested on an 8xA100 machine.
 #
 
+output_dir: /tmp/torchtune/llama3_3_70B/full # /tmp may be deleted by your system. Change it to your preference.
+
 # Tokenizer
 tokenizer:
   _component_: torchtune.models.llama3.llama3_tokenizer
@@ -69,7 +71,7 @@ checkpointer:
     model-00030-of-00030.safetensors,
   ]
   recipe_checkpoint: null
-  output_dir: /tmp/Llama-3.3-70B-Instruct/
+  output_dir: ${output_dir}
   model_type: LLAMA3
 resume_from_checkpoint: False
 
@@ -87,7 +89,7 @@ optimizer:
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
-gradient_accumulation_steps: 1  # Use to increase virtual batch size
+gradient_accumulation_steps: 1  # Use to increase effective batch size
 
 
 # Training env
@@ -98,7 +100,7 @@ enable_activation_checkpointing: True # True reduces memory
 enable_activation_offloading: False  # True reduces memory
 custom_sharded_layers: ['tok_embeddings', 'output']  # Layers to shard separately (useful for large vocab size models). Lower Memory, but lower speed.
 fsdp_cpu_offload: True
-compile: False  # pytorch compile, set to true for better perf/memory
+compile: False  # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False  # True saves memory. Requires gradient_accumulation_steps=1
 
 # Reduced precision
@@ -107,8 +109,7 @@ dtype: bf16
 # Logging
 metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
-  log_dir: ${output_dir}
-output_dir: /tmp/full-llama3_3-finetune
+  log_dir: ${output_dir}/logs
 log_every_n_steps: 1
 log_peak_memory_stats: True
 
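With output_dir declared once at the top of the config and referenced via ${output_dir} in the checkpointer and metric logger, a single command-line override now redirects checkpoints and logs together. A minimal sketch, assuming torchtune's standard key=value overrides and the launch command documented in this config's header (the target path is only an example):

  tune run --nproc_per_node 8 full_finetune_distributed --config llama3_3/70B_full output_dir=/data/llama3_3_70B/full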