
Commit 26b2200

Authored by felipemello1 (Felipe Mello) and Felipe Mello
update configs (#2128)
Co-authored-by: Felipe Mello <[email protected]>
1 parent fef2c80 · commit 26b2200

7 files changed: +30 −27 lines

recipes/configs/llama3_2/8B_to_1B_KD_lora_distributed.yaml

+2 −2

@@ -23,8 +23,8 @@ model:
   lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
   apply_lora_to_mlp: True
   apply_lora_to_output: False
-  lora_rank: 64
-  lora_alpha: 128
+  lora_rank: 64 # higher increases accuracy and memory
+  lora_alpha: 128 # usually alpha=2*rank
   lora_dropout: 0.0

 teacher_model:
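A note on the comments added here: rank and alpha are normally scaled together, keeping alpha = 2 * rank. Below is a minimal sketch of a lower-memory variant of this student model block; the rank 32 / alpha 64 values and the 1B LoRA component path are illustrative assumptions, not part of this commit.

model:
  _component_: torchtune.models.llama3_2.lora_llama3_2_1b  # assumed student builder; not shown in this hunk
  lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
  apply_lora_to_mlp: True
  apply_lora_to_output: False
  lora_rank: 32   # lower rank -> fewer trainable params and less memory, possibly lower accuracy
  lora_alpha: 64  # keeps the usual alpha = 2 * rank pairing
  lora_dropout: 0.0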

recipes/configs/llama3_2/8B_to_1B_KD_lora_single_device.yaml

+2 −2

@@ -23,8 +23,8 @@ model:
   lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
   apply_lora_to_mlp: True
   apply_lora_to_output: False
-  lora_rank: 64
-  lora_alpha: 128
+  lora_rank: 64 # higher increases accuracy and memory
+  lora_alpha: 128 # usually alpha=2*rank
   lora_dropout: 0.0

 teacher_model:

recipes/configs/llama3_3/70B_full.yaml

+6 −5

@@ -16,6 +16,8 @@
 # This config is only tested on an 8xA100 machine.
 #

+output_dir: /tmp/torchtune/llama3_3_70B/full # /tmp may be deleted by your system. Change it to your preference.
+
 # Tokenizer
 tokenizer:
   _component_: torchtune.models.llama3.llama3_tokenizer

@@ -69,7 +71,7 @@ checkpointer:
     model-00030-of-00030.safetensors,
   ]
   recipe_checkpoint: null
-  output_dir: /tmp/Llama-3.3-70B-Instruct/
+  output_dir: ${output_dir}
   model_type: LLAMA3
 resume_from_checkpoint: False

@@ -87,7 +89,7 @@ optimizer:
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
-gradient_accumulation_steps: 1 # Use to increase virtual batch size
+gradient_accumulation_steps: 1 # Use to increase effective batch size


 # Training env

@@ -98,7 +100,7 @@ enable_activation_checkpointing: True # True reduces memory
 enable_activation_offloading: False # True reduces memory
 custom_sharded_layers: ['tok_embeddings', 'output'] # Layers to shard separately (useful for large vocab size models). Lower Memory, but lower speed.
 fsdp_cpu_offload: True
-compile: False # pytorch compile, set to true for better perf/memory
+compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

 # Reduced precision

@@ -107,8 +109,7 @@ dtype: bf16
 # Logging
 metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
-  log_dir: ${output_dir}
-output_dir: /tmp/full-llama3_3-finetune
+  log_dir: ${output_dir}/logs
 log_every_n_steps: 1
 log_peak_memory_stats: True
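The pattern this commit applies across the llama3_3 configs: declare a single top-level output_dir and derive every other path from it through interpolation. A sketch of how the resolved 70B_full config reads after the change; the /data path and the checkpointer component line are illustrative assumptions (the component is not shown in this diff).

output_dir: /data/torchtune/llama3_3_70B/full  # override the /tmp default with a persistent location

checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer  # assumed; the component line is outside this hunk
  output_dir: ${output_dir}                                # checkpoints land under /data/torchtune/llama3_3_70B/full

metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}/logs                              # logs land under /data/torchtune/llama3_3_70B/full/logs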

recipes/configs/llama3_3/70B_lora.yaml

+6 −5

@@ -8,6 +8,8 @@
 # This config needs 8 GPUs to run
 # tune run --nproc_per_node 8 lora_finetune_distributed --config llama3_3/70B_lora

+output_dir: /tmp/torchtune/llama3_3_70B/lora # /tmp may be deleted by your system. Change it to your preference.
+
 # Model Arguments
 model:
   _component_: torchtune.models.llama3_3.lora_llama3_3_70b

@@ -59,7 +61,7 @@ checkpointer:
     model-00030-of-00030.safetensors,
   ]
   recipe_checkpoint: null
-  output_dir: /tmp/Llama-3.3-70B-Instruct/
+  output_dir: ${output_dir}
   model_type: LLAMA3
 resume_from_checkpoint: False
 save_adapter_weights_only: True # Set to false to save the whole model + adapter merged

@@ -88,14 +90,13 @@ loss:
 # Training
 epochs: 1
 max_steps_per_epoch: null
-gradient_accumulation_steps: 1 # Use to increase virtual batch size
-compile: False # pytorch compile, set to true for better perf/memory
+gradient_accumulation_steps: 1 # Use to increase effective batch size
+compile: False # torch.compile the model + loss, True increases speed + decreases memory

 # Logging
-output_dir: /tmp/lora-llama3_3-finetune-output
 metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
-  log_dir: ${output_dir}
+  log_dir: ${output_dir}/logs
 log_every_n_steps: 1
 log_peak_memory_stats: True
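On the reworded gradient_accumulation_steps comment: the effective batch size is the per-GPU batch size times the accumulation steps times the number of data-parallel workers. A hedged sketch for the 8-GPU LoRA run; the batch_size and accumulation values here are illustrative, not the config defaults.

# effective batch size = batch_size * gradient_accumulation_steps * nproc_per_node
#                      = 2 * 8 * 8 = 128 sequences per optimizer step with the values below
batch_size: 2                   # per-GPU micro-batch (illustrative)
gradient_accumulation_steps: 8  # raises the effective batch size without increasing peak activation memory
compile: False                  # torch.compile the model + loss, True increases speed + decreases memory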

recipes/configs/llama3_3/70B_qlora.yaml

+6 −5

@@ -8,6 +8,8 @@
 # This config needs 8 GPUs to run
 # tune run --nproc_per_node 8 lora_finetune_distributed --config llama3_3/70B_lora

+output_dir: /tmp/torchtune/llama3_3_70B/qlora # /tmp may be deleted by your system. Change it to your preference.
+
 # Model Arguments
 model:
   _component_: torchtune.models.llama3_3.qlora_llama3_3_70b

@@ -59,7 +61,7 @@ checkpointer:
     model-00030-of-00030.safetensors,
   ]
   recipe_checkpoint: null
-  output_dir: /tmp/Llama-3.3-70B-Instruct/
+  output_dir: ${output_dir}
   model_type: LLAMA3
 resume_from_checkpoint: False
 save_adapter_weights_only: True # Set to false to save the whole model + adapter merged

@@ -88,14 +90,13 @@ loss:
 # Training
 epochs: 1
 max_steps_per_epoch: null
-gradient_accumulation_steps: 1 # Use to increase virtual batch size
-compile: False # pytorch compile, set to true for better perf/memory
+gradient_accumulation_steps: 1 # Use to increase effective batch size
+compile: False # torch.compile the model + loss, True increases speed + decreases memory

 # Logging
-output_dir: /tmp/lora-llama3_3-finetune-output
 metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
-  log_dir: ${output_dir}
+  log_dir: ${output_dir}/logs
 log_every_n_steps: 1
 log_peak_memory_stats: True
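The DiskLogger + ${output_dir}/logs pattern is now identical across 70B_full, 70B_lora and 70B_qlora. If a hosted backend is preferred, only the metric_logger block needs to change; a sketch assuming torchtune's WandBLogger component and an illustrative project name (neither is part of this commit).

metric_logger:
  _component_: torchtune.training.metric_logging.WandBLogger  # assumed alternative to DiskLogger
  project: llama3_3-qlora                                     # illustrative project name
  log_dir: ${output_dir}/logs                                 # keep logs co-located with the checkpoints
log_every_n_steps: 1
log_peak_memory_stats: True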

recipes/configs/qwen2/1.5_to_0.5B_KD_lora_distributed.yaml

+4 −4

@@ -20,10 +20,10 @@ output_dir: /tmp/torchtune/qwen2_1_5_to_0_5B/KD_lora_distributed # /tmp may be d
 # Model Arguments
 model:
   _component_: torchtune.models.qwen2.lora_qwen2_0_5b
-  lora_attn_modules: ['q_proj', 'k_proj', 'v_proj']
-  apply_lora_to_mlp: False
-  lora_rank: 32
-  lora_alpha: 64
+  lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
+  apply_lora_to_mlp: True
+  lora_rank: 32 # higher increases accuracy and memory
+  lora_alpha: 64 # usually alpha=2*rank

 teacher_model:
   _component_: torchtune.models.qwen2.qwen2_1_5b
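For context on the qwen2 KD change: the student is the LoRA-wrapped 0.5B model, now with output_proj and MLP adapters, while the teacher remains the full 1.5B model. A consolidated sketch of the student/teacher pairing as the config reads after this commit (values taken from the diff above):

model:  # student: LoRA-adapted Qwen2 0.5B
  _component_: torchtune.models.qwen2.lora_qwen2_0_5b
  lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
  apply_lora_to_mlp: True
  lora_rank: 32   # higher increases accuracy and memory
  lora_alpha: 64  # usually alpha=2*rank

teacher_model:  # teacher: full (non-LoRA) Qwen2 1.5B
  _component_: torchtune.models.qwen2.qwen2_1_5b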

recipes/configs/qwen2/1.5_to_0.5B_KD_lora_single_device.yaml

+4 −4

@@ -20,10 +20,10 @@ output_dir: /tmp/torchtune/qwen2_1_5_to_0_5B/KD_lora_single_device # /tmp may be
 # Model Arguments
 model:
   _component_: torchtune.models.qwen2.lora_qwen2_0_5b
-  lora_attn_modules: ['q_proj', 'k_proj', 'v_proj']
-  apply_lora_to_mlp: False
-  lora_rank: 32
-  lora_alpha: 64
+  lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
+  apply_lora_to_mlp: True
+  lora_rank: 32 # higher increases accuracy and memory
+  lora_alpha: 64 # usually alpha=2*rank

 teacher_model:
   _component_: torchtune.models.qwen2.qwen2_1_5b
