 # This config is only tested on an 8xA100 machine.
 #
 
+output_dir: /tmp/torchtune/llama3_3_70B/full # /tmp may be deleted by your system. Change it to your preference.
+
 # Tokenizer
 tokenizer:
   _component_: torchtune.models.llama3.llama3_tokenizer
@@ -69,7 +71,7 @@ checkpointer:
     model-00030-of-00030.safetensors,
   ]
   recipe_checkpoint: null
-  output_dir: /tmp/Llama-3.3-70B-Instruct/
+  output_dir: ${output_dir}
   model_type: LLAMA3
 resume_from_checkpoint: False
 
@@ -87,7 +89,7 @@ optimizer:
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
-gradient_accumulation_steps: 1  # Use to increase virtual batch size
+gradient_accumulation_steps: 1  # Use to increase effective batch size
 
 
 # Training env
@@ -98,7 +100,7 @@ enable_activation_checkpointing: True # True reduces memory
 enable_activation_offloading: False  # True reduces memory
 custom_sharded_layers: ['tok_embeddings', 'output']  # Layers to shard separately (useful for large vocab size models). Lower Memory, but lower speed.
 fsdp_cpu_offload: True
-compile: False  # pytorch compile, set to true for better perf/memory
+compile: False  # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False  # True saves memory. Requires gradient_accumulation_steps=1
 
 # Reduced precision
@@ -107,8 +109,7 @@ dtype: bf16
 # Logging
 metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
-  log_dir: ${output_dir}
-output_dir: /tmp/full-llama3_3-finetune
+  log_dir: ${output_dir}/logs
 log_every_n_steps: 1
 log_peak_memory_stats: True
 
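With output_dir declared once at the top of the config and referenced via ${output_dir} in the checkpointer and metric logger, a single command-line override now redirects checkpoints and logs together. A minimal sketch, assuming torchtune's standard key=value overrides and the launch command documented in this config's header (the target path is only an example):

  tune run --nproc_per_node 8 full_finetune_distributed --config llama3_3/70B_full output_dir=/data/llama3_3_70B/full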