default.yaml (forked from PiotrNawrot/nanoT5)

defaults:
  - _self_
  - task: pt
  - local_env: default
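
# Hydra merges this file with the task/pt and local_env/default config groups
# (resolved relative to this config directory); any key below can be
# overridden at launch, e.g. (illustrative invocation):
#   python -m nanoT5.main optim.base_lr=1e-2 model.compile=false
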
# Experiment args
mode: 'pt'
device: gpu
precision: 'bf16'
eval_only: false
predict_only: false
seed: 2137
wandb: null # null to deactivate
aim:
  repo: /home/ucloud/nanoT5
  experiment: testT5 # null to deactivate
  sync_repo: null # aim://aim.caracal.imada.sdu.dk
  sync_args:
    repeat: 60
    verbosity: 1
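
# wandb and Aim are alternative trackers: wandb stays null (off) and Aim logs
# to the local repo path above (set experiment to null to disable it). The
# sync_repo / sync_args keys presumably drive a periodic push (every 60 s) to
# a remote aim:// server; the exact semantics live in this fork's sync code.
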
quantization_warmup_steps: 1 #16384
quantization_warmup_offset: 0 #16384
quantization_warmup_prequantize: true
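
# Inferred from the key names above (not verified against the training code):
# quantization is presumably ramped in over quantization_warmup_steps steps
# starting after quantization_warmup_offset, and prequantize: true applies
# quantization from the first step; the commented 16384 values would instead
# delay and stretch that ramp.
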
bitlinear: [] # [] to deactivate
# - layer_norm: false
# - activation_measure: null
#   match_name: DenseReluDense.wo
#   layer_norm: true
#   weight_measure: null
# - activation_measure: null
#   match_name: Attention.o
#   layer_norm: true
#   weight_measure: null
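
# To enable BitLinear-style layers, populate the list above along the lines of
# the commented example: each entry appears to select modules whose qualified
# name matches match_name (DenseReluDense.wo is the T5 FFN output projection,
# Attention.o the attention output projection) and to set per-layer norm and
# weight/activation quantization measures. Semantics are inferred from the
# example; check this fork's bitlinear wrapper before relying on them.
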
model:
  klass: local_t5 # T6 = trainable text-to-text transfer transformer
  name: 'google/t5-v1_1-base'
  overwrite:
    dropout_rate: 0.0
    num_layers: 12
    num_decoder_layers: 12
    num_heads: 12
    d_kv: 64
    d_model: 768
    d_ff: 2048
  add_config:
    is_bf16: false
  checkpoint_path: ''
  random_init: true
  compile: true
  tokenizer: 'google/t5-v1_1-base'
  load_weights: true
  load_optimizer: false
  load_scheduler: false
  load_accelerator: false
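
# The overwrite block pins the stock google/t5-v1_1-base geometry (12 encoder
# + 12 decoder layers, d_model 768, 12 heads x d_kv 64, d_ff 2048) while
# disabling dropout for pre-training; with random_init: true the config and
# tokenizer come from the pretrained name but training starts from randomly
# initialized weights, as in upstream nanoT5's from-scratch setup.
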
data:
  input_length: 512
  mlm_probability: 0.15
  mean_noise_span_length: 3.0
  num_workers: 0
  dataset: mc4
  validation: null
  language: en
  remove_columns: [timestamp, url]
  streaming: true
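
# Back-of-the-envelope for T5 span corruption: masking 15% of a 512-token
# input in spans of mean length 3.0 gives roughly 512 * 0.15 / 3.0 ≈ 26
# corrupted spans per example, each replaced by a sentinel token. streaming:
# true means mC4 is read on the fly, with no full download or preprocessing
# pass up front.
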
optim:
  name: adamwscale
  base_lr: 2e-2
  batch_size: 128
  total_steps: 65536
  # total_steps: 16384
  epochs: -1 # If > 0, this overrides total_steps
  warmup_steps: 10000
  # warmup_steps: 1000
  lr_scheduler: cosine
  schedulefree_wrapper: false
  weight_decay: 0.0
  grad_clip: 1.0
  grad_acc: 2
  final_cosine: 1e-5
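
# The effective batch size stays at 128 sequences: with grad_acc: 2 each
# optimizer step accumulates two micro-batches of 64 (upstream nanoT5 divides
# batch_size by grad_acc). Over 65536 steps that is about
# 65536 * 128 * 512 ≈ 4.3e9 input tokens; the LR warms up for 10000 steps,
# then follows a cosine decay from base_lr 2e-2 to final_cosine 1e-5.
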
eval:
  every_steps: 1024 # Evaluate every 1024 steps; set >= total_steps to eval only once, at the end
  steps: 16
checkpoint:
  every_steps: 16384 # Save every 16384 steps; set >= total_steps to save only once, at the end
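
# Over the 65536-step run above this evaluates on 16 held-out batches every
# 1024 steps and writes a checkpoint every 16384 steps (four saves, plus the
# final one the upstream comments allude to).
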
logging:
  neptune: false
  neptune_creds:
    project:
    api_token:
    tags: ''
  every_steps: 16
  grad_l2: true
  weights_l2: true
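
# Training metrics, including gradient and weight L2 norms, are logged every
# 16 steps. Neptune is disabled; if enabled, project and api_token would need
# to be filled in here or supplied as command-line overrides.
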
hydra:
  job:
    chdir: True
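
# chdir: True tells Hydra to change the working directory into the per-run
# output directory, so relative artifact paths (checkpoints, logs) are grouped
# under each run's folder.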