allenai · ananyahjha93 · Jun 28, 2024 · Jun 18, 2024 · Jun 18, 2024 · Jun 18, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Expose memmap dtype in data config 
 - Added support for DDP training.
 - Added caching to disk of HF datasets used in downstream evals
+- Added FLOPs logging
+- Added configs for the OLMo tiny set of models
 
 ### Changed
 

diff --git a/configs/tiny/OLMo-150M.yaml b/configs/tiny/OLMo-150M.yaml
diff --git a/configs/tiny/OLMo-20M.yaml b/configs/tiny/OLMo-20M.yaml
diff --git a/configs/tiny/OLMo-300M.yaml b/configs/tiny/OLMo-300M.yaml
@@ -4,7 +4,7 @@ dry_run: false
 
 wandb:
   name: ${run_name}
-  project: tiny_olmo
+  project: olmo-tiny
 
 model:
   d_model: 1024
@@ -14,80 +14,85 @@ model:
   weight_tying: false
   alibi: false
   rope: true
-  flash_attention: true  # not available on AMD
+  flash_attention: true
   attention_dropout: 0.0
   attention_layer_norm: false
-  multi_query_attention: true
-  n_kv_heads: 1
-  clip_qkv: 8.0
+  clip_qkv: null
   include_bias: false
   block_type: sequential
-  layer_norm_type: default
-  layer_norm_with_affine: false
+  layer_norm_type: rms
+  layer_norm_with_affine: true
+  layer_norm_eps: 1e-6
   bias_for_layer_norm: false
   attention_layer_norm_with_affine: false
   activation_type: swiglu
   residual_dropout: 0.0
   embedding_dropout: 0.0
-  max_sequence_length: 2048
+  max_sequence_length: 4096
   vocab_size: 50280
   embedding_size: 50304
-  eos_token_id: 50279
+  eos_token_id: 0
   pad_token_id: 1
   init_device: cuda
   init_fn: normal
+  init_std: 0.02
+  init_cutoff_factor: 3
 
 ddp:
   grad_sync_mode: batch
   find_unused_params: false
 
-compile: null  # causes instability on AMD GPUs
+compile: null
 
 optimizer:
   name: adamw
   learning_rate: 6.0e-4
   weight_decay: 0.1
+  eps: 1e-8
+  decay_norm_and_bias: true
+  decay_embeddings: false
   betas:
   - 0.9
   - 0.95
   metrics_log_interval: 10
 
 scheduler:
   name: cosine_with_warmup
-  t_warmup: 2000
+  t_warmup: 5000
   alpha_f: 0.1
+  warmup_min_lr: 0
 
 tokenizer:
-  identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json
+  identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json
   truncate_direction: right
 
 save_folder: workspace/${run_name}  # doesn't matter since we'll upload to S3
-remote_save_folder: s3://allennlp-ananyaj/olmo-tiny/300M/${run_name}
+remote_save_folder: s3://ai2-llm/checkpoints/olmo-tiny/${run_name}
 save_overwrite: false
-# Sharded checkpoints (best for restarts)
-save_interval: 5000
-save_num_checkpoints_to_keep: 3
+
 # Unsharded checkpoints (for ddp)
 save_interval_unsharded: 5000
-save_num_unsharded_checkpoints_to_keep: 3
+save_num_unsharded_checkpoints_to_keep: -1
 
 load_path: null
 
-max_duration: 100_000  # 419B tokens, this is for the scheduler
-stop_at: 100_000
-global_train_batch_size: 2048
-device_train_microbatch_size: 8
+max_duration: 1ep
+stop_at: 406_934
+global_train_batch_size: 1024
+device_train_microbatch_size: 4
 
 precision: amp_bf16
 distributed_strategy: ddp
 
+gen1_gc_interval: 1
+
 max_grad_norm: 1.0
 max_grad_norm_ratio: null
 
 speed_monitor:
   window_size: 20
 
-eval_interval: 1000
+eval_interval: 5000
 eval_subset_num_batches: -1
 device_eval_batch_size: ${device_train_microbatch_size}
 evaluators:
@@ -133,30 +138,22 @@ evaluators:
 
   - label: openbook_qa
     type: downstream
-
-  - label: boolq
-    type: downstream
 
   - label: sciq
     type: downstream
 
   - label: arc_easy
     type: downstream
 
-  - label: arc_challenge
-    type: downstream
-
-  - label: copa
-    type: downstream
-
   - label: commonsense_qa
     type: downstream
 
   - label: social_iqa
     type: downstream
 
-  - label: basic_arithmetic
-    type: downstream
+  # Doesn't work from cache. NOTE(review): re-enabled further down this list; confirm intent.
+  # - label: basic_arithmetic
+  #   type: downstream
 
   - label: mmlu_stem_var
     type: downstream
@@ -170,6 +167,42 @@ evaluators:
   - label: mmlu_other_var
     type: downstream
 
+  - label: mmlu_stem_mc_5shot
+    type: downstream
+
+  - label: mmlu_humanities_mc_5shot
+    type: downstream
+
+  - label: mmlu_social_sciences_mc_5shot
+    type: downstream
+
+  - label: mmlu_other_mc_5shot
+    type: downstream
+
+  - label: mmlu_stem_mc_5shot_test
+    type: downstream
+
+  - label: mmlu_humanities_mc_5shot_test
+    type: downstream
+
+  - label: mmlu_social_sciences_mc_5shot_test
+    type: downstream
+
+  - label: mmlu_other_mc_5shot_test
+    type: downstream
+
+  - label: basic_arithmetic
+    type: downstream
+
+  - label: trivia_qa_wiki_ppl
+    type: downstream
+
+  - label: natural_qs_open_ppl
+    type: downstream
+
+  - label: arc_easy_ppl
+    type: downstream
+
 data:
   pad_direction: right
   num_workers: 32
@@ -178,6 +211,10 @@ data:
   prefetch_factor: 8
   persistent_workers: true
   timeout: 0
+  instance_filter:
+    repetition_max_period: 13
+    repetition_min_period: 1
+    repetition_max_count: 32
   paths:
     ######### NON WEB DATA #########
     # ~> GUTENBERG BOOKS (5.256 GT)