Config for running data ablations #464

Merged · 67 commits from olmo7-ablations into main · Mar 21, 2024

Commits
50a7704  Makes R2 work, and adds an ablation config (dirkgr, Feb 23, 2024)
ae538ce  Merge branch 'main' into olmo7-ablations (dirkgr, Feb 23, 2024)
404ea30  Script for running ablations on LUMI (dirkgr, Feb 27, 2024)
005c406  It's no longer just s3. (dirkgr, Feb 27, 2024)
25a0f4f  Merge branch 'olmo7-ablations' of https://github.com/allenai/LLM into… (dirkgr, Feb 27, 2024)
399d33c  We now think the 1T checkpoint is better. (dirkgr, Feb 27, 2024)
6d993f3  Try the `spawn` start method (dirkgr, Feb 27, 2024)
a67053f  Revert "Try the `spawn` start method" (dirkgr, Feb 27, 2024)
80fba0c  Set start method right away (dirkgr, Feb 27, 2024)
3834c62  Merge branch 'main' into olmo7-ablations (dirkgr, Feb 28, 2024)
0cc7f20  Different seed, so we don't train on the same data twice (dirkgr, Feb 28, 2024)
d765e88  Merge remote-tracking branch 'origin/mmlu-downstream' into olmo7-abla… (dirkgr, Feb 28, 2024)
cdb6ad9  Config for MosaicML (dirkgr, Feb 28, 2024)
305d1a8  Mcli has changed its format (dirkgr, Feb 28, 2024)
e2d0631  Config tweaks (dirkgr, Feb 28, 2024)
251e89a  More mcli changes (dirkgr, Feb 28, 2024)
08c1fcb  Merge branch 'main' into olmo7-ablations (OyvindTafjord, Feb 28, 2024)
f979478  Update downstream tasks (OyvindTafjord, Feb 28, 2024)
0074545  We also changed our formats. (dirkgr, Feb 28, 2024)
c1d664b  It's not my day today. (dirkgr, Feb 28, 2024)
08df810  Changelog (dirkgr, Feb 28, 2024)
0b7c26e  isort (dirkgr, Feb 28, 2024)
1961c25  Huggingface offline datasets (dirkgr, Feb 29, 2024)
9e927f1  Not sure what's going on with openbookqa. (dirkgr, Feb 29, 2024)
c3c1a28  Don't use compile (dirkgr, Feb 29, 2024)
7c2b1dd  Back to microbatch of 2 (dirkgr, Feb 29, 2024)
5083517  Same settings as we did for OLMo 7B (dirkgr, Feb 29, 2024)
b744d5e  Turn off compile (dirkgr, Feb 29, 2024)
38f8817  Lots of checkpointing (dirkgr, Feb 29, 2024)
d849d40  Old version of torch (dirkgr, Feb 29, 2024)
d7b2e59  Revert "Old version of torch" (dirkgr, Feb 29, 2024)
c773863  mbsz 3 (dirkgr, Feb 29, 2024)
75aacd8  More GPUs, bigger batch (dirkgr, Mar 1, 2024)
a02ae9c  Set a group name (dirkgr, Mar 1, 2024)
23951d1  Save and eval more often (dirkgr, Mar 2, 2024)
3d71dd5  Dolma 1.7 config (dirkgr, Mar 2, 2024)
ead9ac8  Run less GPUs for longer (dirkgr, Mar 2, 2024)
20b6514  Configure remote save folders (dirkgr, Mar 2, 2024)
85492da  Give better names to the configs. Also run the baseline somewhere else. (dirkgr, Mar 2, 2024)
a4c0f09  Warm the cache for starter checkpoints (dirkgr, Mar 2, 2024)
5e3bda7  Revert "Warm the cache for starter checkpoints" (dirkgr, Mar 4, 2024)
c077d03  LLM is now OLMo (dirkgr, Mar 4, 2024)
17891f1  Adds ability to show all logs (dirkgr, Mar 4, 2024)
ce70c8b  Uses ability to show all logs (dirkgr, Mar 4, 2024)
d5ca6e4  Merge branch 'main' into olmo7-ablations (dirkgr, Mar 6, 2024)
2be09c9  Merge remote-tracking branch 'origin/main' into olmo7-ablations (dirkgr, Mar 6, 2024)
835dfcf  It's called `all_ranks` now. (dirkgr, Mar 6, 2024)
c21f6b9  New MMLU evals (dirkgr, Mar 6, 2024)
5411dc9  Config for dedupedocs (dirkgr, Mar 6, 2024)
f4ebb62  Disable the MMLU var evals (dirkgr, Mar 6, 2024)
1a32bec  Bring back the vars (dirkgr, Mar 6, 2024)
aad1e82  Fix uninitialized prompts bug (OyvindTafjord, Mar 6, 2024)
b8ad5b8  No more checkpointing (dirkgr, Mar 6, 2024)
0ada799  More speed (dirkgr, Mar 6, 2024)
608dfe5  Compile is still broken (dirkgr, Mar 6, 2024)
d434011  Let's try SHARD_GRAD_OP (dirkgr, Mar 7, 2024)
86e9a3f  Config for dedupeparas (dirkgr, Mar 8, 2024)
6524f87  New cluster who dis? (dirkgr, Mar 8, 2024)
f640740  Missing budget (dirkgr, Mar 8, 2024)
a0cb855  Warm HF Cache in Beaker (dirkgr, Mar 8, 2024)
963aa0b  Adds a script to continue the baseline run on Beaker (dirkgr, Mar 12, 2024)
66dc953  8 nodes (dirkgr, Mar 12, 2024)
87fc58d  2xnonweb (dirkgr, Mar 12, 2024)
6ff85b9  Refheavy (dirkgr, Mar 12, 2024)
cd5c196  Final2 config (dirkgr, Mar 18, 2024)
1fadaf8  Indentation to make comparisons work (dirkgr, Mar 18, 2024)
51303ea  Merge branch 'main' into olmo7-ablations (dirkgr, Mar 21, 2024)
CHANGELOG.md (3 additions, 1 deletion)
@@ -35,11 +35,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added the option to directly pass input embeddings to `OLMo` and `OLMoForCausalLM`.
- Added support for Python 3.8.
- Added code to throw an error if `output_attentions` is set to `True` in forward call to `OLMoForCausalLM`. This functionality hasn't been implemented yet.
- Fixed running with data loading workers on LUMI
- Correct scheme displayed in error messages that come from R2
- Fixed running with multiple data loading workers in LUMI
- Minor bug fix: uninitialized prompts variable

### Added
- Added `output_hidden_states` argument and associated functionality to `OLMo` and `OLMoForCausalLM` to return model intermediate hidden states.
- Ability to read from R2 like we read from S3
- Added MMLU downstream evaluation tasks, with prompt variations.
- Added support for PyTorch v2.2.
- Added ability to show logs from all ranks
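The R2 entries above come out of this PR (commits 50a7704 and 005c406). Cloudflare R2 speaks the S3 API, so reading from it amounts to pointing an S3 client at an R2 endpoint. A minimal sketch, not the repo's actual code; the endpoint, env var names, bucket, and key are placeholders:

```python
import os

import boto3

# Placeholder endpoint: R2 endpoints have the form
# https://<account-id>.r2.cloudflarestorage.com
s3 = boto3.client(
    "s3",
    endpoint_url="https://<account-id>.r2.cloudflarestorage.com",
    aws_access_key_id=os.environ["R2_ACCESS_KEY_ID"],
    aws_secret_access_key=os.environ["R2_SECRET_ACCESS_KEY"],
)
# Reads like any S3 object; only the endpoint differs from AWS.
data = s3.get_object(Bucket="my-bucket", Key="checkpoints/model.pt")["Body"].read()
```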
configs/mcli/olmo7-ablation-baseline.yaml (new file, 47 additions)
name: olmo7-ablation-baseline  # can't have "_" or "." here
image: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
compute:
  gpus: 64
  cluster: r7z2
  gpu_type: a100_40gb
integrations:
  - integration_type: git_repo
    git_repo: allenai/OLMo
    git_branch: olmo7-ablations
    #git_commit: d765e8819f5b0be204c96b0b519de2372b0da729
    pip_install: -e .[train]
    ssh_clone: true
command: |-
  pip freeze
  mkdir -p /root/.cache/torch/

  export OMP_NUM_THREADS=8
  export LOG_FILTER_TYPE=all_ranks
  #export OLMO_NO_SSL=1

  # warm up huggingface cache
  pushd /root/.cache
  curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache.tar.gz" | tar -xzf -
  popd
  export HF_DATASETS_OFFLINE=1

  cd OLMo

  torchrun \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT \
    --nnodes $NUM_NODES \
    --node_rank $NODE_RANK \
    --nproc_per_node 8 \
    scripts/train.py configs/olmo7-ablation-baseline.yaml \
    --run_name=olmo7-ablation-baseline \
    --wandb.name=baseline \
    --model.flash_attention=true \
    --fsdp.wrapping_strategy=by_block_and_size \
    --fsdp.sharding_strategy=FULL_SHARD \
    --save_folder=runs/ \
    --activation_checkpointing=whole_layer \
    --device_train_microbatch_size=3 \
    --global_train_batch_size=6144 \
    --wandb.group=baseline3 \
    --remote_save_folder=s3://ai2-llm/checkpoints/olmo7-ablation/baseline3
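A note on the launch arithmetic above (an illustration, not part of the config): mcli provisions the 64 GPUs as 8 nodes of 8 processes each, and with a device microbatch of 3, the global batch of 6144 implies gradient accumulation. Run configs like this one are typically submitted with `mcli run -f configs/mcli/olmo7-ablation-baseline.yaml`.

```python
# Sketch of the batch math implied by the flags above (assumption:
# the trainer derives gradient accumulation from these three values).
world_size = 64        # compute.gpus (8 nodes x 8 GPUs)
micro_batch = 3        # --device_train_microbatch_size
global_batch = 6144    # --global_train_batch_size

accumulation_steps = global_batch // (world_size * micro_batch)
print(accumulation_steps)  # 32 forward/backward passes per optimizer step
```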
configs/mcli/olmo7-ablation-dedupedocs.yaml (new file, 46 additions)
name: olmo7-ablation-dedupedocs  # can't have "_" or "." here
image: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
compute:
  gpus: 64
  cluster: r14z3p2
  gpu_type: h100_80gb
integrations:
  - integration_type: git_repo
    git_repo: allenai/OLMo
    git_branch: olmo7-ablations
    #git_commit: d765e8819f5b0be204c96b0b519de2372b0da729
    pip_install: -e .[train]
    ssh_clone: true
command: |-
  pip freeze
  mkdir -p /root/.cache/torch/

  export OMP_NUM_THREADS=8
  export LOG_FILTER_TYPE=all_ranks
  #export OLMO_NO_SSL=1

  # warm up huggingface cache
  pushd /root/.cache
  curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache.tar.gz" | tar -xzf -
  popd
  export HF_DATASETS_OFFLINE=1

  cd OLMo

  torchrun \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT \
    --nnodes $NUM_NODES \
    --node_rank $NODE_RANK \
    --nproc_per_node 8 \
    scripts/train.py configs/olmo7-ablation-dedupedocs.yaml \
    --run_name=olmo7-ablation-dedupedocs \
    --wandb.name=dedupedocs \
    --model.flash_attention=true \
    --fsdp.wrapping_strategy=by_block_and_size \
    --fsdp.sharding_strategy=SHARD_GRAD_OP \
    --save_folder=runs/ \
    --device_train_microbatch_size=3 \
    --global_train_batch_size=6144 \
    --wandb.group=dedupedocs \
    --remote_save_folder=s3://ai2-llm/checkpoints/olmo7-ablation/dedupedocs
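Relative to the baseline config, this run swaps FULL_SHARD for SHARD_GRAD_OP (commit d434011) and drops activation checkpointing, which the 80 GB H100s make affordable. A sketch of what that strategy name corresponds to in PyTorch FSDP, assuming OLMo forwards it to torch verbatim:

```python
# Hedged sketch: look up the sharding strategy the flag names.
from torch.distributed.fsdp import ShardingStrategy

strategy = ShardingStrategy["SHARD_GRAD_OP"]
# SHARD_GRAD_OP shards gradients and optimizer state but keeps a full
# copy of the parameters on every rank (ZeRO-2-like). FULL_SHARD also
# shards the parameters (ZeRO-3-like): less memory, more communication.
```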
configs/mcli/olmo7-ablation-dolma17.yaml (new file, 47 additions)
name: olmo7-ablation-dolma17  # can't have "_" or "." here
image: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
compute:
  gpus: 128
  cluster: r12z3
  gpu_type: a100_40gb
integrations:
  - integration_type: git_repo
    git_repo: allenai/OLMo
    git_branch: olmo7-ablations
    #git_commit: d765e8819f5b0be204c96b0b519de2372b0da729
    pip_install: -e .[train]
    ssh_clone: true
command: |-
  pip freeze
  mkdir -p /root/.cache/torch/

  export OMP_NUM_THREADS=8
  export LOG_FILTER_TYPE=all_ranks
  #export OLMO_NO_SSL=1

  # warm up huggingface cache
  pushd /root/.cache
  curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache.tar.gz" | tar -xzf -
  popd
  export HF_DATASETS_OFFLINE=1

  cd OLMo

  torchrun \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT \
    --nnodes $NUM_NODES \
    --node_rank $NODE_RANK \
    --nproc_per_node 8 \
    scripts/train.py configs/olmo7-ablation-dolma17.yaml \
    --run_name=olmo7-ablation-dolma17 \
    --wandb.name=dolma17 \
    --model.flash_attention=true \
    --fsdp.wrapping_strategy=by_block_and_size \
    --fsdp.sharding_strategy=FULL_SHARD \
    --save_folder=runs/ \
    --activation_checkpointing=whole_layer \
    --device_train_microbatch_size=3 \
    --global_train_batch_size=6144 \
    --wandb.group=dolma17 \
    --remote_save_folder=s3://ai2-llm/checkpoints/olmo7-ablation/dolma17
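All three configs warm the Hugging Face cache from a public tarball and then set HF_DATASETS_OFFLINE=1, so downstream-eval datasets resolve locally instead of hitting the Hub from every rank. A sketch of the effect on the Python side; the dataset name is illustrative (commit 9e927f1 suggests openbookqa is among the evals):

```python
import os

# Must be set before `datasets` is imported; the library reads it at import time.
os.environ["HF_DATASETS_OFFLINE"] = "1"

from datasets import load_dataset

# With /root/.cache pre-populated from the tarball, this loads from disk;
# without the warm-up, offline mode fails fast instead of downloading.
ds = load_dataset("openbookqa", "main")
```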