Merge pull request #629 from allenai/epwalsh/amberish

Amberish runs

Showing 38 changed files with 9,454 additions and 11 deletions.
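The files shown below are the new Beaker/Gantry launch scripts and their matching torchrun node scripts for the Amberish 1B runs: an 8k-context run with chameleon fixes, a variant that adds document masking, and a variant that additionally raises the RoPE theta.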
Six of the changed files have large diffs that are not rendered here.
@@ -0,0 +1,40 @@
#!/usr/bin/env bash

set -ex

NUM_NODES=16

gantry run \
  --workspace ai2/OLMo-pretraining-stability \
  --task-name amberish1-8k-cham \
  --description "Amberish 1B with 8k context length and chameleon fixes" \
  --priority urgent \
  --preemptible \
  --beaker-image petew/olmo-torch23-gantry \
  --cluster ai2/jupiter-cirrascale-2 \
  --gpus 8 \
  --replicas "${NUM_NODES}" \
  --leader-selection \
  --host-networking \
  --budget ai2/oe-training \
  --no-nfs \
  --weka oe-training-default:/weka/oe-training-default \
  --propagate-failure \
  --propagate-preemption \
  --synchronized-start-timeout 90m \
  --no-python \
  --env LOG_FILTER_TYPE=local_rank0_only \
  --env OMP_NUM_THREADS=8 \
  --env OLMO_TASK=model \
  --env R2_PROFILE=R2 \
  --env S3_PROFILE=S3 \
  --env WEKA_PROFILE=WEKA \
  --env-secret AWS_CONFIG=PETEW_AWS_CONFIG \
  --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \
  --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \
  --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \
  --env-secret WANDB_API_KEY=PETEW_WANDB_API_KEY \
  --shared-memory 10GiB \
  --yes \
  --timeout=-1 \
  -- /bin/bash -c "scripts/beaker/amberish/amberish1-8k-cham.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK"
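Note the quoting in the final line: ${NUM_NODES} is expanded at submit time on the launching machine, while the escaped \$BEAKER_LEADER_REPLICA_HOSTNAME and \$BEAKER_REPLICA_RANK pass through literally and are only expanded inside each replica, where Beaker sets them. A minimal sketch of the same pattern (the echo command is mine, not part of the commit):

NUM_NODES=16
/bin/bash -c "echo nodes=${NUM_NODES} leader=\$BEAKER_LEADER_REPLICA_HOSTNAME rank=\$BEAKER_REPLICA_RANK"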
scripts/beaker/amberish/amberish1-8k-cham.sh (64 additions, 0 deletions)

@@ -0,0 +1,64 @@
#!/usr/bin/env bash

set -exuo pipefail
IFS=$'\n\t'

BEAKER_LEADER_REPLICA_HOSTNAME=$1
shift

NUM_NODES=$1
shift

BEAKER_REPLICA_RANK=$1
shift

# Setup Python environment.
conda shell.bash activate base

# Install flash-attn
#conda install -y -c nvidia cuda-python
pip install packaging ninja
export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE
pip install flash-attn==2.5.9.post1 --no-build-isolation
# pip install awscli
pip install '.[train]'
pip freeze

# Move AWS credentials from env to relevant files
mkdir -p ~/.aws
printenv AWS_CONFIG > ~/.aws/config
printenv AWS_CREDENTIALS > ~/.aws/credentials

# Force processes to synchronize at init_process_group
export TORCH_DIST_INIT_BARRIER=1

# Tell OLMo all ranks share the same filesystem for checkpoints.
export OLMO_SHARED_FS=1

export NCCL_DEBUG=INFO
export NCCL_IB_HCA="^=mlx5_bond_0"
export NCCL_SOCKET_IFNAME=ib
# export NCCL_IB_GID_INDEX=0
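# (Annotation, mine, not part of the commit: NCCL_DEBUG=INFO enables verbose
# NCCL logging; the "^=" prefix in NCCL_IB_HCA excludes exactly the
# mlx5_bond_0 device, i.e. the bonded Ethernet port, from InfiniBand use; and
# NCCL_SOCKET_IFNAME=ib restricts NCCL's socket transport to interfaces whose
# names start with "ib".)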
torchrun \
  --nnodes "${NUM_NODES}:${NUM_NODES}" \
  --nproc-per-node 8 \
  --rdzv_id 12347 \
  --rdzv_backend static \
  --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \
  --node_rank "${BEAKER_REPLICA_RANK}" \
  --rdzv_conf 'read_timeout=420' \
  scripts/train.py \
  configs/amberish1-weka.yaml \
  --run_name="${GANTRY_TASK_NAME}" \
  --model.max_sequence_length=8192 \
  --device_train_microbatch_size=2 \
  --global_train_batch_size=512 \
  --fused_loss=true \
  --softmax_auxiliary_loss=true \
  --auxiliary_loss_multiplier=1e-5 \
  --model.attention_layer_norm=true \
  --model.norm_after=true \
  --save_overwrite

# '--load_path=${path.last_checkpoint:${save_folder}}' \
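Every replica runs this same node script. With the static rendezvous backend, torchrun on each node connects to the leader replica on port 29400, --nnodes "${NUM_NODES}:${NUM_NODES}" pins the world size to exactly NUM_NODES machines of 8 GPUs each, and --node_rank tells torchrun which replica it is running on. The commented-out final line is the resume hook: the single quotes keep ${path.last_checkpoint:${save_folder}} away from the shell so OLMo's config resolver can expand it to the newest checkpoint in the save folder.

# Sketch (mine, not in the commit): to resume from the newest checkpoint,
# move the last line back into the torchrun invocation, e.g. just before
# --save_overwrite:
#   '--load_path=${path.last_checkpoint:${save_folder}}' \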
scripts/beaker/amberish/amberish1-8k-doc-mask-cham-launch.sh (40 additions, 0 deletions)

@@ -0,0 +1,40 @@
#!/usr/bin/env bash

set -ex

NUM_NODES=16

gantry run \
  --workspace ai2/OLMo-pretraining-stability \
  --task-name amberish1-8k-doc-mask-cham \
  --description "Amberish 1B with 8k context length, doc masking, and chameleon fixes" \
  --priority urgent \
  --preemptible \
  --beaker-image petew/olmo-torch23-gantry \
  --cluster ai2/jupiter-cirrascale-2 \
  --gpus 8 \
  --replicas "${NUM_NODES}" \
  --leader-selection \
  --host-networking \
  --budget ai2/oe-training \
  --no-nfs \
  --weka oe-training-default:/weka/oe-training-default \
  --propagate-failure \
  --propagate-preemption \
  --synchronized-start-timeout 90m \
  --no-python \
  --env LOG_FILTER_TYPE=local_rank0_only \
  --env OMP_NUM_THREADS=8 \
  --env OLMO_TASK=model \
  --env R2_PROFILE=R2 \
  --env S3_PROFILE=S3 \
  --env WEKA_PROFILE=WEKA \
  --env-secret AWS_CONFIG=PETEW_AWS_CONFIG \
  --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \
  --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \
  --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \
  --env-secret WANDB_API_KEY=PETEW_WANDB_API_KEY \
  --shared-memory 10GiB \
  --yes \
  --timeout=-1 \
  -- /bin/bash -c "scripts/beaker/amberish/amberish1-8k-doc-mask-cham.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK"
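This launcher matches the amberish1-8k-cham launcher above except for the task name and the node script it invokes. That node script, amberish1-8k-doc-mask-cham.sh, is not shown in this excerpt; presumably it mirrors amberish1-8k-cham.sh with document masking enabled, i.e. the same --data.generate_doc_lengths=true override that appears in the rtheta script below.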
scripts/beaker/amberish/amberish1-8k-doc-mask-cham-rtheta-launch.sh (40 additions, 0 deletions)

@@ -0,0 +1,40 @@
#!/usr/bin/env bash

set -ex

NUM_NODES=16

gantry run \
  --workspace ai2/OLMo-pretraining-stability \
  --task-name amberish1-8k-doc-mask-cham-rtheta \
  --description "Amberish 1B with 8k context length, doc masking, and chameleon fixes" \
  --priority urgent \
  --preemptible \
  --beaker-image petew/olmo-torch23-gantry \
  --cluster ai2/jupiter-cirrascale-2 \
  --gpus 8 \
  --replicas "${NUM_NODES}" \
  --leader-selection \
  --host-networking \
  --budget ai2/oe-training \
  --no-nfs \
  --weka oe-training-default:/weka/oe-training-default \
  --propagate-failure \
  --propagate-preemption \
  --synchronized-start-timeout 90m \
  --no-python \
  --env LOG_FILTER_TYPE=local_rank0_only \
  --env OMP_NUM_THREADS=8 \
  --env OLMO_TASK=model \
  --env R2_PROFILE=R2 \
  --env S3_PROFILE=S3 \
  --env WEKA_PROFILE=WEKA \
  --env-secret AWS_CONFIG=PETEW_AWS_CONFIG \
  --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \
  --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \
  --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \
  --env-secret WANDB_API_KEY=PETEW_WANDB_API_KEY \
  --shared-memory 10GiB \
  --yes \
  --timeout=-1 \
  -- /bin/bash -c "scripts/beaker/amberish/amberish1-8k-doc-mask-cham-rtheta.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK"
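(The --description here appears to be copied unchanged from the doc-mask variant; the distinguishing change in this run is the --model.rope_theta=500000 override in the node script below.)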
scripts/beaker/amberish/amberish1-8k-doc-mask-cham-rtheta.sh (66 additions, 0 deletions)

@@ -0,0 +1,66 @@
#!/usr/bin/env bash

set -exuo pipefail
IFS=$'\n\t'

BEAKER_LEADER_REPLICA_HOSTNAME=$1
shift

NUM_NODES=$1
shift

BEAKER_REPLICA_RANK=$1
shift

# Setup Python environment.
conda shell.bash activate base

# Install flash-attn
#conda install -y -c nvidia cuda-python
pip install packaging ninja
export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE
pip install flash-attn==2.5.9.post1 --no-build-isolation
# pip install awscli
pip install '.[train]'
pip freeze

# Move AWS credentials from env to relevant files
mkdir -p ~/.aws
printenv AWS_CONFIG > ~/.aws/config
printenv AWS_CREDENTIALS > ~/.aws/credentials

# Force processes to synchronize at init_process_group
export TORCH_DIST_INIT_BARRIER=1

# Tell OLMo all ranks share the same filesystem for checkpoints.
export OLMO_SHARED_FS=1

export NCCL_DEBUG=INFO
export NCCL_IB_HCA="^=mlx5_bond_0"
export NCCL_SOCKET_IFNAME=ib
# export NCCL_IB_GID_INDEX=0

torchrun \
  --nnodes "${NUM_NODES}:${NUM_NODES}" \
  --nproc-per-node 8 \
  --rdzv_id 12347 \
  --rdzv_backend static \
  --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \
  --node_rank "${BEAKER_REPLICA_RANK}" \
  --rdzv_conf 'read_timeout=420' \
  scripts/train.py \
  configs/amberish1-weka.yaml \
  --run_name="${GANTRY_TASK_NAME}" \
  --model.max_sequence_length=8192 \
  --device_train_microbatch_size=2 \
  --global_train_batch_size=512 \
  --fused_loss=true \
  --data.generate_doc_lengths=true \
  --softmax_auxiliary_loss=true \
  --auxiliary_loss_multiplier=1e-5 \
  --model.attention_layer_norm=true \
  --model.norm_after=true \
  --model.rope_theta=500000 \
  --save_overwrite

# '--load_path=${path.last_checkpoint:${save_folder}}' \
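Relative to amberish1-8k-cham.sh, this node script carries two extra overrides (annotation mine, not part of the commit):

# + --data.generate_doc_lengths=true   derive per-document lengths from the data
#                                      so attention can be masked within documents
# + --model.rope_theta=500000          raise the RoPE base frequency, presumably
#                                      to better suit the 8192-token context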