Commit

Fix GPU tests CI (#574)
epwalsh authored May 13, 2024
1 parent 467adcc commit 5c721cc
Showing 5 changed files with 13 additions and 11 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
@@ -144,7 +144,7 @@ jobs:
 envVars:
   - name: COMMIT_SHA
     value: ${{ env.COMMIT_SHA }}
-  - name: GITHUB_TOKEN
+  - name: TMP_GITHUB_TOKEN
     value: ${{ secrets.GITHUB_TOKEN }}
   - name: CUDA_LAUNCH_BLOCKING
     value: "1"
2 changes: 1 addition & 1 deletion docker/Dockerfile.base
@@ -1,7 +1,7 @@
 # Defines a CUDA-enabled Docker image suitable for installing all dependencies
 # to this project.
 
-FROM ghcr.io/allenai/pytorch:2.0.0-cuda11.8-python3.10
+FROM ghcr.io/allenai/pytorch:2.2.2-cuda11.8-python3.11
 
 # Install flash attn (and triton dependency) from our pre-built wheel.
 # We need cuda dev for the old version of triton.
4 changes: 3 additions & 1 deletion olmo/checkpoint.py
@@ -55,7 +55,9 @@
     gc_cuda,
     get_fs_local_rank,
     get_global_rank,
-    get_world_size, get_local_world_size, get_local_rank,
+    get_local_rank,
+    get_local_world_size,
+    get_world_size,
 )
 from .util import (
     _get_s3_client,
12 changes: 5 additions & 7 deletions olmo/train.py
@@ -19,11 +19,10 @@
 import torch
 import torch.distributed as dist
 import torch.nn.functional as F
+import wandb
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.utils.data import DataLoader
 
-import wandb
-
 from .aliases import PathOrStr
 from .checkpoint import Checkpointer, FullCheckpointer, build_sharded_checkpointer
 from .config import (
@@ -641,7 +640,7 @@ def model_forward(
     def train_batch(self, batch: Dict[str, Any]) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
         # Split into micro-batches.
         micro_batches = self.split_batch(batch)
-        batch_size_in_tokens = batch['input_ids'].numel()
+        batch_size_in_tokens = batch["input_ids"].numel()
 
         # In case this helps with memory utilization.
         del batch
@@ -652,9 +651,7 @@ def train_batch(self, batch: Dict[str, Any]) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
             with torch.autocast("cuda", enabled=True, dtype=self.cfg.autocast_precision):
                 # Run forward pass.
                 ce_loss, z_loss, logits = self.model_forward(
-                    micro_batch,
-                    compute_z_loss=self.cfg.softmax_auxiliary_loss,
-                    loss_reduction="sum"
+                    micro_batch, compute_z_loss=self.cfg.softmax_auxiliary_loss, loss_reduction="sum"
                 )
                 ce_loss = ce_loss / batch_size_in_tokens
 
@@ -836,7 +833,8 @@ def format_float(value: float) -> str:
         [
             f" {name}={format_float(value)}"
             for name, value in metrics.items()
-            if name == "optim/total_grad_norm" or not name.startswith("optim/")  # there's too many optimizer metrics
+            if name == "optim/total_grad_norm"
+            or not name.startswith("optim/")  # there's too many optimizer metrics
         ]
     )
 )
4 changes: 3 additions & 1 deletion scripts/test_entrypoint.sh
@@ -5,13 +5,15 @@ set -e
 set -o pipefail
 
 # Check that the environment variables have been set correctly
-for env_var in "$GITHUB_TOKEN" "$COMMIT_SHA"; do
+for env_var in "$TMP_GITHUB_TOKEN" "$COMMIT_SHA"; do
     if [[ -z "$env_var" ]]; then
         echo >&2 "error: required environment variable $env_var is empty"
         exit 1
     fi
 done
 
+export "GITHUB_TOKEN=${TMP_GITHUB_TOKEN}"
+
 # Initialize conda for bash.
 # See https://stackoverflow.com/a/58081608/4151392
 eval "$(command conda 'shell.bash' 'hook' 2> /dev/null)"
