Commit

Fix GPU tests CI (#574)
epwalsh authored May 13, 2024
1 parent 467adcc commit 5c721cc
Showing 5 changed files with 13 additions and 11 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
@@ -144,7 +144,7 @@ jobs:
 envVars:
   - name: COMMIT_SHA
     value: ${{ env.COMMIT_SHA }}
-  - name: GITHUB_TOKEN
+  - name: TMP_GITHUB_TOKEN
     value: ${{ secrets.GITHUB_TOKEN }}
   - name: CUDA_LAUNCH_BLOCKING
     value: "1"
2 changes: 1 addition & 1 deletion docker/Dockerfile.base
@@ -1,7 +1,7 @@
 # Defines a CUDA-enabled Docker image suitable for installing all dependencies
 # to this project.
 
-FROM ghcr.io/allenai/pytorch:2.0.0-cuda11.8-python3.10
+FROM ghcr.io/allenai/pytorch:2.2.2-cuda11.8-python3.11
 
 # Install flash attn (and triton dependency) from our pre-built wheel.
 # We need cuda dev for the old version of triton.
4 changes: 3 additions & 1 deletion olmo/checkpoint.py
@@ -55,7 +55,9 @@
     gc_cuda,
     get_fs_local_rank,
     get_global_rank,
-    get_world_size, get_local_world_size, get_local_rank,
+    get_local_rank,
+    get_local_world_size,
+    get_world_size,
 )
 from .util import (
     _get_s3_client,
12 changes: 5 additions & 7 deletions olmo/train.py
@@ -19,11 +19,10 @@
 import torch
 import torch.distributed as dist
 import torch.nn.functional as F
+import wandb
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.utils.data import DataLoader
 
-import wandb
-
 from .aliases import PathOrStr
 from .checkpoint import Checkpointer, FullCheckpointer, build_sharded_checkpointer
 from .config import (
@@ -641,7 +640,7 @@ def model_forward(
     def train_batch(self, batch: Dict[str, Any]) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
         # Split into micro-batches.
         micro_batches = self.split_batch(batch)
-        batch_size_in_tokens = batch['input_ids'].numel()
+        batch_size_in_tokens = batch["input_ids"].numel()
 
         # In case this helps with memory utilization.
         del batch
@@ -652,9 +651,7 @@ def train_batch(self, batch: Dict[str, Any]) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
             with torch.autocast("cuda", enabled=True, dtype=self.cfg.autocast_precision):
                 # Run forward pass.
                 ce_loss, z_loss, logits = self.model_forward(
-                    micro_batch,
-                    compute_z_loss=self.cfg.softmax_auxiliary_loss,
-                    loss_reduction="sum"
+                    micro_batch, compute_z_loss=self.cfg.softmax_auxiliary_loss, loss_reduction="sum"
                 )
                 ce_loss = ce_loss / batch_size_in_tokens
 
@@ -836,7 +833,8 @@ def format_float(value: float) -> str:
         [
             f" {name}={format_float(value)}"
             for name, value in metrics.items()
-            if name == "optim/total_grad_norm" or not name.startswith("optim/")  # there's too many optimizer metrics
+            if name == "optim/total_grad_norm"
+            or not name.startswith("optim/")  # there's too many optimizer metrics
         ]
     )
 )
4 changes: 3 additions & 1 deletion scripts/test_entrypoint.sh
@@ -5,13 +5,15 @@ set -e
 set -o pipefail
 
 # Check that the environment variables have been set correctly
-for env_var in "$GITHUB_TOKEN" "$COMMIT_SHA"; do
+for env_var in "$TMP_GITHUB_TOKEN" "$COMMIT_SHA"; do
     if [[ -z "$env_var" ]]; then
         echo >&2 "error: required environment variable $env_var is empty"
         exit 1
     fi
 done
 
+export "GITHUB_TOKEN=${TMP_GITHUB_TOKEN}"
+
 # Initialize conda for bash.
 # See https://stackoverflow.com/a/58081608/4151392
 eval "$(command conda 'shell.bash' 'hook' 2> /dev/null)"
