66 changes: 66 additions & 0 deletions .github/workflows/main.yml
@@ -0,0 +1,66 @@
name: do-the-job
on: workflow_dispatch
jobs:
  start-runner:
    name: Start self-hosted EC2 runner
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: us-east-2
      - name: Start EC2 runner
        id: start-ec2-runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ami-0c7563df049759e8b
          ec2-instance-type: p3.8xlarge
          subnet-id: subnet-3502b45e
          security-group-id: sg-e8f46d9d
          aws-resource-tags: > # optional, requires additional permissions
            [
              {"Key": "Name", "Value": "ec2-github-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
            ]
  do-the-job:
    name: Do the job on the runner
    needs: start-runner # required to start the main job when the runner is ready
    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      - name: Hello World
        run: echo 'Hello World!'
      - name: Test
        run: pwd; ls -l
      - name: GPU Test
        run: nvidia-smi

  stop-runner:
    name: Stop self-hosted EC2 runner
    needs:
      - start-runner # required to get output from the start-runner job
      - do-the-job # required to wait until the main job is done
    runs-on: ubuntu-latest
    if: ${{ always() }} # required to stop the runner even if an error happened in the previous jobs
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: us-east-2
      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
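
Because the workflow is gated on workflow_dispatch, it never runs on push; it has to be started from the Actions tab or through the REST API. Below is a minimal sketch of the API route in Python; OWNER, REPO, and the GITHUB_TOKEN environment variable are placeholders, not values from this PR.

# Minimal sketch, not part of this PR: fire the workflow_dispatch trigger over
# the GitHub REST API. OWNER, REPO, and GITHUB_TOKEN are placeholders.
import os

import requests

def dispatch_workflow(owner, repo, workflow_file, ref="main"):
    """POST to the workflow-dispatches endpoint; GitHub answers 204 on success."""
    url = (f"https://api.github.com/repos/{owner}/{repo}"
           f"/actions/workflows/{workflow_file}/dispatches")
    response = requests.post(
        url,
        headers={
            "Authorization": f"token {os.environ['GITHUB_TOKEN']}",
            "Accept": "application/vnd.github.v3+json",
        },
        json={"ref": ref},  # branch or tag whose copy of the workflow runs
    )
    response.raise_for_status()

dispatch_workflow("OWNER", "REPO", "main.yml")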
1 change: 1 addition & 0 deletions .gitignore
@@ -2,6 +2,7 @@
# tests
# megatron autogenerated indices
tests/data/*/*npy
tests/tools/openwebtext-1000.jsonl

# macOS
.DS_Store
7 changes: 7 additions & 0 deletions megatron/arguments.py
@@ -22,6 +22,7 @@
import deepspeed

from megatron.enums import PositionEmbeddingType
import megatron


def parse_args(extra_args_provider=None, defaults={},
@@ -313,6 +314,10 @@ def _add_network_size_args(parser):
                       default=PositionEmbeddingType.absolute,
                       help='Define position embedding type ("absolute" | "rotary"). "absolute" by default.'
                       )
    group.add_argument('--glu-activation', type=str,
                       choices=megatron.model.glu_activations.GLU_ACTIVATIONS.keys(),
                       help='GLU activations to use.'
                       )

    return parser
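
The GLU_ACTIVATIONS registry that choices= points at lives in megatron/model/glu_activations.py and is not shown in this diff. As a rough, illustrative sketch of the idea (names and gating order are assumptions, not the repo's actual definitions): a GLU-style activation splits the last dimension of its input in half and uses one half, passed through a nonlinearity, to gate the other.

# Illustrative sketch only; the real registry in megatron/model/glu_activations.py
# may differ in names and details.
import torch
import torch.nn.functional as F

def geglu(x):
    """GELU-gated linear unit: half of the last dim gates the other half."""
    value, gate = torch.chunk(x, 2, dim=-1)
    return value * F.gelu(gate)

def reglu(x):
    """ReLU-gated variant of the same construction."""
    value, gate = torch.chunk(x, 2, dim=-1)
    return value * F.relu(gate)

GLU_ACTIVATIONS = {
    "geglu": geglu,
    "reglu": reglu,
}

Using GLU_ACTIVATIONS.keys() as the argparse choices keeps the command-line flag in sync with the registry automatically.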

@@ -452,6 +457,8 @@ def _add_training_args(parser):
                       help='Run optimizer on CPU')
    group.add_argument('--cpu_torch_adam', action='store_true',
                       help='Use Torch Adam as optimizer on CPU.')
    group.add_argument('--codecarbon-dir', type=str, default=None,
                       help='Write CodeCarbon logs to this directory.')

    return parser
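
The place where --codecarbon-dir is consumed (presumably megatron/global_vars.py, given the import in checkpointing.py below) is not shown in this diff. A hedged sketch of the plausible wiring to CodeCarbon's EmissionsTracker, with the helper name chosen to mirror that import:

# Sketch under assumptions: the real wiring lives in megatron/global_vars.py,
# which this diff does not show.
from codecarbon import EmissionsTracker

_CODECARBON_TRACKER = None

def codecarbon_tracker_start(output_dir):
    """Start an emissions tracker that writes its logs under output_dir."""
    global _CODECARBON_TRACKER
    if output_dir is not None:
        _CODECARBON_TRACKER = EmissionsTracker(output_dir=output_dir)
        _CODECARBON_TRACKER.start()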

10 changes: 8 additions & 2 deletions megatron/checkpointing.py
@@ -22,6 +22,7 @@

import torch

from megatron.global_vars import codecarbon_tracker_flush
from megatron import (get_args,
                      mpu,
                      print_rank_0,
@@ -135,7 +136,7 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler):
        for i in range(len(model)):
            mpu.set_virtual_pipeline_model_parallel_rank(i)
            state_dict['model%d' % i] = model[i].state_dict_for_save_checkpoint()

    # Optimizer stuff.
    if not args.no_save_optim:
        if optimizer is not None:
@@ -183,6 +184,11 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler):
    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    # Since the code can exit or be aborted in various places, we use checkpoint
    # saving as a save point for the codecarbon tracker. If the program doesn't
    # run to its normal end, only the data since the last saved checkpoint is lost.
    codecarbon_tracker_flush()

def _transpose_first_dim(t, num_splits, num_splits_first, model):
    input_shape = t.size()
    # We use a self_attention module but the values extracted aren't
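
codecarbon_tracker_flush is only imported above; its definition is not part of this diff. Continuing the sketch from arguments.py, it is plausibly a no-op unless a tracker was started (flush() assumes a codecarbon release that provides it):

# Continuation of the earlier sketch; the real helper in megatron/global_vars.py
# may differ.
def codecarbon_tracker_flush():
    """Persist the emissions measured so far without stopping the tracker."""
    if _CODECARBON_TRACKER is not None:
        _CODECARBON_TRACKER.flush()  # assumes a codecarbon version exposing flush()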
@@ -417,7 +423,7 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True
def load_biencoder_checkpoint(model, only_query_model=False,
                              only_context_model=False, custom_load_path=None):
    """
    selectively load retrieval models for indexing/retrieving
    from saved checkpoints
    """
