Skip to content
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions megatron/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,10 @@ def pretrain(train_valid_test_dataset_provider,
# Model, optimizer, and learning rate.
timers('model-and-optimizer-setup').start()
model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider)
- print_rank_0(f'estimated model parameters: {get_parameters_in_billions(model)}')
- print_rank_0(f'estimated model parameters without embeddings: {get_parameters_in_billions(model, exclude_embeddings=True)}')
+ args.parameters_in_billions = get_parameters_in_billions(model)
+ args.parameters_in_billions_no_embedding = get_parameters_in_billions(model, exclude_embeddings=True)
+ print_rank_0(f'estimated model parameters: {args.parameters_in_billions}')
+ print_rank_0(f'estimated model parameters without embeddings: {args.parameters_in_billions_no_embedding}')
Comment thread
jaketae marked this conversation as resolved.
Outdated
timers('model-and-optimizer-setup').stop()
print_datetime('after model, optimizer, and learning rate '
'scheduler are built')
Expand Down Expand Up @@ -740,7 +742,7 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
tp_rank = mpu.get_tensor_model_parallel_rank()
pp_rank = mpu.get_pipeline_model_parallel_rank()
preamble = f"[{tp_rank:0>3d}-{pp_rank:0>3d}]"
- print(f"{preamble} {get_parameters_in_billions(model):.4f}B / {get_parameters_in_billions(model, exclude_embeddings=True):.4f}B")
+ print(f"{preamble} {args.parameters_in_billions:.4f}B / {args.parameters_in_billions_no_embedding:.4f}B")
Comment thread
jaketae marked this conversation as resolved.
Outdated
Comment thread
stas00 marked this conversation as resolved.
Outdated
torch.distributed.barrier()
else:
torch.distributed.barrier()
Expand Down Expand Up @@ -815,7 +817,7 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
args.consumed_train_tokens += new_samples * args.curriculum_seqlen
else:
args.consumed_train_tokens += new_samples * args.seq_length
- args.gigaflos_no_embeds += (6 * new_samples * args.seq_length * get_parameters_in_billions(model, exclude_embeddings=True))
+ args.gigaflos_no_embeds += (6 * new_samples * args.seq_length * args.parameters_in_billions_no_embedding)

# Logging.
if args.deepspeed:
Expand Down