Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use PackedSeqParams in accordance with changes in Megatron-LM #8205

Merged
merged 5 commits into from
Jan 23, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number | Diff line number | Diff line change
Expand Up @@ -19,11 +19,13 @@
from contextlib import nullcontext
from dataclasses import fields
from functools import partial
from importlib.metadata import version
from typing import Any, Dict, Iterator, List, Optional, Union

import torch
from omegaconf import OmegaConf
from omegaconf.dictconfig import DictConfig
from pkg_resources import packaging
from pytorch_lightning.accelerators import CPUAccelerator
from pytorch_lightning.trainer.trainer import Trainer

Expand Down Expand Up @@ -928,12 +930,24 @@ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_
cu_seqlens = cu_seqlens[: cu_seqlens_argmin.item()]
else:
cu_seqlens = cu_seqlens[: torch.argmin(cu_seqlens)]
forward_args['cu_seqlens_q'] = cu_seqlens
forward_args['cu_seqlens_kv'] = cu_seqlens
if max_seqlen is not None:
forward_args['max_seqlen_q'] = max_seqlen
forward_args['max_seqlen_kv'] = max_seqlen
forward_args['qkv_format'] = 'thd'

try:
from megatron.core.packed_seq_params import PackedSeqParams
except (ImportError, ModuleNotFoundError) as e:
mcore_version = packaging.version.Version(version('megatron-core'))
logging.error(
f"megatron-core v{mcore_version} does not support training with packed sequence. "
"Please use megatron-core >= 0.5.0, or set model.data.train_ds.packed_sequence=False"
)
raise e

forward_args['packed_seq_params'] = PackedSeqParams(
cu_seqlens_q=cu_seqlens,
cu_seqlens_kv=cu_seqlens,
max_seqlen_q=max_seqlen,
max_seqlen_kv=max_seqlen,
qkv_format='thd',
)

output_tensor = model(**forward_args)

Expand Down
Loading