
ptuning oom fix #6916

Merged · 4 commits · Jun 27, 2023
@@ -327,6 +327,9 @@ def __len__(self):
def __getitem__(self, idx):
return self.examples[idx]

def _ceil_to_nearest(self, n, m):
return (n + m - 1) // m * m

def collate_fn(self, batch, tp_workers=0):
""" Prepares input_ids, labels, loss mask, attention_mask, and position ids for global batch """
taskname_ids, input_ids, answer_starts = zip(*batch)
@@ -350,11 +353,16 @@ def collate_fn(self, batch, tp_workers=0):
else:
resi_padding = 0
batch_max += resi_padding
ceil_batch_max = self._ceil_to_nearest(
batch_max, 8
) # @adithyare this padding does not conflict with the tp_workers padding above,
# since tp_workers is always a multiple of 2. The padding to a multiple of 8 ensures a memory-optimized softmax kernel is used.
batch_max = ceil_batch_max + 1
input_ids, loss_mask = self.pad_batch_and_build_loss_mask(input_ids, batch_max, answer_starts)
# Should be a label for every token in batch, label is the next token
labels = input_ids[:, 1:].contiguous()
input_ids = input_ids[:, :-1].contiguous()
batch_max -= 1
batch_max -= 1 # @adithyare I *think* this decrement accounts for the two lines above, which remove one item from the input_ids seq.

# Loss mask should align with labels
loss_mask = loss_mask[:, 1:].contiguous()
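For reference, a minimal standalone sketch of the padding arithmetic this hunk introduces (the sequence lengths and pad id below are illustrative placeholders, not taken from the dataset): the batch's longest sequence is rounded up to a multiple of 8 and extended by one token, so that after the next-token shift the effective length is again a multiple of 8 and the memory-optimized softmax path can be used.

```python
import torch

def ceil_to_nearest(n: int, m: int) -> int:
    # same arithmetic as _ceil_to_nearest above
    return (n + m - 1) // m * m

seq_lens = [37, 41, 29]                        # hypothetical per-example lengths
batch_max = max(seq_lens)                      # 41
batch_max = ceil_to_nearest(batch_max, 8) + 1  # 48 + 1 = 49

# pad every example to batch_max (pad id 0 is a placeholder)
input_ids = torch.zeros(len(seq_lens), batch_max, dtype=torch.long)

# next-token shift, as in collate_fn
labels = input_ids[:, 1:].contiguous()         # length 48
input_ids = input_ids[:, :-1].contiguous()     # length 48
batch_max -= 1                                 # 48, back to a multiple of 8

assert batch_max % 8 == 0 and labels.shape[1] % 8 == 0
```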
@@ -346,7 +346,7 @@ def __init__(
AdapterName.LORA_KQV_ADAPTER,
]
lora_cfg = cfg.peft.lora_tuning
if cfg.kv_channels is None:
if cfg.get("kv_channels", None) is None:
assert (
cfg.hidden_size % cfg.num_attention_heads == 0
), 'hidden_size must be divisible by num_attention_heads if kv_channels is None'
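The second change makes kv_channels optional in the model config: on struct-mode OmegaConf configs (as composed by Hydra, which NeMo uses), plain attribute access to a missing key raises, whereas cfg.get("kv_channels", None) falls back to None so the value can be derived. A minimal sketch with illustrative config values (768 and 12 are placeholders):

```python
from omegaconf import OmegaConf

cfg = OmegaConf.create({"hidden_size": 768, "num_attention_heads": 12})

# In NeMo's Hydra-composed (struct-mode) configs, cfg.kv_channels raises when the
# key is absent, so the diff switches to .get with an explicit fallback.
kv_channels = cfg.get("kv_channels", None)
if kv_channels is None:
    assert (
        cfg.hidden_size % cfg.num_attention_heads == 0
    ), 'hidden_size must be divisible by num_attention_heads if kv_channels is None'
    kv_channels = cfg.hidden_size // cfg.num_attention_heads

print(kv_channels)  # 64
```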