Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 23 additions & 54 deletions src/transformers/models/opt/modeling_opt.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,8 @@
from typing import List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import Tensor, nn
from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
Expand Down Expand Up @@ -86,52 +85,28 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min)


def make_positions(mask, padding_idx: int):
    """Replace non-padding symbols with their position numbers.

    Position numbers begin at padding_idx+1. Padding symbols are ignored.

    Args:
        mask: 0/1 (or boolean) tensor in which 1 marks a non-padding symbol.
            The running count is taken along dim 1.
        padding_idx: Offset added to every position; padding entries end up
            exactly at this index.

    Returns:
        A long tensor with the same shape as `mask` holding the position ids.
    """
    # A boolean mask would saturate below: `type_as` casts the running count
    # back to bool, so every non-padding slot would collapse to
    # `padding_idx + 1`. Promote it to an integer mask first.
    if mask.dtype == torch.bool:
        mask = mask.long()

    # The series of casts and type-conversions here are carefully
    # balanced to both work with ONNX export and XLA. In particular XLA
    # prefers ints, cumsum defaults to output longs, and ONNX doesn't know
    # how to handle the dtype kwarg in cumsum.
    positions = (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() + padding_idx
    return positions


class OPTLearnedPositionalEmbedding(nn.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting
based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to
the forward function.
This module learns positional embeddings up to a fixed maximum size.
"""

def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int = 1):
"""Create the position embedding table.

Args:
    num_embeddings: Total number of rows in the embedding table.
    embedding_dim: Size of each embedding vector.
    padding_idx: Row reserved for padding, forwarded to `nn.Embedding`.
        Pass `None` to disable the padding offset.
"""
super().__init__(num_embeddings, embedding_dim, padding_idx)
# NOTE(review): `onnx_trace` is only initialised here; nothing in the visible code reads it.
self.onnx_trace = False
# With a padding index, rows up to and including `padding_idx` are not
# available for real positions, so the usable range shrinks accordingly.
if self.padding_idx is not None:
self.max_positions = self.num_embeddings - self.padding_idx - 1
else:
self.max_positions = self.num_embeddings
def forward(self, attention_mask: Tensor, positions: Optional[Tensor] = None):
"""Return the position embeddings for a batch.

Args:
    attention_mask: Mask used to derive position ids when `positions` is
        not given.
    positions: Optional pre-computed position ids. Only allowed when
        `self.padding_idx` is `None`.

Raises:
    ValueError: If `positions` is given while `self.padding_idx` is set.
"""
# attention_masks is expected to be of size [batch_size x seq_len].
if not ((positions is None) or (self.padding_idx is None)):
raise ValueError("If positions is pre-computed then padding_idx should not be set.")

if positions is None:
# Cast to long before `make_positions`, which applies `type_as(mask)` to the running count.
attention_mask = attention_mask.long()
positions = make_positions(attention_mask, self.padding_idx)

# Plain embedding lookup using this module's own weight and embedding options.
return F.embedding(
positions,
self.weight,
self.padding_idx,
self.max_norm,
self.norm_type,
self.scale_grad_by_freq,
self.sparse,
)
def __init__(self, num_embeddings: int, embedding_dim: int):

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cleaned this up a bit. Think we should try to align it as much as possible to Bart here

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks :) I think I will have to update the TF code based on that.

# OPT is set up so that if padding_idx is specified then offset the embedding ids by 2
# and adjust num_embeddings appropriately. Other models don't have this hack
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim)

def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0):
"""Return position embeddings derived from `attention_mask`.

Args:
    attention_mask: Mask expected to be [bsz x seqlen]; it is cast to long
        before use.
    past_key_values_length: Number of leading positions to drop along
        dim 1 before the embedding lookup.
"""
attention_mask = attention_mask.long()

# create positions depending on attention_mask
# (running count of attended tokens minus one; masked-out entries become -1)
positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1

# cut positions if `past_key_values_length` is > 0
positions = positions[:, past_key_values_length:]

# shift by `self.offset` before indexing into the embedding table
return super().forward(positions + self.offset)


# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->OPT
Expand Down Expand Up @@ -504,12 +479,7 @@ def __init__(self, config: OPTConfig):
self.vocab_size = config.vocab_size

self.embed_tokens = nn.Embedding(config.vocab_size, config.word_embed_proj_dim, self.padding_idx)

# OPT is set up so that if padding_idx is specified then offset the embedding ids by 2
if self.padding_idx is not None:
num_embeddings = config.max_position_embeddings + 2

self.embed_positions = OPTLearnedPositionalEmbedding(num_embeddings, config.hidden_size, self.padding_idx)
self.embed_positions = OPTLearnedPositionalEmbedding(config.max_position_embeddings, config.hidden_size)

if config.word_embed_proj_dim != config.hidden_size:
self.project_out = nn.Linear(config.hidden_size, config.word_embed_proj_dim, bias=False)
Expand Down Expand Up @@ -639,8 +609,7 @@ def forward(
# embed positions
if attention_mask is None:
attention_mask = torch.ones(inputs_embeds.shape[:2], dtype=torch.bool, device=inputs_embeds.device)

positions = self.embed_positions(attention_mask)[:, past_key_values_length:, :]
pos_embeds = self.embed_positions(attention_mask, past_key_values_length)

attention_mask = self._prepare_decoder_attention_mask(
attention_mask, input_shape, inputs_embeds, past_key_values_length
Expand All @@ -649,8 +618,7 @@ def forward(
if self.project_in is not None:
inputs_embeds = self.project_in(inputs_embeds)

hidden_states = inputs_embeds + positions

hidden_states = inputs_embeds + pos_embeds
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

# decoder layers
Expand All @@ -671,6 +639,7 @@ def forward(
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
if output_hidden_states:
all_hidden_states += (hidden_states,)

dropout_probability = random.uniform(0, 1)
if self.training and (dropout_probability < self.layerdrop):
continue
Expand Down
9 changes: 7 additions & 2 deletions tests/models/opt/test_modeling_opt.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,17 +268,22 @@ def _long_tensor(tok_lst):
@require_torch
class OPTModelIntegrationTests(unittest.TestCase):
@slow
@unittest.skipIf(torch_device != "cpu", "Cannot make deterministic on GPU")

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Getting different logits results on GPU depending on PyTorch version (1.10+cu11.0 vs. 1.11+cu11.4) and results also differ between CPU and GPU. Only on CPU it seems to be deterministic.

It's not because the weights are saved & loaded in FP16 - checked that the same happens when weights are stored in fp32 and loaded in fp32.
It's across multiple checkpoints (not just 350m, but also 125m).
The differences start to creep in in the first linear projection matrix project_in_dim for the 350m model and the first attention layer for 125m.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Loosening the tolerance to 1e-2 so that the tests pass on GPU - I think that's the best we can do

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting observation:

  • BART has the same problem, but the differences are not as flagrant in the end
  • BERT does not have this problem at all. Results are exactly the same.

cc @stephenroller @stas00 @sgugger @LysandreJik

@stas00 stas00 May 19, 2022

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

getting logits to exact match is very difficult at times. Perhaps using a much longer input and checking that softmax output matches - after all logits fluctuations don't matter at the end but 100% softmax match does.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if you use a very high tolerance then it becomes questionable whether the test is doing anything.

def test_inference_no_head(self):
# model is not deterministic on GPU, not sure why

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(Some) non-determinism is mentioned here:
https://pytorch.org/docs/stable/notes/randomness.html

The diff 1e-2 is still unpleasantly large though.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, my comment is not great - the model is deterministic on GPU; it's just that the results differ depending on the CUDA version, and CUDA results != CPU results.

Comment thread
patrickvonplaten marked this conversation as resolved.
Outdated
model = OPTModel.from_pretrained("facebook/opt-350m").to(torch_device)
input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])

with torch.no_grad():
output = model(input_ids=input_ids).last_hidden_state

expected_shape = torch.Size((1, 11, 512))
self.assertEqual(output.shape, expected_shape)
expected_slice = torch.tensor(
[[-0.2867, -1.9256, -0.3062], [-1.2711, -0.1337, -0.1897], [0.4109, 0.1187, -1.3142]], device=torch_device
[[-0.2873, -1.9242, -0.3059], [-1.2738, -0.1333, -0.1877], [0.4116, 0.1192, -1.3107]],
device=torch_device,
)
self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3))
assert_tensors_close(output[0, :3, :3], expected_slice, atol=1e-3)


@require_torch
Expand Down