From 48efb8ddaf0606b122b498b19581626c607349d3 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 7 Feb 2023 09:56:47 -0500 Subject: [PATCH 01/10] Add files --- .../models/gpt_bigcode/__init__.py | 0 .../gpt_bigcode/configuration_gpt_bigcode.py | 274 +++ .../gpt_bigcode/modeling_gpt_bigcode.py | 1537 +++++++++++++++++ 3 files changed, 1811 insertions(+) create mode 100644 src/transformers/models/gpt_bigcode/__init__.py create mode 100644 src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py create mode 100644 src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py diff --git a/src/transformers/models/gpt_bigcode/__init__.py b/src/transformers/models/gpt_bigcode/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py new file mode 100644 index 0000000000..fe9c711d73 --- /dev/null +++ b/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py @@ -0,0 +1,274 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" OpenAI GPT-2 configuration""" +from collections import OrderedDict +from typing import Any, List, Mapping, Optional + +from transformers import PreTrainedTokenizer, TensorType, is_torch_available + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfigWithPast, PatchingSpec +from ...utils import logging + + +logger = logging.get_logger(__name__) + +GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "gpt2": "https://huggingface.co/gpt2/resolve/main/config.json", + "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/config.json", + "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/config.json", + "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/config.json", + "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/config.json", +} + + +class GPT2Config(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`GPT2Model`] or a [`TFGPT2Model`]. It is used to + instantiate a GPT-2 model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the GPT-2 + [gpt2](https://huggingface.co/gpt2) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 50257): + Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`GPT2Model`] or [`TFGPT2Model`]. + n_positions (`int`, *optional*, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + n_embd (`int`, *optional*, defaults to 768): + Dimensionality of the embeddings and hidden states. + n_layer (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + n_head (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + n_inner (`int`, *optional*, defaults to None): + Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd + activation_function (`str`, *optional*, defaults to `"gelu"`): + Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`. + resid_pdrop (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + embd_pdrop (`int`, *optional*, defaults to 0.1): + The dropout ratio for the embeddings. + attn_pdrop (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention. + layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): + The epsilon to use in the layer normalization layers. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + summary_type (`string`, *optional*, defaults to `"cls_index"`): + Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and + [`TFGPT2DoubleHeadsModel`]. + + Has to be one of the following options: + + - `"last"`: Take the last token hidden state (like XLNet). + - `"first"`: Take the first token hidden state (like BERT). + - `"mean"`: Take the mean of all tokens hidden states. + - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2). + - `"attn"`: Not implemented now, use multi-head attention. + summary_use_proj (`bool`, *optional*, defaults to `True`): + Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and + [`TFGPT2DoubleHeadsModel`]. + + Whether or not to add a projection after the vector extraction. + summary_activation (`str`, *optional*): + Argument used when doing sequence summary. Used in for the multiple choice head in + [`GPT2DoubleHeadsModel`]. + + Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation. + summary_proj_to_labels (`bool`, *optional*, defaults to `True`): + Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and + [`TFGPT2DoubleHeadsModel`]. + + Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes. + summary_first_dropout (`float`, *optional*, defaults to 0.1): + Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and + [`TFGPT2DoubleHeadsModel`]. + + The dropout ratio to be used after the projection and activation. + scale_attn_weights (`bool`, *optional*, defaults to `True`): + Scale attention weights by dividing by sqrt(hidden_size).. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`): + Whether to additionally scale attention weights by `1 / layer_idx + 1`. + reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`): + Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention + dot-product/softmax to float() when training with mixed precision. + + Example: + + ```python + >>> from transformers import GPT2Config, GPT2Model + + >>> # Initializing a GPT2 configuration + >>> configuration = GPT2Config() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = GPT2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "gpt2" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "hidden_size": "n_embd", + "max_position_embeddings": "n_positions", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + } + + def __init__( + self, + vocab_size=50257, + n_positions=1024, + n_embd=768, + n_layer=12, + n_head=12, + n_inner=None, + activation_function="gelu_new", + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + summary_type="cls_index", + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + scale_attn_weights=True, + use_cache=True, + bos_token_id=50256, + eos_token_id=50256, + scale_attn_by_inverse_layer_idx=False, + reorder_and_upcast_attn=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.n_inner = n_inner + self.activation_function = activation_function + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_first_dropout = summary_first_dropout + self.summary_proj_to_labels = summary_proj_to_labels + self.scale_attn_weights = scale_attn_weights + self.use_cache = use_cache + self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx + self.reorder_and_upcast_attn = reorder_and_upcast_attn + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + +class GPT2OnnxConfig(OnnxConfigWithPast): + def __init__( + self, + config: PretrainedConfig, + task: str = "default", + patching_specs: List[PatchingSpec] = None, + use_past: bool = False, + ): + super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past) + if not getattr(self._config, "pad_token_id", None): + # TODO: how to do that better? + self._config.pad_token_id = 0 + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}}) + if self.use_past: + self.fill_with_past_key_values_(common_inputs, direction="inputs") + common_inputs["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"} + else: + common_inputs["attention_mask"] = {0: "batch", 1: "sequence"} + + return common_inputs + + @property + def num_layers(self) -> int: + return self._config.n_layer + + @property + def num_attention_heads(self) -> int: + return self._config.n_head + + def generate_dummy_inputs( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs( + tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework + ) + + # We need to order the input in the way they appears in the forward() + ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]}) + + # Need to add the past_keys + if self.use_past: + if not is_torch_available(): + raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.") + else: + import torch + + batch, seqlen = common_inputs["input_ids"].shape + # Not using the same length for past_key_values + past_key_values_length = seqlen + 2 + past_shape = ( + batch, + self.num_attention_heads, + past_key_values_length, + self._config.hidden_size // self.num_attention_heads, + ) + ordered_inputs["past_key_values"] = [ + (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(self.num_layers) + ] + + ordered_inputs["attention_mask"] = common_inputs["attention_mask"] + if self.use_past: + mask_dtype = ordered_inputs["attention_mask"].dtype + ordered_inputs["attention_mask"] = torch.cat( + [ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1 + ) + + return ordered_inputs + + @property + def default_onnx_opset(self) -> int: + return 13 diff --git a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py new file mode 100644 index 0000000000..5fe33bbca5 --- /dev/null +++ b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -0,0 +1,1537 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch OpenAI GPT-2 model.""" + +import math +import os +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.cuda.amp import autocast +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel, SequenceSummary +from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer +from ...utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from ...utils.model_parallel_utils import assert_device_map, get_device_map +from .configuration_gpt2 import GPT2Config + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "gpt2" +_CONFIG_FOR_DOC = "GPT2Config" + +GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "gpt2", + "gpt2-medium", + "gpt2-large", + "gpt2-xl", + "distilgpt2", + # See all GPT-2 models at https://huggingface.co/models?filter=gpt2 +] + + +def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): + """Load tf checkpoints in a pytorch model""" + try: + import re + + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(gpt2_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array.squeeze()) + + for name, array in zip(names, arrays): + name = name[6:] # skip "model/" + name = name.split("/") + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+\d+", m_name): + scope_names = re.split(r"(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "w" or scope_names[0] == "g": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "b": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "wpe" or scope_names[0] == "wte": + pointer = getattr(pointer, scope_names[0]) + pointer = getattr(pointer, "weight") + else: + pointer = getattr(pointer, scope_names[0]) + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + try: + assert ( + pointer.shape == array.shape + ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched" + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info(f"Initialize PyTorch weight {name}") + pointer.data = torch.from_numpy(array) + return model + + +class GPT2Attention(nn.Module): + def __init__(self, config, is_cross_attention=False, layer_idx=None): + super().__init__() + + max_positions = config.max_position_embeddings + self.register_buffer( + "bias", + torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8)).view( + 1, 1, max_positions, max_positions + ), + ) + self.register_buffer("masked_bias", torch.tensor(-1e4)) + + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + self.split_size = self.embed_dim + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + + self.scale_attn_weights = config.scale_attn_weights + self.is_cross_attention = is_cross_attention + + # Layer-wise attention scaling, reordering, and upcasting + self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx + self.layer_idx = layer_idx + self.reorder_and_upcast_attn = config.reorder_and_upcast_attn + + if self.is_cross_attention: + self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim) + self.q_attn = Conv1D(self.embed_dim, self.embed_dim) + else: + self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim) + self.c_proj = Conv1D(self.embed_dim, self.embed_dim) + + self.attn_dropout = nn.Dropout(config.attn_pdrop) + self.resid_dropout = nn.Dropout(config.resid_pdrop) + + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads) + index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)]) + + # Prune conv1d layers + self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) + self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) + + # Update hyper params + self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads)) + self.num_heads = self.num_heads - len(heads) + self.pruned_heads = self.pruned_heads.union(heads) + + def _attn(self, query, key, value, attention_mask=None, head_mask=None): + attn_weights = torch.matmul(query, key.transpose(-1, -2)) + + if self.scale_attn_weights: + attn_weights = attn_weights / torch.full( + [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device + ) + + # Layer-wise attention scaling + if self.scale_attn_by_inverse_layer_idx: + attn_weights = attn_weights / float(self.layer_idx + 1) + + if not self.is_cross_attention: + # if only "normal" attention layer implements causal mask + query_length, key_length = query.size(-2), key.size(-2) + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) + mask_value = torch.finfo(attn_weights.dtype).min + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. + # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to(attn_weights.device) + attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value) + + if attention_mask is not None: + # Apply the attention mask + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise + attn_weights = attn_weights.type(value.dtype) + attn_weights = self.attn_dropout(attn_weights) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + + return attn_output, attn_weights + + def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None): + # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM) + bsz, num_heads, q_seq_len, dk = query.size() + _, _, k_seq_len, _ = key.size() + + # Preallocate attn_weights for `baddbmm` + attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device) + + # Compute Scale Factor + scale_factor = 1.0 + if self.scale_attn_weights: + scale_factor /= float(value.size(-1)) ** 0.5 + + if self.scale_attn_by_inverse_layer_idx: + scale_factor /= float(self.layer_idx + 1) + + # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk)) + with autocast(enabled=False): + q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len) + attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor) + attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len) + + if not self.is_cross_attention: + # if only "normal" attention layer implements causal mask + query_length, key_length = query.size(-2), key.size(-2) + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].bool() + mask_value = torch.finfo(attn_weights.dtype).min + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. + # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device) + attn_weights = torch.where(causal_mask, attn_weights, mask_value) + + if attention_mask is not None: + # Apply the attention mask + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op if otherwise + if attn_weights.dtype != torch.float32: + raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32") + attn_weights = attn_weights.type(value.dtype) + attn_weights = self.attn_dropout(attn_weights) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + + return attn_output, attn_weights + + def _split_heads(self, tensor, num_heads, attn_head_size): + """ + Splits hidden_size dim into attn_head_size and num_heads + """ + new_shape = tensor.size()[:-1] + (num_heads, attn_head_size) + tensor = tensor.view(new_shape) + return tensor.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) + + def _merge_heads(self, tensor, num_heads, attn_head_size): + """ + Merges attn_head_size dim and num_attn_heads dim into hidden_size + """ + tensor = tensor.permute(0, 2, 1, 3).contiguous() + new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,) + return tensor.view(new_shape) + + def forward( + self, + hidden_states: Optional[Tuple[torch.FloatTensor]], + layer_past: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]: + if encoder_hidden_states is not None: + if not hasattr(self, "q_attn"): + raise ValueError( + "If class is used as cross attention, the weights `q_attn` have to be defined. " + "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`." + ) + + query = self.q_attn(hidden_states) + key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2) + attention_mask = encoder_attention_mask + else: + query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2) + + query = self._split_heads(query, self.num_heads, self.head_dim) + key = self._split_heads(key, self.num_heads, self.head_dim) + value = self._split_heads(value, self.num_heads, self.head_dim) + + if layer_past is not None: + past_key, past_value = layer_past + key = torch.cat((past_key, key), dim=-2) + value = torch.cat((past_value, value), dim=-2) + + if use_cache is True: + present = (key, value) + else: + present = None + + if self.reorder_and_upcast_attn: + attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask) + else: + attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + + attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim) + attn_output = self.c_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + + outputs = (attn_output, present) + if output_attentions: + outputs += (attn_weights,) + + return outputs # a, present, (attentions) + + +class GPT2MLP(nn.Module): + def __init__(self, intermediate_size, config): + super().__init__() + embed_dim = config.hidden_size + self.c_fc = Conv1D(intermediate_size, embed_dim) + self.c_proj = Conv1D(embed_dim, intermediate_size) + self.act = ACT2FN[config.activation_function] + self.dropout = nn.Dropout(config.resid_pdrop) + + def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor: + hidden_states = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.c_proj(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class GPT2Block(nn.Module): + def __init__(self, config, layer_idx=None): + super().__init__() + hidden_size = config.hidden_size + inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size + + self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + self.attn = GPT2Attention(config, layer_idx=layer_idx) + self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + + if config.add_cross_attention: + self.crossattention = GPT2Attention(config, is_cross_attention=True, layer_idx=layer_idx) + self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + + self.mlp = GPT2MLP(inner_dim, config) + + def forward( + self, + hidden_states: Optional[Tuple[torch.FloatTensor]], + layer_past: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]: + residual = hidden_states + hidden_states = self.ln_1(hidden_states) + attn_outputs = self.attn( + hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + attn_output = attn_outputs[0] # output_attn: a, present, (attentions) + outputs = attn_outputs[1:] + # residual connection + hidden_states = attn_output + residual + + if encoder_hidden_states is not None: + # add one self-attention block for cross-attention + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with " + "cross-attention layers by setting `config.add_cross_attention=True`" + ) + residual = hidden_states + hidden_states = self.ln_cross_attn(hidden_states) + cross_attn_outputs = self.crossattention( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + attn_output = cross_attn_outputs[0] + # residual connection + hidden_states = residual + attn_output + outputs = outputs + cross_attn_outputs[2:] # add cross attentions if we output attention weights + + residual = hidden_states + hidden_states = self.ln_2(hidden_states) + feed_forward_hidden_states = self.mlp(hidden_states) + # residual connection + hidden_states = residual + feed_forward_hidden_states + + if use_cache: + outputs = (hidden_states,) + outputs + else: + outputs = (hidden_states,) + outputs[1:] + + return outputs # hidden_states, present, (attentions, cross_attentions) + + +class GPT2PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = GPT2Config + load_tf_weights = load_tf_weights_in_gpt2 + base_model_prefix = "transformer" + is_parallelizable = True + supports_gradient_checkpointing = True + _no_split_modules = ["GPT2Block"] + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, (nn.Linear, Conv1D)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. + # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + for name, p in module.named_parameters(): + if name == "c_proj.weight": + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer))) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, GPT2Model): + module.gradient_checkpointing = value + + +@dataclass +class GPT2DoubleHeadsModelOutput(ModelOutput): + """ + Base class for outputs of models predicting if two sentences are consecutive or not. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss. + mc_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mc_labels` is provided): + Multiple choice classification loss. + logits (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + mc_logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`): + Prediction scores of the multiple choice classification head (scores for each choice before SoftMax). + past_key_values (`Tuple[Tuple[torch.Tensor]]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of length `config.n_layers`, containing tuples of tensors of shape `(batch_size, num_heads, + sequence_length, embed_size_per_head)`). + + Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + GPT2Attentions weights after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + mc_loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + mc_logits: torch.FloatTensor = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +GPT2_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`GPT2Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +GPT2_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else + `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input + sequence tokens in the vocabulary. + + If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + past_key_values (`Tuple[Tuple[torch.Tensor]]` of length `config.n_layers`): + Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see + `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have + their past given to this model should not be passed as `input_ids` as they have already been computed. + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + If `past_key_values` is used, `attention_mask` needs to contain the masking strategy that was used for + `past_key_values`. In other words, the `attention_mask` always has to have the length: + `len(past_key_values) + len(input_ids)` + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + + If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see + `past_key_values`). + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" +PARALLELIZE_DOCSTRING = r""" + This is an experimental feature and is a subject to change at a moment's notice. + + Uses a device map to distribute attention modules of the model across several devices. If no device map is given, + it will evenly distribute blocks across all devices. + + Args: + device_map (`Dict[int, list]`, optional, defaults to None): + A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always + automatically mapped to the first device (for esoteric reasons). That means that the first device should + have fewer attention modules mapped to it than other devices. For reference, the gpt2 models have the + following number of attention modules: + + - gpt2: 12 + - gpt2-medium: 24 + - gpt2-large: 36 + - gpt2-xl: 48 + + Example: + + ```python + # Here is an example of a device map on a machine with 4 GPUs using gpt2-xl, which has a total of 48 attention modules: + model = GPT2LMHeadModel.from_pretrained("gpt2-xl") + device_map = { + 0: [0, 1, 2, 3, 4, 5, 6, 7, 8], + 1: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], + 2: [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34], + 3: [35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47], + } + model.parallelize(device_map) + ``` +""" +DEPARALLELIZE_DOCSTRING = r""" + Moves the model to cpu from a model parallel state. + + Example: + + ```python + # On a 4 GPU machine with gpt2-large: + model = GPT2LMHeadModel.from_pretrained("gpt2-large") + device_map = { + 0: [0, 1, 2, 3, 4, 5, 6, 7], + 1: [8, 9, 10, 11, 12, 13, 14, 15], + 2: [16, 17, 18, 19, 20, 21, 22, 23], + 3: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], + } + model.parallelize(device_map) # Splits the model across several devices + model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache() + ``` +""" + + +@add_start_docstrings( + "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.", + GPT2_START_DOCSTRING, +) +class GPT2Model(GPT2PreTrainedModel): + _keys_to_ignore_on_load_missing = ["attn.masked_bias"] + + def __init__(self, config): + super().__init__(config) + + self.embed_dim = config.hidden_size + + self.wte = nn.Embedding(config.vocab_size, self.embed_dim) + self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) + + self.drop = nn.Dropout(config.embd_pdrop) + self.h = nn.ModuleList([GPT2Block(config, layer_idx=i) for i in range(config.num_hidden_layers)]) + self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) + + # Model parallel + self.model_parallel = False + self.device_map = None + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + # Check validity of device_map + self.device_map = ( + get_device_map(len(self.h), range(torch.cuda.device_count())) if device_map is None else device_map + ) + assert_device_map(self.device_map, len(self.h)) + self.model_parallel = True + self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys())) + self.last_device = "cuda:" + str(max(self.device_map.keys())) + self.wte = self.wte.to(self.first_device) + self.wpe = self.wpe.to(self.first_device) + # Load onto devices + for k, v in self.device_map.items(): + for block in v: + cuda_device = "cuda:" + str(k) + self.h[block] = self.h[block].to(cuda_device) + # ln_f to last + self.ln_f = self.ln_f.to(self.last_device) + + @add_start_docstrings(DEPARALLELIZE_DOCSTRING) + def deparallelize(self): + self.model_parallel = False + self.device_map = None + self.first_device = "cpu" + self.last_device = "cpu" + self.wte = self.wte.to("cpu") + self.wpe = self.wpe.to("cpu") + for index in range(len(self.h)): + self.h[index] = self.h[index].to("cpu") + self.ln_f = self.ln_f.to("cpu") + torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.wte + + def set_input_embeddings(self, new_embeddings): + self.wte = new_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + for layer, heads in heads_to_prune.items(): + self.h[layer].attn.prune_heads(heads) + + @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPastAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + batch_size = input_ids.shape[0] + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size = inputs_embeds.shape[0] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if token_type_ids is not None: + token_type_ids = token_type_ids.view(-1, input_shape[-1]) + if position_ids is not None: + position_ids = position_ids.view(-1, input_shape[-1]) + + if past_key_values is None: + past_length = 0 + past_key_values = tuple([None] * len(self.h)) + else: + past_length = past_key_values[0][0].size(-2) + if position_ids is None: + position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) + position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) + + # GPT2Attention mask. + if attention_mask is not None: + if batch_size <= 0: + raise ValueError("batch_size has to be defined and > 0") + attention_mask = attention_mask.view(batch_size, -1) + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask = attention_mask[:, None, None, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and the dtype's smallest value for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility + attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.add_cross_attention and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # head_mask has shape n_layer x batch x n_heads x N x N + head_mask = self.get_head_mask(head_mask, self.config.n_layer) + + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) + position_embeds = self.wpe(position_ids) + hidden_states = inputs_embeds + position_embeds + + if token_type_ids is not None: + token_type_embeds = self.wte(token_type_ids) + hidden_states = hidden_states + token_type_embeds + + hidden_states = self.drop(hidden_states) + + output_shape = input_shape + (hidden_states.size(-1),) + + presents = () if use_cache else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + all_hidden_states = () if output_hidden_states else None + for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): + + # Model parallel + if self.model_parallel: + torch.cuda.set_device(hidden_states.device) + # Ensure layer_past is on same device as hidden_states (might not be correct) + if layer_past is not None: + layer_past = tuple(past_state.to(hidden_states.device) for past_state in layer_past) + # Ensure that attention_mask is always on the same device as hidden_states + if attention_mask is not None: + attention_mask = attention_mask.to(hidden_states.device) + if isinstance(head_mask, torch.Tensor): + head_mask = head_mask.to(hidden_states.device) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, use_cache, output_attentions) + + return custom_forward + + outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), + hidden_states, + None, + attention_mask, + head_mask[i], + encoder_hidden_states, + encoder_attention_mask, + ) + else: + outputs = block( + hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + head_mask=head_mask[i], + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + + hidden_states = outputs[0] + if use_cache is True: + presents = presents + (outputs[1],) + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (outputs[3 if use_cache else 2],) + + # Model Parallel: If it's the last layer for that device, put things on the next device + if self.model_parallel: + for k, v in self.device_map.items(): + if i == v[-1] and "cuda:" + str(k) != self.last_device: + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.ln_f(hidden_states) + + hidden_states = hidden_states.view(output_shape) + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, presents, all_hidden_states, all_self_attentions, all_cross_attentions] + if v is not None + ) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + """ + The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input + embeddings). + """, + GPT2_START_DOCSTRING, +) +class GPT2LMHeadModel(GPT2PreTrainedModel): + _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.transformer = GPT2Model(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + + # Model parallel + self.model_parallel = False + self.device_map = None + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + self.device_map = ( + get_device_map(len(self.transformer.h), range(torch.cuda.device_count())) + if device_map is None + else device_map + ) + assert_device_map(self.device_map, len(self.transformer.h)) + self.transformer.parallelize(self.device_map) + self.lm_head = self.lm_head.to(self.transformer.first_device) + self.model_parallel = True + + @add_start_docstrings(DEPARALLELIZE_DOCSTRING) + def deparallelize(self): + self.transformer.deparallelize() + self.transformer = self.transformer.to("cpu") + self.lm_head = self.lm_head.to("cpu") + self.model_parallel = False + torch.cuda.empty_cache() + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): + token_type_ids = kwargs.get("token_type_ids", None) + # only last token for inputs_ids if past is defined in kwargs + if past_key_values: + input_ids = input_ids[:, -1].unsqueeze(-1) + if token_type_ids is not None: + token_type_ids = token_type_ids[:, -1].unsqueeze(-1) + + attention_mask = kwargs.get("attention_mask", None) + position_ids = kwargs.get("position_ids", None) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -1].unsqueeze(-1) + else: + position_ids = None + return { + "input_ids": input_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "position_ids": position_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + } + + @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.transformer.first_device) + hidden_states = hidden_states.to(self.lm_head.weight.device) + + lm_logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + cross_attentions=transformer_outputs.cross_attentions, + ) + + @staticmethod + def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]: + """ + This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or + [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct + beam_idx at every generation step. + """ + return tuple( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past) + for layer_past in past + ) + + +@add_start_docstrings( + """ +The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for +RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the +input embeddings, the classification head takes as input the input of a specified classification token index in the +input sequence). +""", + GPT2_START_DOCSTRING, +) +class GPT2DoubleHeadsModel(GPT2PreTrainedModel): + _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + config.num_labels = 1 + self.transformer = GPT2Model(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + self.multiple_choice_head = SequenceSummary(config) + + # Model parallel + self.model_parallel = False + self.device_map = None + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + self.device_map = ( + get_device_map(len(self.transformer.h), range(torch.cuda.device_count())) + if device_map is None + else device_map + ) + assert_device_map(self.device_map, len(self.transformer.h)) + self.transformer.parallelize(self.device_map) + self.lm_head = self.lm_head.to(self.transformer.first_device) + self.multiple_choice_head = self.multiple_choice_head.to(self.transformer.first_device) + self.model_parallel = True + + @add_start_docstrings(DEPARALLELIZE_DOCSTRING) + def deparallelize(self): + self.transformer.deparallelize() + self.transformer = self.transformer.to("cpu") + self.lm_head = self.lm_head.to("cpu") + self.multiple_choice_head = self.multiple_choice_head.to("cpu") + self.model_parallel = False + torch.cuda.empty_cache() + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): + token_type_ids = kwargs.get("token_type_ids", None) + # only last token for inputs_ids if past is defined in kwargs + if past_key_values: + input_ids = input_ids[:, -1].unsqueeze(-1) + if token_type_ids is not None: + token_type_ids = token_type_ids[:, -1].unsqueeze(-1) + + attention_mask = kwargs.get("attention_mask", None) + position_ids = kwargs.get("position_ids", None) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -1].unsqueeze(-1) + else: + position_ids = None + + return { + "input_ids": input_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "position_ids": position_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + } + + @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=GPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + mc_token_ids: Optional[torch.LongTensor] = None, + labels: Optional[torch.LongTensor] = None, + mc_labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Tuple, GPT2DoubleHeadsModelOutput]: + r""" + mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input): + Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) - + 1]`. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to + `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]` + mc_labels (`torch.LongTensor` of shape `(batch_size)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]` + where *num_choices* is the size of the second dimension of the input tensors. (see *input_ids* above) + + Return: + + Example: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, GPT2DoubleHeadsModel + + >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") + >>> model = GPT2DoubleHeadsModel.from_pretrained("gpt2") + + >>> # Add a [CLS] to the vocabulary (we should train it also!) + >>> num_added_tokens = tokenizer.add_special_tokens({"cls_token": "[CLS]"}) + >>> # Update the model embeddings with the new vocabulary size + >>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) + + >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] + >>> encoded_choices = [tokenizer.encode(s) for s in choices] + >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices] + + >>> input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2 + >>> mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1 + + >>> outputs = model(input_ids, mc_token_ids=mc_token_ids) + >>> lm_logits = outputs.logits + >>> mc_logits = outputs.mc_logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.transformer.first_device) + hidden_states = hidden_states.to(self.lm_head.weight.device) + + lm_logits = self.lm_head(hidden_states) + mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1) + + mc_loss = None + if mc_labels is not None: + loss_fct = CrossEntropyLoss() + mc_loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)) + lm_loss = None + if labels is not None: + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + if not return_dict: + output = (lm_logits, mc_logits) + transformer_outputs[1:] + if mc_loss is not None: + output = (mc_loss,) + output + return ((lm_loss,) + output) if lm_loss is not None else output + + return GPT2DoubleHeadsModelOutput( + loss=lm_loss, + mc_loss=mc_loss, + logits=lm_logits, + mc_logits=mc_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + @staticmethod + def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]: + """ + This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or + [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct + beam_idx at every generation step. + """ + return tuple( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past) + for layer_past in past + ) + + +@add_start_docstrings( + """ + The GPT2 Model transformer with a sequence classification head on top (linear layer). + + [`GPT2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-1) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + GPT2_START_DOCSTRING, +) +class GPT2ForSequenceClassification(GPT2PreTrainedModel): + _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.transformer = GPT2Model(config) + self.score = nn.Linear(config.n_embd, self.num_labels, bias=False) + + # Model parallel + self.model_parallel = False + self.device_map = None + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint="microsoft/DialogRPT-updown", + output_type=SequenceClassifierOutputWithPast, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size, sequence_length = input_ids.shape[:2] + else: + batch_size, sequence_length = inputs_embeds.shape[:2] + + assert ( + self.config.pad_token_id is not None or batch_size == 1 + ), "Cannot handle batch sizes > 1 if no padding token is defined." + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device) + else: + sequence_lengths = -1 + logger.warning( + f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " + "unexpected if using padding tokens in conjunction with `inputs_embeds.`" + ) + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@add_start_docstrings( + """ + GPT2 Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + GPT2_START_DOCSTRING, +) +class GPT2ForTokenClassification(GPT2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.transformer = GPT2Model(config) + if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None: + classifier_dropout = config.classifier_dropout + elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Model parallel + self.model_parallel = False + self.device_map = None + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) + # fmt: off + @add_code_sample_docstrings( + checkpoint="brad1141/gpt2-finetuned-comp2", + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_loss=0.25, + expected_output=["Lead", "Lead", "Lead", "Position", "Lead", "Lead", "Lead", "Lead", "Lead", "Lead", "Lead", "Lead"], + ) + # fmt: on + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + hidden_states = self.dropout(hidden_states) + logits = self.classifier(hidden_states) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + transformer_outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) From 5145458c28d27411afb44b60dd34614dc316ef20 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 7 Feb 2023 10:05:07 -0500 Subject: [PATCH 02/10] Changes --- src/transformers/__init__.py | 24 ++++ src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + src/transformers/models/auto/modeling_auto.py | 6 + .../models/auto/tokenization_auto.py | 1 + .../models/gpt_bigcode/__init__.py | 72 +++++++++++ .../gpt_bigcode/configuration_gpt_bigcode.py | 46 ++++--- .../gpt_bigcode/modeling_gpt_bigcode.py | 122 +++++++++--------- 8 files changed, 188 insertions(+), 87 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index b65781ab75..6d8be0ae26 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -274,6 +274,7 @@ "models.git": ["GIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "GitConfig", "GitProcessor", "GitVisionConfig"], "models.glpn": ["GLPN_PRETRAINED_CONFIG_ARCHIVE_MAP", "GLPNConfig"], "models.gpt2": ["GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPT2Config", "GPT2Tokenizer"], + "models.gpt_bigcode": ["GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTBigCodeConfig", "GPTBigCodeTokenizer"], "models.gpt_neo": ["GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoConfig"], "models.gpt_neox": ["GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoXConfig"], "models.gpt_neox_japanese": ["GPT_NEOX_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoXJapaneseConfig"], @@ -1531,6 +1532,18 @@ "load_tf_weights_in_gpt2", ] ) + _import_structure["models.gpt_bigcode"].extend( + [ + "GPT_BIGCODE_PRETRAINED_MODEL_ARCHIVE_LIST", + "GPTBigCodeDoubleHeadsModel", + "GPTBigCodeForSequenceClassification", + "GPTBigCodeForTokenClassification", + "GPTBigCodeLMHeadModel", + "GPTBigCodeModel", + "GPTBigCodePreTrainedModel", + "load_tf_weights_in_gpt_bigcode", + ] + ) _import_structure["models.gpt_neo"].extend( [ "GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -3714,6 +3727,7 @@ from .models.git import GIT_PRETRAINED_CONFIG_ARCHIVE_MAP, GitConfig, GitProcessor, GitVisionConfig from .models.glpn import GLPN_PRETRAINED_CONFIG_ARCHIVE_MAP, GLPNConfig from .models.gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config, GPT2Tokenizer + from .models.gpt_bigcode import GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTBigCodeConfig from .models.gpt_neo import GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoConfig from .models.gpt_neox import GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoXConfig from .models.gpt_neox_japanese import GPT_NEOX_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoXJapaneseConfig @@ -4781,6 +4795,16 @@ GPT2PreTrainedModel, load_tf_weights_in_gpt2, ) + from .models.gpt_bigcode import ( + GPT_BIGCODE_PRETRAINED_MODEL_ARCHIVE_LIST, + GPTBigCodeDoubleHeadsModel, + GPTBigCodeForSequenceClassification, + GPTBigCodeForTokenClassification, + GPTBigCodeLMHeadModel, + GPTBigCodeModel, + GPTBigCodePreTrainedModel, + load_tf_weights_in_gpt_bigcode, + ) from .models.gpt_neo import ( GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST, GPTNeoForCausalLM, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 9eade475fa..820f90bc36 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -79,6 +79,7 @@ git, glpn, gpt2, + gpt_bigcode, gpt_neo, gpt_neox, gpt_neox_japanese, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 1a77eb0153..70b777f414 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -84,6 +84,7 @@ ("glpn", "GLPNConfig"), ("gpt-sw3", "GPT2Config"), ("gpt2", "GPT2Config"), + ("gpt_bigcode", "GPTBigCodeConfig"), ("gpt_neo", "GPTNeoConfig"), ("gpt_neox", "GPTNeoXConfig"), ("gpt_neox_japanese", "GPTNeoXJapaneseConfig"), @@ -248,6 +249,7 @@ ("git", "GIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("glpn", "GLPN_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("gpt2", "GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("gpt_bigcode", "GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("gpt_neo", "GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("gpt_neox", "GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("gpt_neox_japanese", "GPT_NEOX_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -412,6 +414,7 @@ ("glpn", "GLPN"), ("gpt-sw3", "GPT-Sw3"), ("gpt2", "OpenAI GPT-2"), + ("gpt_bigcode", "GPT BigCode"), ("gpt_neo", "GPT Neo"), ("gpt_neox", "GPT NeoX"), ("gpt_neox_japanese", "GPT NeoX Japanese"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 2fe18122aa..6c5c18fe06 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -83,6 +83,7 @@ ("glpn", "GLPNModel"), ("gpt-sw3", "GPT2Model"), ("gpt2", "GPT2Model"), + ("gpt_bigcode", "GPTBigCodeModel"), ("gpt_neo", "GPTNeoModel"), ("gpt_neox", "GPTNeoXModel"), ("gpt_neox_japanese", "GPTNeoXJapaneseModel"), @@ -210,6 +211,7 @@ ("funnel", "FunnelForPreTraining"), ("gpt-sw3", "GPT2LMHeadModel"), ("gpt2", "GPT2LMHeadModel"), + ("gpt_bigcode", "GPTBigCodeLMHeadModel"), ("ibert", "IBertForMaskedLM"), ("layoutlm", "LayoutLMForMaskedLM"), ("longformer", "LongformerForMaskedLM"), @@ -274,6 +276,7 @@ ("git", "GitForCausalLM"), ("gpt-sw3", "GPT2LMHeadModel"), ("gpt2", "GPT2LMHeadModel"), + ("gpt_bigcode", "GPTBigCodeLMHeadModel"), ("gpt_neo", "GPTNeoForCausalLM"), ("gpt_neox", "GPTNeoXForCausalLM"), ("gpt_neox_japanese", "GPTNeoXJapaneseForCausalLM"), @@ -340,6 +343,7 @@ ("git", "GitForCausalLM"), ("gpt-sw3", "GPT2LMHeadModel"), ("gpt2", "GPT2LMHeadModel"), + ("gpt_bigcode", "GPTBigCodeLMHeadModel"), ("gpt_neo", "GPTNeoForCausalLM"), ("gpt_neox", "GPTNeoXForCausalLM"), ("gpt_neox_japanese", "GPTNeoXJapaneseForCausalLM"), @@ -618,6 +622,7 @@ ("funnel", "FunnelForSequenceClassification"), ("gpt-sw3", "GPT2ForSequenceClassification"), ("gpt2", "GPT2ForSequenceClassification"), + ("gpt_bigcode", "GPTBigCodeForSequenceClassification"), ("gpt_neo", "GPTNeoForSequenceClassification"), ("gptj", "GPTJForSequenceClassification"), ("ibert", "IBertForSequenceClassification"), @@ -757,6 +762,7 @@ ("funnel", "FunnelForTokenClassification"), ("gpt-sw3", "GPT2ForTokenClassification"), ("gpt2", "GPT2ForTokenClassification"), + ("gpt_bigcode", "GPTBigCodeForTokenClassification"), ("ibert", "IBertForTokenClassification"), ("layoutlm", "LayoutLMForTokenClassification"), ("layoutlmv2", "LayoutLMv2ForTokenClassification"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 7073221d74..3a26d54d16 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -141,6 +141,7 @@ ("git", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)), ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), + ("gpt_bigcode", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("gpt_neox", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), ("gpt_neox_japanese", ("GPTNeoXJapaneseTokenizer", None)), diff --git a/src/transformers/models/gpt_bigcode/__init__.py b/src/transformers/models/gpt_bigcode/__init__.py index e69de29bb2..5b585a8908 100644 --- a/src/transformers/models/gpt_bigcode/__init__.py +++ b/src/transformers/models/gpt_bigcode/__init__.py @@ -0,0 +1,72 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, +) + + +_import_structure = { + "configuration_gpt_bigcode": ["GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTBigCodeConfig", "GPTBigCodeOnnxConfig"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_gpt_bigcode"] = [ + "GPT_BIGCODE_PRETRAINED_MODEL_ARCHIVE_LIST", + "GPTBigCodeDoubleHeadsModel", + "GPTBigCodeForSequenceClassification", + "GPTBigCodeForTokenClassification", + "GPTBigCodeLMHeadModel", + "GPTBigCodeModel", + "GPTBigCodePreTrainedModel", + "load_tf_weights_in_gpt_bigcode", + ] + +if TYPE_CHECKING: + from .configuration_gpt_bigcode import GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTBigCodeConfig, GPTBigCodeOnnxConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_gpt_bigcode import ( + GPT_BIGCODE_PRETRAINED_MODEL_ARCHIVE_LIST, + GPTBigCodeDoubleHeadsModel, + GPTBigCodeForSequenceClassification, + GPTBigCodeForTokenClassification, + GPTBigCodeLMHeadModel, + GPTBigCodeModel, + GPTBigCodePreTrainedModel, + load_tf_weights_in_gpt_bigcode, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py index fe9c711d73..8fcf554ded 100644 --- a/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py @@ -26,18 +26,15 @@ logger = logging.get_logger(__name__) -GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "gpt2": "https://huggingface.co/gpt2/resolve/main/config.json", - "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/config.json", - "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/config.json", - "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/config.json", - "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/config.json", +GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP = { + # TODO: Add support for santa models. } -class GPT2Config(PretrainedConfig): +class GPTBigCodeConfig(PretrainedConfig): """ - This is the configuration class to store the configuration of a [`GPT2Model`] or a [`TFGPT2Model`]. It is used to + # TODO: Update doc + This is the configuration class to store the configuration of a [`GPTBigCodeModel`] or a [`TFGPTBigCodeModel`]. It is used to instantiate a GPT-2 model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the GPT-2 [gpt2](https://huggingface.co/gpt2) architecture. @@ -49,7 +46,7 @@ class GPT2Config(PretrainedConfig): Args: vocab_size (`int`, *optional*, defaults to 50257): Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`GPT2Model`] or [`TFGPT2Model`]. + `inputs_ids` passed when calling [`GPTBigCodeModel`] or [`TFGPTBigCodeModel`]. n_positions (`int`, *optional*, defaults to 1024): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). @@ -74,8 +71,8 @@ class GPT2Config(PretrainedConfig): initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. summary_type (`string`, *optional*, defaults to `"cls_index"`): - Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and - [`TFGPT2DoubleHeadsModel`]. + Argument used when doing sequence summary, used in the models [`GPTBigCodeDoubleHeadsModel`] and + [`TFGPTBigCodeDoubleHeadsModel`]. Has to be one of the following options: @@ -85,23 +82,23 @@ class GPT2Config(PretrainedConfig): - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2). - `"attn"`: Not implemented now, use multi-head attention. summary_use_proj (`bool`, *optional*, defaults to `True`): - Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and - [`TFGPT2DoubleHeadsModel`]. + Argument used when doing sequence summary, used in the models [`GPTBigCodeDoubleHeadsModel`] and + [`TFGPTBigCodeDoubleHeadsModel`]. Whether or not to add a projection after the vector extraction. summary_activation (`str`, *optional*): Argument used when doing sequence summary. Used in for the multiple choice head in - [`GPT2DoubleHeadsModel`]. + [`GPTBigCodeDoubleHeadsModel`]. Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation. summary_proj_to_labels (`bool`, *optional*, defaults to `True`): - Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and - [`TFGPT2DoubleHeadsModel`]. + Argument used when doing sequence summary, used in the models [`GPTBigCodeDoubleHeadsModel`] and + [`TFGPTBigCodeDoubleHeadsModel`]. Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes. summary_first_dropout (`float`, *optional*, defaults to 0.1): - Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and - [`TFGPT2DoubleHeadsModel`]. + Argument used when doing sequence summary, used in the models [`GPTBigCodeDoubleHeadsModel`] and + [`TFGPTBigCodeDoubleHeadsModel`]. The dropout ratio to be used after the projection and activation. scale_attn_weights (`bool`, *optional*, defaults to `True`): @@ -117,19 +114,19 @@ class GPT2Config(PretrainedConfig): Example: ```python - >>> from transformers import GPT2Config, GPT2Model + >>> from transformers import GPTBigCodeConfig, GPTBigCodeModel - >>> # Initializing a GPT2 configuration - >>> configuration = GPT2Config() + >>> # Initializing a GPTBigCode configuration + >>> configuration = GPTBigCodeConfig() >>> # Initializing a model (with random weights) from the configuration - >>> model = GPT2Model(configuration) + >>> model = GPTBigCodeModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config ```""" - model_type = "gpt2" + model_type = "gpt_bigcode" keys_to_ignore_at_inference = ["past_key_values"] attribute_map = { "hidden_size": "n_embd", @@ -193,7 +190,8 @@ def __init__( super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) -class GPT2OnnxConfig(OnnxConfigWithPast): +class GPTBigCodeOnnxConfig(OnnxConfigWithPast): + # TODO: Onnx support? def __init__( self, config: PretrainedConfig, diff --git a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index 5fe33bbca5..8625e77759 100644 --- a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -44,26 +44,22 @@ replace_return_docstrings, ) from ...utils.model_parallel_utils import assert_device_map, get_device_map -from .configuration_gpt2 import GPT2Config +from .configuration_gpt_bigcode import GPTBigCodeConfig logger = logging.get_logger(__name__) -_CHECKPOINT_FOR_DOC = "gpt2" -_CONFIG_FOR_DOC = "GPT2Config" +_CHECKPOINT_FOR_DOC = "gpt_bigcode" +_CONFIG_FOR_DOC = "GPTBigCodeConfig" -GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "gpt2", - "gpt2-medium", - "gpt2-large", - "gpt2-xl", - "distilgpt2", - # See all GPT-2 models at https://huggingface.co/models?filter=gpt2 +GPT_BIGCODE_PRETRAINED_MODEL_ARCHIVE_LIST = [ + # TODO: Add support for santa models. ] -def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): +def load_tf_weights_in_gpt_bigcode(model, config, gpt_bigcode_checkpoint_path): """Load tf checkpoints in a pytorch model""" + # TODO: Update this. try: import re @@ -74,7 +70,7 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): "https://www.tensorflow.org/install/ for installation instructions." ) raise - tf_path = os.path.abspath(gpt2_checkpoint_path) + tf_path = os.path.abspath(gpt_bigcode_checkpoint_path) logger.info(f"Converting TensorFlow checkpoint from {tf_path}") # Load weights from TF model init_vars = tf.train.list_variables(tf_path) @@ -119,7 +115,7 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): return model -class GPT2Attention(nn.Module): +class GPTBigCodeAttention(nn.Module): def __init__(self, config, is_cross_attention=False, layer_idx=None): super().__init__() @@ -300,7 +296,7 @@ def forward( if not hasattr(self, "q_attn"): raise ValueError( "If class is used as cross attention, the weights `q_attn` have to be defined. " - "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`." + "Please make sure to instantiate class with `GPTBigCodeAttention(..., is_cross_attention=True)`." ) query = self.q_attn(hidden_states) @@ -339,7 +335,7 @@ def forward( return outputs # a, present, (attentions) -class GPT2MLP(nn.Module): +class GPTBigCodeMLP(nn.Module): def __init__(self, intermediate_size, config): super().__init__() embed_dim = config.hidden_size @@ -356,21 +352,21 @@ def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.Fl return hidden_states -class GPT2Block(nn.Module): +class GPTBigCodeBlock(nn.Module): def __init__(self, config, layer_idx=None): super().__init__() hidden_size = config.hidden_size inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.attn = GPT2Attention(config, layer_idx=layer_idx) + self.attn = GPTBigCodeAttention(config, layer_idx=layer_idx) self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) if config.add_cross_attention: - self.crossattention = GPT2Attention(config, is_cross_attention=True, layer_idx=layer_idx) + self.crossattention = GPTBigCodeAttention(config, is_cross_attention=True, layer_idx=layer_idx) self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.mlp = GPT2MLP(inner_dim, config) + self.mlp = GPTBigCodeMLP(inner_dim, config) def forward( self, @@ -434,18 +430,18 @@ def forward( return outputs # hidden_states, present, (attentions, cross_attentions) -class GPT2PreTrainedModel(PreTrainedModel): +class GPTBigCodePreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ - config_class = GPT2Config - load_tf_weights = load_tf_weights_in_gpt2 + config_class = GPTBigCodeConfig + load_tf_weights = load_tf_weights_in_gpt_bigcode base_model_prefix = "transformer" is_parallelizable = True supports_gradient_checkpointing = True - _no_split_modules = ["GPT2Block"] + _no_split_modules = ["GPTBigCodeBlock"] def __init__(self, *inputs, **kwargs): super().__init__(*inputs, **kwargs) @@ -478,12 +474,12 @@ def _init_weights(self, module): p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer))) def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, GPT2Model): + if isinstance(module, GPTBigCodeModel): module.gradient_checkpointing = value @dataclass -class GPT2DoubleHeadsModelOutput(ModelOutput): +class GPTBigCodeDoubleHeadsModelOutput(ModelOutput): """ Base class for outputs of models predicting if two sentences are consecutive or not. @@ -511,7 +507,7 @@ class GPT2DoubleHeadsModelOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. - GPT2Attentions weights after the attention softmax, used to compute the weighted average in the + GPTBigCodeAttentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ @@ -524,7 +520,7 @@ class GPT2DoubleHeadsModelOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None -GPT2_START_DOCSTRING = r""" +GPTBigCode_START_DOCSTRING = r""" This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads @@ -535,12 +531,12 @@ class GPT2DoubleHeadsModelOutput(ModelOutput): and behavior. Parameters: - config ([`GPT2Config`]): Model configuration class with all the parameters of the model. + config ([`GPTBigCodeConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ -GPT2_INPUTS_DOCSTRING = r""" +GPTBigCode_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): `input_ids_length` = `sequence_length` if `past_key_values` is `None` else @@ -629,7 +625,7 @@ class GPT2DoubleHeadsModelOutput(ModelOutput): ```python # Here is an example of a device map on a machine with 4 GPUs using gpt2-xl, which has a total of 48 attention modules: - model = GPT2LMHeadModel.from_pretrained("gpt2-xl") + model = GPTBigCodeLMHeadModel.from_pretrained("gpt2-xl") device_map = { 0: [0, 1, 2, 3, 4, 5, 6, 7, 8], 1: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], @@ -646,7 +642,7 @@ class GPT2DoubleHeadsModelOutput(ModelOutput): ```python # On a 4 GPU machine with gpt2-large: - model = GPT2LMHeadModel.from_pretrained("gpt2-large") + model = GPTBigCodeLMHeadModel.from_pretrained("gpt2-large") device_map = { 0: [0, 1, 2, 3, 4, 5, 6, 7], 1: [8, 9, 10, 11, 12, 13, 14, 15], @@ -660,10 +656,10 @@ class GPT2DoubleHeadsModelOutput(ModelOutput): @add_start_docstrings( - "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.", - GPT2_START_DOCSTRING, + "The bare GPTBigCode Model transformer outputting raw hidden-states without any specific head on top.", + GPTBigCode_START_DOCSTRING, ) -class GPT2Model(GPT2PreTrainedModel): +class GPTBigCodeModel(GPTBigCodePreTrainedModel): _keys_to_ignore_on_load_missing = ["attn.masked_bias"] def __init__(self, config): @@ -675,7 +671,7 @@ def __init__(self, config): self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) self.drop = nn.Dropout(config.embd_pdrop) - self.h = nn.ModuleList([GPT2Block(config, layer_idx=i) for i in range(config.num_hidden_layers)]) + self.h = nn.ModuleList([GPTBigCodeBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)]) self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) # Model parallel @@ -732,7 +728,7 @@ def _prune_heads(self, heads_to_prune): for layer, heads in heads_to_prune.items(): self.h[layer].attn.prune_heads(heads) - @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(GPTBigCode_INPUTS_DOCSTRING) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, output_type=BaseModelOutputWithPastAndCrossAttentions, @@ -789,7 +785,7 @@ def forward( position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) - # GPT2Attention mask. + # GPTBigCodeAttention mask. if attention_mask is not None: if batch_size <= 0: raise ValueError("batch_size has to be defined and > 0") @@ -935,17 +931,17 @@ def custom_forward(*inputs): @add_start_docstrings( """ - The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input + The GPTBigCode Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, - GPT2_START_DOCSTRING, + GPTBigCode_START_DOCSTRING, ) -class GPT2LMHeadModel(GPT2PreTrainedModel): +class GPTBigCodeLMHeadModel(GPTBigCodePreTrainedModel): _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"] def __init__(self, config): super().__init__(config) - self.transformer = GPT2Model(config) + self.transformer = GPTBigCodeModel(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) # Model parallel @@ -1009,7 +1005,7 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg "token_type_ids": token_type_ids, } - @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(GPTBigCode_INPUTS_DOCSTRING) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, output_type=CausalLMOutputWithCrossAttentions, @@ -1101,20 +1097,20 @@ def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> @add_start_docstrings( """ -The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for +The GPTBigCode Model transformer with a language modeling and a multiple-choice classification head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the input embeddings, the classification head takes as input the input of a specified classification token index in the input sequence). """, - GPT2_START_DOCSTRING, + GPTBigCode_START_DOCSTRING, ) -class GPT2DoubleHeadsModel(GPT2PreTrainedModel): +class GPTBigCodeDoubleHeadsModel(GPTBigCodePreTrainedModel): _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"] def __init__(self, config): super().__init__(config) config.num_labels = 1 - self.transformer = GPT2Model(config) + self.transformer = GPTBigCodeModel(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) self.multiple_choice_head = SequenceSummary(config) @@ -1182,8 +1178,8 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg "token_type_ids": token_type_ids, } - @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=GPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC) + @add_start_docstrings_to_model_forward(GPTBigCode_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=GPTBigCodeDoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -1201,7 +1197,7 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple, GPT2DoubleHeadsModelOutput]: + ) -> Union[Tuple, GPTBigCodeDoubleHeadsModelOutput]: r""" mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input): Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) - @@ -1220,10 +1216,10 @@ def forward( ```python >>> import torch - >>> from transformers import AutoTokenizer, GPT2DoubleHeadsModel + >>> from transformers import AutoTokenizer, GPTBigCodeDoubleHeadsModel >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") - >>> model = GPT2DoubleHeadsModel.from_pretrained("gpt2") + >>> model = GPTBigCodeDoubleHeadsModel.from_pretrained("gpt2") >>> # Add a [CLS] to the vocabulary (we should train it also!) >>> num_added_tokens = tokenizer.add_special_tokens({"cls_token": "[CLS]"}) @@ -1284,7 +1280,7 @@ def forward( output = (mc_loss,) + output return ((lm_loss,) + output) if lm_loss is not None else output - return GPT2DoubleHeadsModelOutput( + return GPTBigCodeDoubleHeadsModelOutput( loss=lm_loss, mc_loss=mc_loss, logits=lm_logits, @@ -1309,9 +1305,9 @@ def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> @add_start_docstrings( """ - The GPT2 Model transformer with a sequence classification head on top (linear layer). + The GPTBigCode Model transformer with a sequence classification head on top (linear layer). - [`GPT2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models + [`GPTBigCodeForSequenceClassification`] uses the last token in order to do the classification, as other causal models (e.g. GPT-1) do. Since it does classification on the last token, it requires to know the position of the last token. If a @@ -1320,15 +1316,15 @@ def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in each row of the batch). """, - GPT2_START_DOCSTRING, + GPTBigCode_START_DOCSTRING, ) -class GPT2ForSequenceClassification(GPT2PreTrainedModel): +class GPTBigCodeForSequenceClassification(GPTBigCodePreTrainedModel): _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"] def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.transformer = GPT2Model(config) + self.transformer = GPTBigCodeModel(config) self.score = nn.Linear(config.n_embd, self.num_labels, bias=False) # Model parallel @@ -1338,7 +1334,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() - @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(GPTBigCode_INPUTS_DOCSTRING) @add_code_sample_docstrings( checkpoint="microsoft/DialogRPT-updown", output_type=SequenceClassifierOutputWithPast, @@ -1442,17 +1438,17 @@ def forward( @add_start_docstrings( """ - GPT2 Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + GPTBigCode Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - GPT2_START_DOCSTRING, + GPTBigCode_START_DOCSTRING, ) -class GPT2ForTokenClassification(GPT2PreTrainedModel): +class GPTBigCodeForTokenClassification(GPTBigCodePreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.transformer = GPT2Model(config) + self.transformer = GPTBigCodeModel(config) if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None: classifier_dropout = config.classifier_dropout elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None: @@ -1469,7 +1465,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() - @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(GPTBigCode_INPUTS_DOCSTRING) # fmt: off @add_code_sample_docstrings( checkpoint="brad1141/gpt2-finetuned-comp2", From d5dd3072c7114c3835c66f555df94ab3e1e41e68 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 7 Feb 2023 11:31:52 -0500 Subject: [PATCH 03/10] tests --- __init__.py | 0 tests/models/gpt_bigcode/__init__.py | 0 .../gpt_bigcode/test_modeling_gpt_bigcode.py | 801 ++++++++++++++++++ 3 files changed, 801 insertions(+) create mode 100644 __init__.py create mode 100644 tests/models/gpt_bigcode/__init__.py create mode 100644 tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/models/gpt_bigcode/__init__.py b/tests/models/gpt_bigcode/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py new file mode 100644 index 0000000000..52b4669896 --- /dev/null +++ b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py @@ -0,0 +1,801 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import datetime +import math +import unittest + +from transformers import GPTBigCodeConfig, is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + GPT_BIGCODE_PRETRAINED_MODEL_ARCHIVE_LIST, + GPTBigCodeDoubleHeadsModel, + GPTBigCodeForSequenceClassification, + GPTBigCodeForTokenClassification, + GPTBigCodeLMHeadModel, + GPTBigCodeModel, + GPTBigCodeTokenizer, + ) + + +class GPTBigCodeModelTester: + # TODO: Update the tests to use valid pretrained models. + def __init__( + self, + parent, + batch_size=14, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_input_mask=True, + use_labels=True, + use_mc_token_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_token_type_ids = use_token_type_ids + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.use_mc_token_ids = use_mc_token_ids + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = None + self.bos_token_id = vocab_size - 1 + self.eos_token_id = vocab_size - 1 + self.pad_token_id = vocab_size - 1 + + def get_large_model_config(self): + return GPTBigCodeConfig.from_pretrained("gpt2") + + def prepare_config_and_inputs( + self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False + ): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + mc_token_ids = None + if self.use_mc_token_ids: + mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config( + gradient_checkpointing=gradient_checkpointing, + scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, + reorder_and_upcast_attn=reorder_and_upcast_attn, + ) + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) + + def get_config( + self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False + ): + return GPTBigCodeConfig( + vocab_size=self.vocab_size, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + n_inner=self.intermediate_size, + activation_function=self.hidden_act, + resid_pdrop=self.hidden_dropout_prob, + attn_pdrop=self.attention_probs_dropout_prob, + n_positions=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + use_cache=True, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, + gradient_checkpointing=gradient_checkpointing, + scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, + reorder_and_upcast_attn=reorder_and_upcast_attn, + ) + + def get_pipeline_config(self): + config = self.get_config() + config.vocab_size = 300 + return config + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_gpt_bigcode_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = GPTBigCodeModel(config=config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(len(result.past_key_values), config.n_layer) + + def create_and_check_gpt_bigcode_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = GPTBigCodeModel(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True) + outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids) + outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False) + + self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) + self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) + + output, past = outputs.to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size) + + # append to next input_ids and token_type_ids + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1) + + output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"] + output_from_past = model(next_tokens, token_type_ids=next_token_types, past_key_values=past)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_gpt_bigcode_model_attention_mask_past( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args + ): + model = GPTBigCodeModel(config=config) + model.to(torch_device) + model.eval() + + # create attention mask + attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) + half_seq_length = self.seq_length // 2 + attn_mask[:, half_seq_length:] = 0 + + # first forward pass + output, past = model(input_ids, attention_mask=attn_mask).to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # change a random masked slice from input_ids + random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 + random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) + input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens + + # append to next input_ids and attn_mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + attn_mask = torch.cat( + [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], + dim=1, + ) + + # get two different outputs + output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_gpt_bigcode_model_past_large_inputs( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args + ): + model = GPTBigCodeModel(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=input_mask, use_cache=True) + + output, past = outputs.to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_token_types = ids_tensor([self.batch_size, 3], self.type_vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and token_type_ids + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, token_type_ids=next_token_type_ids, attention_mask=next_attention_mask + )["last_hidden_state"] + output_from_past = model( + next_tokens, token_type_ids=next_token_types, attention_mask=next_attention_mask, past_key_values=past + )["last_hidden_state"] + self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1]) + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = GPTBigCodeLMHeadModel(config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_forward_and_backwards( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args, gradient_checkpointing=False + ): + model = GPTBigCodeLMHeadModel(config) + model.to(torch_device) + if gradient_checkpointing: + model.gradient_checkpointing_enable() + + result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + result.loss.backward() + + def create_and_check_double_lm_head_model( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args + ): + model = GPTBigCodeDoubleHeadsModel(config) + model.to(torch_device) + model.eval() + + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + + inputs = { + "input_ids": multiple_choice_inputs_ids, + "mc_token_ids": mc_token_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + "labels": multiple_choice_inputs_ids, + } + + result = model(**inputs) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual( + result.logits.shape, (self.batch_size, self.num_choices, self.seq_length, self.vocab_size) + ) + self.parent.assertEqual(result.mc_logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_gpt_bigcode_for_sequence_classification( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args + ): + config.num_labels = self.num_labels + model = GPTBigCodeForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_gpt_bigcode_for_token_classification( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args + ): + config.num_labels = self.num_labels + model = GPTBigCodeForTokenClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_gpt_bigcode_weight_initialization(self, config, *args): + model = GPTBigCodeModel(config) + model_std = model.config.initializer_range / math.sqrt(2 * model.config.n_layer) + for key in model.state_dict().keys(): + if "c_proj" in key and "weight" in key: + self.parent.assertLessEqual(abs(torch.std(model.state_dict()[key]) - model_std), 0.001) + self.parent.assertLessEqual(abs(torch.mean(model.state_dict()[key]) - 0.0), 0.01) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "head_mask": head_mask, + } + + return config, inputs_dict + + +@require_torch +class GPTBigCodeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + # TODO: Update the tests to use valid pretrained models. + + all_model_classes = ( + (GPTBigCodeModel, GPTBigCodeLMHeadModel, GPTBigCodeDoubleHeadsModel, GPTBigCodeForSequenceClassification, GPTBigCodeForTokenClassification) + if is_torch_available() + else () + ) + all_generative_model_classes = (GPTBigCodeLMHeadModel, GPTBigCodeDoubleHeadsModel) if is_torch_available() else () + all_parallelizable_model_classes = (GPTBigCodeLMHeadModel, GPTBigCodeDoubleHeadsModel) if is_torch_available() else () + fx_compatible = True + test_missing_keys = False + test_model_parallel = True + + # special case for DoubleHeads model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ == "GPTBigCodeDoubleHeadsModel": + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.num_choices, self.model_tester.seq_length), + dtype=torch.long, + device=torch_device, + ) + inputs_dict["input_ids"] = inputs_dict["labels"] + inputs_dict["token_type_ids"] = inputs_dict["labels"] + inputs_dict["mc_token_ids"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.num_choices), + dtype=torch.long, + device=torch_device, + ) + inputs_dict["mc_labels"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + return inputs_dict + + def setUp(self): + self.model_tester = GPTBigCodeModelTester(self) + self.config_tester = ConfigTester(self, config_class=GPTBigCodeConfig, n_embd=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_gpt_bigcode_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_bigcode_model(*config_and_inputs) + + def test_gpt_bigcode_model_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_bigcode_model_past(*config_and_inputs) + + def test_gpt_bigcode_model_att_mask_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_bigcode_model_attention_mask_past(*config_and_inputs) + + def test_gpt_bigcode_model_past_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_bigcode_model_past_large_inputs(*config_and_inputs) + + def test_gpt_bigcode_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lm_head_model(*config_and_inputs) + + def test_gpt_bigcode_double_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs) + + def test_gpt_bigcode_sequence_classification_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_bigcode_for_sequence_classification(*config_and_inputs) + + def test_gpt_bigcode_token_classification_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_bigcode_for_token_classification(*config_and_inputs) + + def test_gpt_bigcode_gradient_checkpointing(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs, gradient_checkpointing=True) + + def test_gpt_bigcode_scale_attn_by_inverse_layer_idx(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs(scale_attn_by_inverse_layer_idx=True) + self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs) + + def test_gpt_bigcode_reorder_and_upcast_attn(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs(reorder_and_upcast_attn=True) + self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs) + + def test_gpt_bigcode_weight_initialization(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_bigcode_weight_initialization(*config_and_inputs) + + @slow + def test_batch_generation(self): + model = GPTBigCodeLMHeadModel.from_pretrained("gpt2") + model.to(torch_device) + tokenizer = GPTBigCodeTokenizer.from_pretrained("gpt2") + + tokenizer.padding_side = "left" + + # Define PAD Token = EOS Token = 50256 + tokenizer.pad_token = tokenizer.eos_token + model.config.pad_token_id = model.config.eos_token_id + + # use different length sentences to test batching + sentences = [ + "Hello, my dog is a little", + "Today, I", + ] + + inputs = tokenizer(sentences, return_tensors="pt", padding=True) + input_ids = inputs["input_ids"].to(torch_device) + token_type_ids = torch.cat( + [ + input_ids.new_full((input_ids.shape[0], input_ids.shape[1] - 1), 0), + input_ids.new_full((input_ids.shape[0], 1), 500), + ], + dim=-1, + ) + + outputs = model.generate( + input_ids=input_ids, + attention_mask=inputs["attention_mask"].to(torch_device), + ) + + outputs_tt = model.generate( + input_ids=input_ids, + attention_mask=inputs["attention_mask"].to(torch_device), + token_type_ids=token_type_ids, + ) + + inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device) + output_non_padded = model.generate(input_ids=inputs_non_padded) + + num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item() + inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device) + output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings) + + batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) + batch_out_sentence_tt = tokenizer.batch_decode(outputs_tt, skip_special_tokens=True) + non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) + padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) + + expected_output_sentence = [ + "Hello, my dog is a little bit of a mess. I'm not sure if he's going", + "Today, I'm going to be doing a lot of research on this. I", + ] + self.assertListEqual(expected_output_sentence, batch_out_sentence) + self.assertTrue(batch_out_sentence_tt != batch_out_sentence) # token_type_ids should change output + self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) + + @slow + def test_batch_generation_2heads(self): + model = GPTBigCodeDoubleHeadsModel.from_pretrained("gpt2") + model.to(torch_device) + tokenizer = GPTBigCodeTokenizer.from_pretrained("gpt2") + + tokenizer.padding_side = "left" + + # This tokenizer has no pad token, so we have to set it in some way + # Define PAD Token = EOS Token = 50256 + tokenizer.pad_token = tokenizer.eos_token + model.config.pad_token_id = model.config.eos_token_id + + # use different length sentences to test batching + sentences = [ + "Hello, my dog is a little", + "Today, I", + ] + + inputs = tokenizer(sentences, return_tensors="pt", padding=True) + input_ids = inputs["input_ids"].to(torch_device) + token_type_ids = torch.cat( + [ + input_ids.new_full((input_ids.shape[0], input_ids.shape[1] - 1), 0), + input_ids.new_full((input_ids.shape[0], 1), 500), + ], + dim=-1, + ) + + outputs = model.generate( + input_ids=input_ids, + attention_mask=inputs["attention_mask"].to(torch_device), + ) + + outputs_tt = model.generate( + input_ids=input_ids, + attention_mask=inputs["attention_mask"].to(torch_device), + token_type_ids=token_type_ids, + ) + + inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device) + output_non_padded = model.generate(input_ids=inputs_non_padded) + + num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item() + inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device) + output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings) + + batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) + batch_out_sentence_tt = tokenizer.batch_decode(outputs_tt, skip_special_tokens=True) + non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) + padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) + + expected_output_sentence = [ + "Hello, my dog is a little bit of a mess. I'm not sure if he's going", + "Today, I'm going to be doing a lot of research on this. I", + ] + self.assertListEqual(expected_output_sentence, batch_out_sentence) + self.assertTrue(batch_out_sentence_tt != batch_out_sentence) # token_type_ids should change output + self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) + + @slow + def test_model_from_pretrained(self): + for model_name in GPT_BIGCODE_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = GPTBigCodeModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +class GPTBigCodeModelLanguageGenerationTest(unittest.TestCase): + def _test_lm_generate_gpt_bigcode_helper( + self, + gradient_checkpointing=False, + reorder_and_upcast_attn=False, + scale_attn_by_inverse_layer_idx=False, + verify_outputs=True, + ): + model = GPTBigCodeLMHeadModel.from_pretrained( + "gpt2", + reorder_and_upcast_attn=reorder_and_upcast_attn, + scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, + ) + if gradient_checkpointing: + model.gradient_checkpointing_enable() + else: + model.gradient_checkpointing_disable() + model.to(torch_device) + + # The dog + input_ids = torch.tensor([[464, 3290]], dtype=torch.long, device=torch_device) + + # The dog was found in a field near the intersection of West and West Streets.\n\nThe dog + # fmt: off + expected_output_ids = [ + 464, 3290, 373, 1043, 287, 257, 2214, 1474, 262, 16246, 286, 2688, 290, 2688, 27262, 13, 198, 198, 464, 3290, + ] + # fmt: on + output_ids = model.generate(input_ids, do_sample=False) + if verify_outputs: + self.assertListEqual(output_ids[0].tolist(), expected_output_ids) + + @slow + def test_lm_generate_gpt_bigcode(self): + self._test_lm_generate_gpt_bigcode_helper() + + @slow + def test_lm_generate_gpt_bigcode_with_gradient_checkpointing(self): + self._test_lm_generate_gpt_bigcode_helper(gradient_checkpointing=True) + + @slow + def test_lm_generate_gpt_bigcode_with_reorder_and_upcast_attn(self): + self._test_lm_generate_gpt_bigcode_helper(reorder_and_upcast_attn=True) + + @slow + def test_lm_generate_gpt_bigcode_with_scale_attn_by_inverse_layer_idx(self): + self._test_lm_generate_gpt_bigcode_helper(scale_attn_by_inverse_layer_idx=True, verify_outputs=False) + + @slow + def test_gpt_bigcode_sample(self): + tokenizer = GPTBigCodeTokenizer.from_pretrained("gpt2") + model = GPTBigCodeLMHeadModel.from_pretrained("gpt2") + model.to(torch_device) + + torch.manual_seed(0) + tokenized = tokenizer("Today is a nice day and", return_tensors="pt", return_token_type_ids=True) + input_ids = tokenized.input_ids.to(torch_device) + output_ids = model.generate(input_ids, do_sample=True) + output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True) + + token_type_ids = tokenized.token_type_ids.to(torch_device) + output_seq = model.generate(input_ids=input_ids, do_sample=True, num_return_sequences=5) + output_seq_tt = model.generate( + input_ids=input_ids, token_type_ids=token_type_ids, do_sample=True, num_return_sequences=5 + ) + output_seq_strs = tokenizer.batch_decode(output_seq, skip_special_tokens=True) + output_seq_tt_strs = tokenizer.batch_decode(output_seq_tt, skip_special_tokens=True) + + EXPECTED_OUTPUT_STR = ( + "Today is a nice day and if you don't know anything about the state of play during your holiday" + ) + self.assertEqual(output_str, EXPECTED_OUTPUT_STR) + self.assertTrue( + all([output_seq_strs[idx] != output_seq_tt_strs[idx] for idx in range(len(output_seq_tt_strs))]) + ) # token_type_ids should change output + + @slow + def test_gpt_bigcode_sample_max_time(self): + tokenizer = GPTBigCodeTokenizer.from_pretrained("gpt2") + model = GPTBigCodeLMHeadModel.from_pretrained("gpt2") + model.to(torch_device) + + torch.manual_seed(0) + tokenized = tokenizer("Today is a nice day and", return_tensors="pt", return_token_type_ids=True) + input_ids = tokenized.input_ids.to(torch_device) + + MAX_TIME = 0.5 + + start = datetime.datetime.now() + model.generate(input_ids, do_sample=True, max_time=MAX_TIME, max_length=256) + duration = datetime.datetime.now() - start + self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) + self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) + + start = datetime.datetime.now() + model.generate(input_ids, do_sample=False, max_time=MAX_TIME, max_length=256) + duration = datetime.datetime.now() - start + self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) + self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) + + start = datetime.datetime.now() + model.generate(input_ids, do_sample=False, num_beams=2, max_time=MAX_TIME, max_length=256) + duration = datetime.datetime.now() - start + self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) + self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) + + start = datetime.datetime.now() + model.generate(input_ids, do_sample=True, num_beams=2, max_time=MAX_TIME, max_length=256) + duration = datetime.datetime.now() - start + self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) + self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) + + start = datetime.datetime.now() + model.generate(input_ids, do_sample=False, max_time=None, max_length=256) + duration = datetime.datetime.now() - start + self.assertGreater(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) + + @slow + def test_contrastive_search_gpt_bigcode(self): + article = ( + "DeepMind Technologies is a British artificial intelligence subsidiary of Alphabet Inc. and research " + "laboratory founded in 2010. DeepMind was acquired by Google in 2014. The company is based" + ) + + gpt_bigcode_tokenizer = GPTBigCodeTokenizer.from_pretrained("gpt2-large") + gpt_bigcode_model = GPTBigCodeLMHeadModel.from_pretrained("gpt2-large").to(torch_device) + input_ids = gpt_bigcode_tokenizer(article, return_tensors="pt").input_ids.to(torch_device) + + outputs = gpt_bigcode_model.generate(input_ids, penalty_alpha=0.6, top_k=4, max_length=256) + + generated_text = gpt_bigcode_tokenizer.batch_decode(outputs, skip_special_tokens=True) + + self.assertListEqual( + generated_text, + [ + "DeepMind Technologies is a British artificial intelligence subsidiary of Alphabet Inc. and research " + "laboratory founded in 2010. DeepMind was acquired by Google in 2014. The company is based in London, " + "United Kingdom\n\nGoogle has a lot of data on its users and uses it to improve its products, such as " + "Google Now, which helps users find the information they're looking for on the web. But the company " + "is not the only one to collect data on its users. Facebook, for example, has its own facial " + "recognition technology, as well as a database of millions of photos that it uses to personalize its " + "News Feed.\n\nFacebook's use of data is a hot topic in the tech industry, with privacy advocates " + "concerned about the company's ability to keep users' information private. In a blog post last " + 'year, Facebook CEO Mark Zuckerberg said his company would "do our best to be transparent about our ' + 'data use and how we use it."\n\n"We have made it clear that we do not sell or share your data with ' + 'third parties," Zuckerberg wrote. "If you have questions or concerns, please reach out to us at ' + 'privacy@facebook.com."\n\nGoogle declined to comment on the privacy implications of its use of data, ' + "but said in a statement to The Associated Press that" + ], + ) From 4fdf8c1e25022efd713e9c02d12e28174293f29e Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 7 Feb 2023 11:32:12 -0500 Subject: [PATCH 04/10] fix --- __init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 __init__.py diff --git a/__init__.py b/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 From c25f221fb3bcaf78afa3f1f7d2884297afd3484c Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 7 Feb 2023 12:08:41 -0500 Subject: [PATCH 05/10] fix --- tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py index 52b4669896..936d4c9f76 100644 --- a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py +++ b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py @@ -36,7 +36,7 @@ GPTBigCodeForTokenClassification, GPTBigCodeLMHeadModel, GPTBigCodeModel, - GPTBigCodeTokenizer, + GPT2Tokenizer, ) @@ -526,7 +526,7 @@ def test_gpt_bigcode_weight_initialization(self): def test_batch_generation(self): model = GPTBigCodeLMHeadModel.from_pretrained("gpt2") model.to(torch_device) - tokenizer = GPTBigCodeTokenizer.from_pretrained("gpt2") + tokenizer = GPT2Tokenizer.from_pretrained("gpt2") tokenizer.padding_side = "left" @@ -585,7 +585,7 @@ def test_batch_generation(self): def test_batch_generation_2heads(self): model = GPTBigCodeDoubleHeadsModel.from_pretrained("gpt2") model.to(torch_device) - tokenizer = GPTBigCodeTokenizer.from_pretrained("gpt2") + tokenizer = GPT2Tokenizer.from_pretrained("gpt2") tokenizer.padding_side = "left" @@ -699,7 +699,7 @@ def test_lm_generate_gpt_bigcode_with_scale_attn_by_inverse_layer_idx(self): @slow def test_gpt_bigcode_sample(self): - tokenizer = GPTBigCodeTokenizer.from_pretrained("gpt2") + tokenizer = GPT2Tokenizer.from_pretrained("gpt2") model = GPTBigCodeLMHeadModel.from_pretrained("gpt2") model.to(torch_device) @@ -773,7 +773,7 @@ def test_contrastive_search_gpt_bigcode(self): "laboratory founded in 2010. DeepMind was acquired by Google in 2014. The company is based" ) - gpt_bigcode_tokenizer = GPTBigCodeTokenizer.from_pretrained("gpt2-large") + gpt_bigcode_tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large") gpt_bigcode_model = GPTBigCodeLMHeadModel.from_pretrained("gpt2-large").to(torch_device) input_ids = gpt_bigcode_tokenizer(article, return_tensors="pt").input_ids.to(torch_device) From 2134a34d7e9b46db9d5379dcd4e4535f6a2c9170 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 7 Feb 2023 14:55:04 -0500 Subject: [PATCH 06/10] Megatron conversion --- .../models/megatron_gpt_bigcode/__init__.py | 17 + ...eckpoint_reshaping_and_interoperability.py | 900 ++++++++++++++++++ .../convert_megatron_gpt2_checkpoint.py | 364 +++++++ 3 files changed, 1281 insertions(+) create mode 100644 src/transformers/models/megatron_gpt_bigcode/__init__.py create mode 100644 src/transformers/models/megatron_gpt_bigcode/checkpoint_reshaping_and_interoperability.py create mode 100644 src/transformers/models/megatron_gpt_bigcode/convert_megatron_gpt2_checkpoint.py diff --git a/src/transformers/models/megatron_gpt_bigcode/__init__.py b/src/transformers/models/megatron_gpt_bigcode/__init__.py new file mode 100644 index 0000000000..8228eea536 --- /dev/null +++ b/src/transformers/models/megatron_gpt_bigcode/__init__.py @@ -0,0 +1,17 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/src/transformers/models/megatron_gpt_bigcode/checkpoint_reshaping_and_interoperability.py b/src/transformers/models/megatron_gpt_bigcode/checkpoint_reshaping_and_interoperability.py new file mode 100644 index 0000000000..f8fb1e7c8f --- /dev/null +++ b/src/transformers/models/megatron_gpt_bigcode/checkpoint_reshaping_and_interoperability.py @@ -0,0 +1,900 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import json +import os +import re +import sys +import types + +import torch + +from transformers import AutoTokenizer, GPT2Config +from transformers.modeling_utils import WEIGHTS_INDEX_NAME, WEIGHTS_NAME, shard_checkpoint + + +def add_checkpointing_args(parser): + parser.add_argument("--megatron-path", type=str, default=None, help="Base directory of Megatron repository") + parser.add_argument( + "--convert_checkpoint_from_megatron_to_transformers", + action="store_true", + help=( + "If True, convert a Megatron checkpoint to a Transformers checkpoint. " + "If False, convert a Transformers checkpoint to a Megatron checkpoint." + ), + ) + parser.add_argument( + "--load_path", + type=str, + required=True, + help="Path to the checkpoint to convert.", + ) + parser.add_argument( + "--save_path", + type=str, + required=True, + help="Path to the converted checkpoint.", + ) + parser.add_argument("--print-checkpoint-structure", action="store_true") + return parser + + +def add_megatron_checkpoint_args(parser): + parser.add_argument( + "--target_tensor_model_parallel_size", + type=int, + default=1, + help=( + "The tensor model parallel size of the converted checkpoint. " + "Only used when converting a Transformers checkpoint to a Megatron checkpoint." + ), + ) + parser.add_argument( + "--target_pipeline_model_parallel_size", + type=int, + default=1, + help=( + "The pipeline model parallel size of the converted checkpoint. " + "Only used when converting a Transformers checkpoint to a Megatron checkpoint." + ), + ) + parser.add_argument( + "--target_data_parallel_size", + type=int, + default=1, + help=( + "The data parallel size of the converted checkpoint. " + "Only used when converting a Transformers checkpoint to a Megatron checkpoint." + ), + ) + parser.add_argument( + "--target_params_dtype", + type=str, + default="fp32", + help=( + "The dtype of the converted checkpoint. " + "Only used when converting a Transformers checkpoint to a Megatron checkpoint." + ), + ) + parser.add_argument( + "--make_vocab_size_divisible_by", + type=int, + default=128, + help=( + "Pad the vocab size to be divisible by this value. " + "This is added for computational efficieny reasons. " + "Only used when converting a Transformers checkpoint to a Megatron checkpoint." + ), + ) + parser.add_argument( + "--use_distributed_optimizer", + action="store_true", + help=( + "If True, use the distributed optimizer. " + "Only used when converting a Transformers checkpoint to a Megatron checkpoint." + ), + ) + return parser + + +def add_transformers_checkpoint_args(parser): + parser.add_argument( + "--tokenizer_name", + type=str, + default=None, + help=( + "The name of the pre-trained tokenizer to save. " + "If not None, the tokenizer will be saved. " + "Only used when converting a Megatron checkpoint to a Transformers checkpoint." + ), + ) + parser.add_argument( + "--max_shard_size", + type=str, + default="10GB", + help=( + "The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size " + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). " + "Only used when converting a Megatron checkpoint to a Transformers checkpoint." + ), + ) + + return parser + + +# The simple map of names for "automated" rules. +megatron_to_transformers = { + "attention.dense": ".attn.c_proj.", + "self_attention.dense": ".attn.c_proj.", + "mlp.dense_h_to_4h": ".mlp.c_fc.", + "mlp.dense_4h_to_h": ".mlp.c_proj.", +} +transformers_to_megatron = {v[1:-1]: k for k, v in megatron_to_transformers.items()} + +tensor_parallel_params = [ + # megatron-lm layers to merge across tp ranks + "self_attention.query_key_value.weight", + "self_attention.query_key_value.bias", + "self_attention.dense.weight", + "mlp.dense_h_to_4h.weight", + "mlp.dense_h_to_4h.bias", + "mlp.dense_4h_to_h.weight", + # deprecated + "attention.query_key_value.weight", + "attention.query_key_value.bias", + "attention.dense.weight", + # transformers layers to split across tp ranks + "attn.c_attn.weight", + "attn.c_attn.bias", + "attn.c_proj.weight", + "mlp.c_fc.weight", + "mlp.c_fc.bias", + "mlp.c_proj.weight", +] + + +def recursive_print(name, val, spaces=0): + """ + Recursively print the structure of a checkpoint. This function is taken from `convert_megatron_gpt2_checkpoint.py` + + Args: + name (str): the name of the current tensor parameter + val (Tuple(int)): the shape of the current tensor parameter + spaces (int): the number of spaces to print before the output for a nested structure + """ + # Format the message. + if name is None: + msg = None + else: + fmt = "." * max(0, spaces - 2) + "# {:" + str(50 - spaces) + "s}" + msg = fmt.format(name) + + # Print and recurse (if needed). + if isinstance(val, dict): + if msg is not None: + print(msg) + for k in val.keys(): + recursive_print(k, val[k], spaces + 2) + elif isinstance(val, torch.Tensor): + print(msg, ":", val.size()) + else: + print(msg, ":", val) + + +def megatron_to_transformers_fix_query_key_value_ordering( + param, checkpoint_version, num_splits, num_heads, hidden_size +): + """ + Permutes layout of param tensor to [num_splits * num_heads * hidden_size, :] for compatibility with later versions + of NVIDIA Megatron-LM. The inverse operation is performed inside Megatron-LM to read checkpoints: + https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/checkpointing.py#L209 If param is the weight tensor of the + self-attention block, the returned tensor will have to be transposed one more time to be read by HuggingFace GPT2. + This function is taken from `convert_megatron_gpt2_checkpoint.py` + + Args: + param (torch.Tensor): the tensor to permute + checkpoint_version (int): the version of the checkpoint. + num_splits (int): the number of projections, usually 3 for (Query, Key, Value) + num_heads (int): the number of attention heads + hidden_size (int): the hidden size per head + """ + + input_shape = param.size() + if checkpoint_version == 1.0: + # version 1.0 stores [num_heads * hidden_size * num_splits, :] + saved_shape = (num_heads, hidden_size, num_splits) + input_shape[1:] + param = param.view(*saved_shape) + param = param.transpose(0, 2) + param = param.transpose(1, 2).contiguous() + elif checkpoint_version >= 2.0: + # other versions store [num_heads * num_splits * hidden_size, :] + saved_shape = (num_heads, num_splits, hidden_size) + input_shape[1:] + param = param.view(*saved_shape) + param = param.transpose(0, 1).contiguous() + param = param.view(*input_shape) + return param + + +def transformers_to_megatron_fix_query_key_value_ordering( + param, checkpoint_version, num_splits, num_heads, hidden_size +): + """ + Permutes layout of param tensor to the one compatible with respective NVIDIA Megatron-LM chekpoint versions. Input + is [num_splits * num_heads * hidden_size, :] and output is [num_heads * hidden_size * num_splits, :] for version + 1.0 and [num_heads * num_splits * hidden_size, :] for version 2.0 and later. If param is the weight tensor of the + self-attention block, the param needs to be already transposed before calling this function. + + Args: + param (torch.Tensor): the tensor to permute + checkpoint_version (int): the version of the checkpoint. + num_splits (int): the number of projections, usually 3 for (Query, Key, Value) + num_heads (int): the number of attention heads + hidden_size (int): the hidden size per head + """ + + # Input is [num_splits * num_heads * hidden_size, :] + input_shape = param.size() + if checkpoint_version == 1.0: + # version 1.0 stores [num_heads * hidden_size * num_splits, :] + current_shape = (num_splits, num_heads, hidden_size) + input_shape[1:] + param = param.view(*current_shape) + param = param.transpose(0, 2) + param = param.transpose(1, 2).contiguous() + elif checkpoint_version >= 2.0: + # other versions store [num_heads * num_splits * hidden_size, :] + current_shape = (num_splits, num_heads, hidden_size) + input_shape[1:] + param = param.view(*current_shape) + param = param.transpose(0, 1).contiguous() + param = param.view(*input_shape) + return param + + +def merge_transformers_sharded_states(path, num_checkpoints): + """ + Merge sharded checkpoints from transformers into a single checkpoint. + + Args: + path (str): the path to the sharded checkpoints + num_checkpoints (int): the number of checkpoints to merge + """ + state_dict = {} + for i in range(1, num_checkpoints + 1): + checkpoint_path = os.path.join(path, f"pytorch_model-{i:05d}-of-{num_checkpoints:05d}.bin") + current_chunk = torch.load(checkpoint_path, map_location="cpu") + state_dict.update(current_chunk) + return state_dict + + +def get_megatron_sharded_states(args, tp_size, pp_size, pp_rank): + """ + Get sharded checkpoints from NVIDIA Megatron-LM checkpoint based on the provided tensor parallel size, pipeline + parallel size and pipeline parallel rank. + + Args: + args (argparse.Namespace): the arguments to the script + tp_size (int): the tensor parallel size + pp_size (int): the pipeline parallel size + pp_rank (int): the pipeline parallel rank + """ + tp_state_dicts = [] + for i in range(tp_size): + sub_dir_name = f"mp_rank_{i:02d}" if pp_size == 1 else f"mp_rank_{i:02d}_{pp_rank:03d}" + checkpoint_name = os.listdir(os.path.join(args.load_path, sub_dir_name))[0] + checkpoint_path = os.path.join(args.load_path, sub_dir_name, checkpoint_name) + state_dict = torch.load(checkpoint_path, map_location="cpu") + tp_state_dicts.append(state_dict) + return tp_state_dicts + + +def get_element_from_dict_by_path(d, path): + """ + Get element from dictionary by path. If element is not present, recursively add empty dictionaries. + + Args: + d (dict): the dictionary to get the element from + path (list): the path to the element which is delimited by "." + """ + path = path.split(".") + for k in path: + if k not in d: + d[k] = {} + d = d[k] + return d + + +def convert_checkpoint_from_megatron_to_transformers(args): + """ + Convert NVIDIA Megatron-LM checkpoint to HuggingFace Transformers checkpoint. This handles Megatron checkpoints + with different tensor parallelism and pipeline parallelism sizes. It saves the converted checkpoint into shards + using HuggingFace Transformers checkpoint sharding functionality. This greatly extends the functionality of + `convert_megatron_gpt2_checkpoint.py` + + Args: + args (argparse.Namespace): the arguments to the script + """ + # Load Megatron-LM checkpoint arguments from the state dict + sub_dirs = os.listdir(args.load_path) + possible_sub_dirs = ["mp_rank_00", "mp_rank_00_000"] + for sub_dir in possible_sub_dirs: + if sub_dir in sub_dirs: + rank0_checkpoint_name = os.listdir(os.path.join(args.load_path, sub_dir))[0] + rank0_checkpoint_path = os.path.join(args.load_path, sub_dir, rank0_checkpoint_name) + break + print(f"Loading Megatron-LM checkpoint arguments from: {rank0_checkpoint_path}") + state_dict = torch.load(rank0_checkpoint_path, map_location="cpu") + megatron_args = state_dict.get("args", None) + if megatron_args is None: + raise ValueError( + "Megatron-LM checkpoint does not contain arguments. This utility only supports Megatron-LM checkpoints" + " containing all the megatron arguments. This is because it loads all config related to model" + " architecture, the tensor and pipeline model parallel size from the checkpoint insead of user having to" + " manually specify all the details. Please save Megatron-LM checkpoint along with all the megatron" + " arguments to use this utility." + ) + + # Create Transformers GPT2 config from Megatron-LM arguments + if megatron_args is not None: + if megatron_args.bias_gelu_fusion: + activation_function = "gelu_fast" + elif megatron_args.openai_gelu: + activation_function = "gelu_new" + else: + activation_function = "gelu" + else: + # in the very early days this used to be "gelu_new" + activation_function = "gelu_new" + vocab_size = ( + megatron_args.padded_vocab_size + if getattr(megatron_args, "orig_vocab_size", None) is None + else megatron_args.orig_vocab_size + ) + print(vocab_size) + + config = GPT2Config( + vocab_size=vocab_size, + n_positions=megatron_args.max_position_embeddings, + n_embd=megatron_args.hidden_size, + n_layer=megatron_args.num_layers, + n_head=megatron_args.num_attention_heads, + n_inner=megatron_args.ffn_hidden_size, + activation_function=activation_function, + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + summary_type="cls_index", + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + scale_attn_weights=True, + use_cache=True, + bos_token_id=vocab_size - 1, + eos_token_id=vocab_size - 1, + architectures=["GPT2LMHeadModel"], + ) + + output_state_dict = {} + + checkpoint_version = state_dict.get("checkpoint_version", 0.0) + tp_size = megatron_args.tensor_model_parallel_size + pp_size = megatron_args.pipeline_model_parallel_size + dtype = torch.float32 + # The regex to extract layer names. + layer_re = re.compile("layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)") + + # Convert. + print("Converting") + + # Embeddings + print("Converting embeddings") + tp_state_dicts = get_megatron_sharded_states(args, tp_size, pp_size, 0) + + # Convert and store the position embeddings. + position_embeddings = get_element_from_dict_by_path( + tp_state_dicts[0], "model.language_model.embedding.position_embeddings.weight" + ) + output_state_dict["transformer.wpe.weight"] = position_embeddings.to(dtype) + + # Convert and store the word embeddings. + word_embeddings = torch.cat( + [ + get_element_from_dict_by_path( + tp_state_dicts[tp_rank], "model.language_model.embedding.word_embeddings.weight" + ) + for tp_rank in range(tp_size) + ], + dim=0, + ) + word_embeddings = word_embeddings[:vocab_size].to(dtype) + output_state_dict["transformer.wte.weight"] = word_embeddings + + # Transformer Layers + print("Converting transformer layers") + # The number of heads. + heads = config.n_head + # The hidden_size per head. + hidden_size_per_head = config.n_embd // config.n_head + n_positions = config.n_positions + num_layers = config.num_hidden_layers // pp_size + + for pp_rank in range(pp_size): + if pp_size > 0: + print(f"Converting pipeline parallel rank {pp_rank}") + tp_state_dicts = get_megatron_sharded_states(args, tp_size, pp_size, pp_rank) + + # The transformer. + path = ( + "model.language_model.transformer" + if "transformer" in get_element_from_dict_by_path(tp_state_dicts[0], "model.language_model").keys() + else "model.language_model.encoder" + ) + # Extract the layers. + for key, val in get_element_from_dict_by_path(tp_state_dicts[0], path).items(): + # Match the name. + m = layer_re.match(key) + # Stop if that's not a layer + if m is None: + break + + # The index of the layer. + layer_idx = int(m.group(1)) + pp_rank * num_layers + # The name of the operation. + op_name = m.group(2) + # Is it a weight or a bias? + weight_or_bias = m.group(3) + + # The name of the layer. + layer_name = f"transformer.h.{layer_idx}" + + if op_name + "." + weight_or_bias not in tensor_parallel_params: + params = val.to(dtype) + else: + dim = 1 if op_name in ["self_attention.dense", "mlp.dense_4h_to_h", "attention.dense"] else 0 + params = torch.cat( + [val] + + [ + get_element_from_dict_by_path(tp_state_dicts[tp_rank], f"{path}")[key] + for tp_rank in range(1, tp_size) + ], + dim=dim, + ).to(dtype) + + # For layernorm(s), simply store the layer norm. + if op_name.endswith("layernorm"): + + ln_name = "ln_1" if op_name.startswith("input") else "ln_2" + output_state_dict[layer_name + "." + ln_name + "." + weight_or_bias] = params + + # Transpose the QKV matrix. + elif ( + op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value" + ) and weight_or_bias == "weight": + + # Insert a tensor of 1x1xDxD bias. + causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=dtype)).view( + 1, 1, n_positions, n_positions + ) + output_state_dict[layer_name + ".attn.bias"] = causal_mask + + # Insert a "dummy" tensor for masked_bias. + masked_bias = torch.tensor(-1e4, dtype=dtype) + output_state_dict[layer_name + ".attn.masked_bias"] = masked_bias + + out_val = megatron_to_transformers_fix_query_key_value_ordering( + params, + checkpoint_version, + 3, + heads, + hidden_size_per_head, + ) + # Megatron stores (3*D) x D but transformers-GPT2 expects D x 3*D. + out_val = out_val.transpose(0, 1).contiguous() + # Store. + output_state_dict[layer_name + ".attn.c_attn.weight"] = out_val + + # Transpose the bias. + elif ( + op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value" + ) and weight_or_bias == "bias": + + out_val = megatron_to_transformers_fix_query_key_value_ordering( + params, checkpoint_version, 3, heads, hidden_size_per_head + ) + # Store. No change of shape. + output_state_dict[layer_name + ".attn.c_attn.bias"] = out_val + + # Transpose the weights. + elif weight_or_bias == "weight": + + out_name = megatron_to_transformers[op_name] + output_state_dict[layer_name + out_name + "weight"] = params.transpose(0, 1) + + # Copy the bias. + elif weight_or_bias == "bias": + + out_name = megatron_to_transformers[op_name] + output_state_dict[layer_name + out_name + "bias"] = params + + if config.n_layer != (layer_idx + 1): + raise ValueError(f"Expected {config.n_layer} layers but found {layer_idx + 1}") + + # The final layernorm. + print("Converting final layernorm") + params = get_element_from_dict_by_path(tp_state_dicts[0], str(path)) + output_state_dict["transformer.ln_f.weight"] = params["final_layernorm.weight"].to(dtype) + output_state_dict["transformer.ln_f.bias"] = params["final_layernorm.bias"].to(dtype) + + # For LM head, transformers' wants the matrix to weight embeddings. + print("Converting LM head") + output_state_dict["lm_head.weight"] = word_embeddings.to(dtype) + + # It should be done! + print("Conversion from Megatron-LM to Transformers is done!") + + # Print the structure of converted state dict. + if args.print_checkpoint_structure: + recursive_print(None, output_state_dict) + + # Add tokenizer class info to config + # see https://github.com/huggingface/transformers/issues/13906) + + if args.tokenizer_name is None: + tokenizer_name = "gpt2" + else: + tokenizer_name = args.tokenizer_name + + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + tokenizer_class = type(tokenizer).__name__ + config.tokenizer_class = tokenizer_class + + # Store the config to file. + print("Saving config") + config.save_pretrained(args.save_path) + + # Save tokenizer based on args + if args.tokenizer_name is not None: + print(f"Adding {tokenizer_class} tokenizer files") + tokenizer.save_pretrained(args.save_path) + + # Store the state_dict to file. + max_shard_size = int(args.max_shard_size) if args.max_shard_size.isdigit() else args.max_shard_size + shards, index = shard_checkpoint(output_state_dict, max_shard_size=max_shard_size) + + # Save the model + for shard_file, shard in shards.items(): + torch.save(shard, os.path.join(args.save_path, shard_file)) + + if index is None: + print(f"Model weights saved in {os.path.join(args.save_path, WEIGHTS_NAME)}") + else: + save_index_file = os.path.join(args.save_path, WEIGHTS_INDEX_NAME) + # Save the index as well + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + print( + f"The model is bigger than the maximum size per checkpoint ({args.max_shard_size}) and is going to be " + f"split in {len(shards)} checkpoint shards. You can find where each parameters has been saved in the " + f"index located at {save_index_file}." + ) + + +def convert_checkpoint_from_transformers_to_megatron(args): + """ + Convert a checkpoint from HuggingFace Transformers to Megatron-LM. This allows converted checkpoints with variable + tensor parallelism and pipeline parallelism sizes. It takes as input a checkpoint from HuggingFace Transformers + which can have multiple shards. + + Args: + args (argparse.Namespace): the arguments to the script + + """ + os.makedirs(args.save_path, exist_ok=True) + # Search in directory above this + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) + if args.megatron_path is not None: + sys.path.insert(0, args.megatron_path) + + try: + from megatron.tokenizer.tokenizer import _vocab_size_with_padding + except ModuleNotFoundError: + print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting.") + exit(1) + + # load the transformers model state dict and config + sub_dirs = [x for x in os.listdir(args.load_path) if x.startswith("pytorch_model")] + if len(sub_dirs) == 1: + checkpoint_name = "pytorch_model.bin" + state_dict = torch.load(os.path.join(args.load_path, checkpoint_name), map_location="cpu") + else: + num_checkpoints = len(sub_dirs) - 1 + state_dict = merge_transformers_sharded_states(args.load_path, num_checkpoints) + + config = GPT2Config.from_pretrained(args.load_path) + + # Saving the tracker file + tracker_filepath = os.path.join(args.save_path, "latest_checkpointed_iteration.txt") + with open(tracker_filepath, "w") as f: + f.write("release") + + # create `release` dir in args.load_path + release_dir = os.path.join(args.save_path, "release") + os.makedirs(release_dir, exist_ok=True) + + # megatron args + megatron_args = { + "orig_vocab_size": config.vocab_size, + "max_position_embeddings": config.n_positions, + "hidden_size": config.n_embd, + "num_layers": config.n_layer, + "num_attention_heads": config.n_head, + "ffn_hidden_size": config.n_inner, + "tensor_model_parallel_size": args.target_tensor_model_parallel_size, + "pipeline_model_parallel_size": args.target_pipeline_model_parallel_size, + "data_parallel_size": args.target_data_parallel_size, + "make_vocab_size_divisible_by": args.make_vocab_size_divisible_by, + "rank": 0, + "tokenizer_type": "GPT2BPETokenizer", + } + + if config.activation_function == "gelu": + megatron_args["bias_gelu_fusion"] = False + megatron_args["openai_gelu"] = False + elif config.activation_function == "gelu_fast": + megatron_args["bias_gelu_fusion"] = True + megatron_args["openai_gelu"] = False + elif config.activation_function == "gelu_new": + megatron_args["bias_gelu_fusion"] = False + megatron_args["openai_gelu"] = True + + margs = types.SimpleNamespace() + for k, v in megatron_args.items(): + setattr(margs, k, v) + + # params dtype + if args.target_params_dtype == "fp16": + dtype = torch.float16 + elif args.target_params_dtype == "bf16": + dtype = torch.bfloat16 + else: + dtype = torch.float32 + setattr(margs, "params_dtype", dtype) + + # save dummy optim state dict + dummy_optim_state_dict = {} + dummy_optim_state_dict["optimizer"] = { + "step": 0, + "param_groups": [ + { + "lr": 0.0, + "beta1": 0.0, + "beta2": 0.0, + "eps": 0.0, + "weight_decay": 0.0, + "correct_bias": False, + "params": [], + } + ], + } + if args.use_distributed_optimizer: + for i in range(args.target_pipeline_model_parallel_size): + for j in range(args.target_tensor_model_parallel_size): + for k in range(args.target_data_parallel_size): + if args.target_pipeline_model_parallel_size == 1: + checkpoint_dir = f"mp_rank_{i:02d}_{k:03d}" + else: + checkpoint_dir = f"mp_rank_{i:02d}_{j:03d}_{k:03d}" + checkpoint_dir = os.path.join(release_dir, checkpoint_dir) + os.makedirs(checkpoint_dir, exist_ok=True) + torch.save( + dummy_optim_state_dict, + os.path.join(checkpoint_dir, "optim.pt"), + ) + + # Convert. + print("Converting") + output_state_dict = [] + for i in range(args.target_tensor_model_parallel_size): + output_state_dict.append({}) + + # Embedding layer + print("converting embedding layer") + pos_embedding = state_dict["transformer.wpe.weight"].to(dtype) + word_embedding = state_dict["transformer.wte.weight"].to(dtype) + orig_vocab_size = config.vocab_size + padded_vocab_size = _vocab_size_with_padding(orig_vocab_size, margs) + setattr(margs, "padded_vocab_size", padded_vocab_size) + # Cut out extra padding we don't need + if orig_vocab_size > padded_vocab_size: + full_word_embed = word_embedding[0:padded_vocab_size, :] + # Expanding embedding to larger size by replicating final entry + elif orig_vocab_size < padded_vocab_size: + padding_size = padded_vocab_size - orig_vocab_size + full_word_embed = torch.cat((word_embedding, word_embedding[-1].unsqueeze(0).expand(padding_size, -1))) + # Same size! + else: + full_word_embed = word_embedding + + # Split into new tensor model parallel sizes + out_word_embed = torch.chunk(full_word_embed, args.target_tensor_model_parallel_size, dim=0) + for i in range(args.target_tensor_model_parallel_size): + pos_emb_dict = get_element_from_dict_by_path( + output_state_dict[i], "model.language_model.embedding.position_embeddings" + ) + pos_emb_dict["weight"] = pos_embedding + + word_emb_dict = get_element_from_dict_by_path( + output_state_dict[i], "model.language_model.embedding.word_embeddings" + ) + word_emb_dict["weight"] = out_word_embed[i] + + # Transformer layers + print("converting transformer layers") + if config.num_hidden_layers % args.target_tensor_model_parallel_size != 0: + raise ValueError( + f"Number of layers ({config.num_hidden_layers}) must be divisible by number of tensor parallelism" + f" ({args.target_tensor_model_parallel_size})" + ) + num_layers = config.num_hidden_layers // args.target_pipeline_model_parallel_size + + layer_re = re.compile("transformer.h\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)") + # The number of heads. + heads = config.n_head + # The hidden_size per head. + hidden_size_per_head = config.n_embd // config.n_head + for pp_rank in range(args.target_pipeline_model_parallel_size): + layer_offset = pp_rank * num_layers + if pp_rank > 0: + output_state_dict = [] + for i in range(args.target_tensor_model_parallel_size): + output_state_dict.append({}) + + for layer in range(num_layers): + pp_layer_id = layer + layer_offset + layers_to_copy = [ + layer_name + for layer_name in state_dict.keys() + if layer_name.startswith(f"transformer.h.{pp_layer_id}.") + ] + + for layer_name in layers_to_copy: + m = layer_re.match(layer_name) + # Stop if that's not a layer + if m is None: + break + + # The index of the layer. + _ = int(m.group(1)) + # The name of the operation. + op_name = m.group(2) + # Is it a weight or a bias? + weight_or_bias = m.group(3) + + params = state_dict[layer_name].to(dtype) + # handle layernorm + if op_name.startswith("ln"): + out_name = "input_layernorm" if op_name.endswith("1") else "post_attention_layernorm" + layer_name = f"layers.{layer}.{out_name}.{weight_or_bias}" + + # handle attention K, V, Q weights + elif op_name.startswith("attn.c_attn") and weight_or_bias == "weight": + # transformers stores D X (3*D) but Megatron-LM expects (3*D) X D. + params = params.transpose(0, 1).contiguous() + + params = transformers_to_megatron_fix_query_key_value_ordering( + params, + 3.0, + 3, + heads, + hidden_size_per_head, + ) + layer_name = f"layers.{layer}.self_attention.query_key_value.{weight_or_bias}" + + # handle attention K, V, Q bias + elif op_name.startswith("attn.c_attn") and weight_or_bias == "bias": + params = transformers_to_megatron_fix_query_key_value_ordering( + params, + 3.0, + 3, + heads, + hidden_size_per_head, + ) + layer_name = f"layers.{layer}.self_attention.query_key_value.{weight_or_bias}" + + # handle attention and mlp weights + elif weight_or_bias == "weight": + out_name = transformers_to_megatron.get(op_name, None) + if out_name is None: + continue + params = params.transpose(0, 1) + layer_name = f"layers.{layer}.{out_name}.{weight_or_bias}" + + # handle attention and mlp bias + elif weight_or_bias == "bias": + out_name = transformers_to_megatron.get(op_name, None) + if out_name is None: + continue + layer_name = f"layers.{layer}.{out_name}.{weight_or_bias}" + + # skip + else: + continue + + if op_name + "." + weight_or_bias in tensor_parallel_params: + dim = 1 if op_name in ["attn.c_proj", "mlp.c_proj"] else 0 + params = torch.chunk(params, args.target_tensor_model_parallel_size, dim=dim) + + for i in range(args.target_tensor_model_parallel_size): + params_dict = get_element_from_dict_by_path(output_state_dict[i], "model.language_model.encoder") + params_dict[layer_name] = ( + params[i] if (op_name + "." + weight_or_bias in tensor_parallel_params) else params + ) + + if pp_rank == args.target_pipeline_model_parallel_size - 1: + # handle final layernorm + for weight_or_bias in ["weight", "bias"]: + params = state_dict[f"transformer.ln_f.{weight_or_bias}"].to(dtype) + layer_name = f"final_layernorm.{weight_or_bias}" + for i in range(args.target_tensor_model_parallel_size): + params_dict = get_element_from_dict_by_path(output_state_dict[i], "model.language_model.encoder") + params_dict[layer_name] = params + + # add the LM head + for i in range(args.target_tensor_model_parallel_size): + params_dict = get_element_from_dict_by_path(output_state_dict[i], "model.word_embeddings_for_head") + params_dict["weight"] = out_word_embed[i] + + # saving the state dict as per the tp_rank and pp_rank + for tp_rank in range(args.target_tensor_model_parallel_size): + output_state_dict[tp_rank]["checkpoint_version"] = 3.0 + output_state_dict[tp_rank]["args"] = margs + checkpoint_dir = ( + f"mp_rank_{tp_rank:02d}" + if args.target_pipeline_model_parallel_size == 1 + else f"mp_rank_{tp_rank:02d}_{pp_rank:03d}" + ) + if args.use_distributed_optimizer: + checkpoint_name = "model_rng.pt" + else: + checkpoint_name = "model_optim_rng.pt" + output_state_dict[tp_rank]["optimizer"] = dummy_optim_state_dict["optimizer"] + checkpoint_dir = os.path.join(release_dir, checkpoint_dir) + os.makedirs(checkpoint_dir, exist_ok=True) + checkpoint_path = os.path.join(checkpoint_dir, checkpoint_name) + if args.print_checkpoint_structure: + print( + f"Checkpoint structure of model state dict shard belonging to TP rank {tp_rank} and PP rank" + f" {pp_rank}:" + ) + recursive_print(None, output_state_dict[tp_rank]) + torch.save(output_state_dict[tp_rank], checkpoint_path) + + +def main(): + parser = argparse.ArgumentParser() + parser = add_checkpointing_args(parser) + parser = add_megatron_checkpoint_args(parser) + parser = add_transformers_checkpoint_args(parser) + args = parser.parse_args() + if args.convert_checkpoint_from_megatron_to_transformers: + convert_checkpoint_from_megatron_to_transformers(args) + else: + convert_checkpoint_from_transformers_to_megatron(args) + + +if __name__ == "__main__": + main() diff --git a/src/transformers/models/megatron_gpt_bigcode/convert_megatron_gpt2_checkpoint.py b/src/transformers/models/megatron_gpt_bigcode/convert_megatron_gpt2_checkpoint.py new file mode 100644 index 0000000000..778b1384a2 --- /dev/null +++ b/src/transformers/models/megatron_gpt_bigcode/convert_megatron_gpt2_checkpoint.py @@ -0,0 +1,364 @@ +#################################################################################################### + +# Copyright (c) 2021-, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#################################################################################################### + +# +# Note: If when running this conversion script you're getting an exception: +# ModuleNotFoundError: No module named 'megatron.model.enums' +# you need to tell python where to find the clone of Megatron-LM, e.g.: +# +# cd /tmp +# git clone https://github.com/NVIDIA/Megatron-LM +# PYTHONPATH=/tmp/Megatron-LM python src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py ... +# +# if you already have it cloned elsewhere, simply adjust the path to the existing path +# +# If the training was done using a Megatron-LM fork, e.g., +# https://github.com/microsoft/Megatron-DeepSpeed/ then chances are that you need to have that one +# in your path, i.e., /path/to/Megatron-DeepSpeed/ +# + +import argparse +import os +import re +import zipfile + +import torch + +from transformers import AutoTokenizer, GPT2Config + + +#################################################################################################### + + +def recursive_print(name, val, spaces=0): + # Format the message. + if name is None: + msg = None + else: + fmt = "." * max(0, spaces - 2) + "# {:" + str(50 - spaces) + "s}" + msg = fmt.format(name) + + # Print and recurse (if needed). + if isinstance(val, dict): + if msg is not None: + print(msg) + for k in val.keys(): + recursive_print(k, val[k], spaces + 2) + elif isinstance(val, torch.Tensor): + print(msg, ":", val.size()) + else: + print(msg, ":", val) + + +def fix_query_key_value_ordering(param, checkpoint_version, num_splits, num_heads, hidden_size): + # Permutes layout of param tensor to [num_splits * num_heads * hidden_size, :] + # for compatibility with later versions of NVIDIA Megatron-LM. + # The inverse operation is performed inside Megatron-LM to read checkpoints: + # https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/checkpointing.py#L209 + # If param is the weight tensor of the self-attention block, the returned tensor + # will have to be transposed one more time to be read by HuggingFace GPT2. + input_shape = param.size() + if checkpoint_version == 1.0: + # version 1.0 stores [num_heads * hidden_size * num_splits, :] + saved_shape = (num_heads, hidden_size, num_splits) + input_shape[1:] + param = param.view(*saved_shape) + param = param.transpose(0, 2) + param = param.transpose(1, 2).contiguous() + elif checkpoint_version >= 2.0: + # other versions store [num_heads * num_splits * hidden_size, :] + saved_shape = (num_heads, num_splits, hidden_size) + input_shape[1:] + param = param.view(*saved_shape) + param = param.transpose(0, 1).contiguous() + param = param.view(*input_shape) + return param + + +#################################################################################################### + + +def convert_megatron_checkpoint(args, input_state_dict, config): + # The converted output model. + output_state_dict = {} + + # old versions did not store training args + ds_args = input_state_dict.get("args", None) + if ds_args is not None: + # do not make the user write a config file when the exact dimensions/sizes are already in the checkpoint + # from pprint import pprint + # pprint(vars(ds_args)) + + config.vocab_size = ds_args.padded_vocab_size + config.n_positions = ds_args.max_position_embeddings + config.n_embd = ds_args.hidden_size + config.n_layer = ds_args.num_layers + config.n_head = ds_args.num_attention_heads + config.n_inner = ds_args.ffn_hidden_size + # pprint(config) + + # The number of heads. + heads = config.n_head + # The hidden_size per head. + hidden_size_per_head = config.n_embd // config.n_head + # Megatron-LM checkpoint version + if "checkpoint_version" in input_state_dict.keys(): + checkpoint_version = input_state_dict["checkpoint_version"] + else: + checkpoint_version = 0.0 + + # The model. + model = input_state_dict["model"] + # The language model. + lm = model["language_model"] + # The embeddings. + embeddings = lm["embedding"] + + # The word embeddings. + word_embeddings = embeddings["word_embeddings"]["weight"] + # Truncate the embedding table to vocab_size rows. + word_embeddings = word_embeddings[: config.vocab_size, :] + output_state_dict["transformer.wte.weight"] = word_embeddings + + # The position embeddings. + pos_embeddings = embeddings["position_embeddings"]["weight"] + # Read the causal mask dimension (seqlen). [max_sequence_length, hidden_size] + n_positions = pos_embeddings.size(0) + if n_positions != config.n_positions: + raise ValueError( + f"pos_embeddings.max_sequence_length={n_positions} and config.n_positions={config.n_positions} don't match" + ) + # Store the position embeddings. + output_state_dict["transformer.wpe.weight"] = pos_embeddings + + # The transformer. + transformer = lm["transformer"] if "transformer" in lm.keys() else lm["encoder"] + + # The regex to extract layer names. + layer_re = re.compile("layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)") + + # The simple map of names for "automated" rules. + megatron_to_transformers = { + "attention.dense": ".attn.c_proj.", + "self_attention.dense": ".attn.c_proj.", + "mlp.dense_h_to_4h": ".mlp.c_fc.", + "mlp.dense_4h_to_h": ".mlp.c_proj.", + } + + # Extract the layers. + for key, val in transformer.items(): + # Match the name. + m = layer_re.match(key) + + # Stop if that's not a layer + if m is None: + break + + # The index of the layer. + layer_idx = int(m.group(1)) + # The name of the operation. + op_name = m.group(2) + # Is it a weight or a bias? + weight_or_bias = m.group(3) + + # The name of the layer. + layer_name = f"transformer.h.{layer_idx}" + + # For layernorm(s), simply store the layer norm. + if op_name.endswith("layernorm"): + + ln_name = "ln_1" if op_name.startswith("input") else "ln_2" + output_state_dict[layer_name + "." + ln_name + "." + weight_or_bias] = val + + # Transpose the QKV matrix. + elif ( + op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value" + ) and weight_or_bias == "weight": + + # Insert a tensor of 1x1xDxD bias. + causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=torch.float16)).view( + 1, 1, n_positions, n_positions + ) + output_state_dict[layer_name + ".attn.bias"] = causal_mask + + # Insert a "dummy" tensor for masked_bias. + masked_bias = torch.tensor(-1e4, dtype=torch.float16) + output_state_dict[layer_name + ".attn.masked_bias"] = masked_bias + + out_val = fix_query_key_value_ordering(val, checkpoint_version, 3, heads, hidden_size_per_head) + # Megatron stores (3*D) x D but transformers-GPT2 expects D x 3*D. + out_val = out_val.transpose(0, 1).contiguous() + # Store. + output_state_dict[layer_name + ".attn.c_attn.weight"] = out_val + + # Transpose the bias. + elif ( + op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value" + ) and weight_or_bias == "bias": + + out_val = fix_query_key_value_ordering(val, checkpoint_version, 3, heads, hidden_size_per_head) + # Store. No change of shape. + output_state_dict[layer_name + ".attn.c_attn.bias"] = out_val + + # Transpose the weights. + elif weight_or_bias == "weight": + + out_name = megatron_to_transformers[op_name] + output_state_dict[layer_name + out_name + "weight"] = val.transpose(0, 1) + + # Copy the bias. + elif weight_or_bias == "bias": + + out_name = megatron_to_transformers[op_name] + output_state_dict[layer_name + out_name + "bias"] = val + + # DEBUG. + assert config.n_layer == layer_idx + 1 + + # The final layernorm. + output_state_dict["transformer.ln_f.weight"] = transformer["final_layernorm.weight"] + output_state_dict["transformer.ln_f.bias"] = transformer["final_layernorm.bias"] + + # For LM head, transformers' wants the matrix to weight embeddings. + output_state_dict["lm_head.weight"] = word_embeddings + + # It should be done! + return output_state_dict + + +#################################################################################################### + + +def main(): + # Create the argument parser. + parser = argparse.ArgumentParser() + parser.add_argument("--print-checkpoint-structure", action="store_true") + parser.add_argument( + "path_to_checkpoint", + type=str, + help="Path to the checkpoint file (.zip archive or direct .pt file)", + ) + parser.add_argument( + "--config_file", + default="", + type=str, + help="An optional config json file describing the pre-trained model.", + ) + args = parser.parse_args() + + # Extract the basename. + basename = os.path.dirname(args.path_to_checkpoint) + + # Load the model. + # the .zip is very optional, let's keep it for backward compatibility + print(f"Extracting PyTorch state dictionary from {args.path_to_checkpoint}") + if args.path_to_checkpoint.endswith(".zip"): + with zipfile.ZipFile(args.path_to_checkpoint, "r") as checkpoint: + with checkpoint.open("release/mp_rank_00/model_optim_rng.pt") as pytorch_dict: + input_state_dict = torch.load(pytorch_dict, map_location="cpu") + else: + input_state_dict = torch.load(args.path_to_checkpoint, map_location="cpu") + + ds_args = input_state_dict.get("args", None) + + # Read the config, or default to the model released by NVIDIA. + if args.config_file == "": + + if ds_args is not None: + if ds_args.bias_gelu_fusion: + activation_function = "gelu_fast" + elif ds_args.openai_gelu: + activation_function = "gelu_new" + else: + activation_function = "gelu" + else: + # in the very early days this used to be "gelu_new" + activation_function = "gelu_new" + + # Spell out all parameters in case the defaults change. + config = GPT2Config( + vocab_size=50257, + n_positions=1024, + n_embd=1024, + n_layer=24, + n_head=16, + n_inner=4096, + activation_function=activation_function, + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + summary_type="cls_index", + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + scale_attn_weights=True, + use_cache=True, + bos_token_id=50256, + eos_token_id=50256, + ) + else: + config = GPT2Config.from_json_file(args.config_file) + + config.architectures = ["GPT2LMHeadModel"] + + # Convert. + print("Converting") + output_state_dict = convert_megatron_checkpoint(args, input_state_dict, config) + + # Print the structure of converted state dict. + if args.print_checkpoint_structure: + recursive_print(None, output_state_dict) + + # Add tokenizer class info to config + # see https://github.com/huggingface/transformers/issues/13906) + if ds_args is not None: + tokenizer_type = ds_args.tokenizer_type + if tokenizer_type == "GPT2BPETokenizer": + tokenizer_model_name = "gpt2" + elif tokenizer_type == "PretrainedFromHF": + tokenizer_model_name = ds_args.tokenizer_name_or_path + else: + raise ValueError(f"Unrecognized tokenizer_type {tokenizer_type}") + else: + tokenizer_model_name = "gpt2" + + tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_name) + tokenizer_class = type(tokenizer).__name__ + config.tokenizer_class = tokenizer_class + + # Store the config to file. + print("Saving config") + config.save_pretrained(basename) + + # Save tokenizer based on args + print(f"Adding {tokenizer_class} tokenizer files") + tokenizer.save_pretrained(basename) + + # Store the state_dict to file. + output_checkpoint_file = os.path.join(basename, "pytorch_model.bin") + print(f'Saving checkpoint to "{output_checkpoint_file}"') + torch.save(output_state_dict, output_checkpoint_file) + + +#################################################################################################### + +if __name__ == "__main__": + main() + +#################################################################################################### From 33bb045f86736eac86c8c36824616c98ab55ee13 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 7 Feb 2023 14:55:13 -0500 Subject: [PATCH 07/10] Add todos --- .../checkpoint_reshaping_and_interoperability.py | 2 ++ .../megatron_gpt_bigcode/convert_megatron_gpt2_checkpoint.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/transformers/models/megatron_gpt_bigcode/checkpoint_reshaping_and_interoperability.py b/src/transformers/models/megatron_gpt_bigcode/checkpoint_reshaping_and_interoperability.py index f8fb1e7c8f..9292cb505c 100644 --- a/src/transformers/models/megatron_gpt_bigcode/checkpoint_reshaping_and_interoperability.py +++ b/src/transformers/models/megatron_gpt_bigcode/checkpoint_reshaping_and_interoperability.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# TODO: Update this file + import argparse import json import os diff --git a/src/transformers/models/megatron_gpt_bigcode/convert_megatron_gpt2_checkpoint.py b/src/transformers/models/megatron_gpt_bigcode/convert_megatron_gpt2_checkpoint.py index 778b1384a2..0484f8994f 100644 --- a/src/transformers/models/megatron_gpt_bigcode/convert_megatron_gpt2_checkpoint.py +++ b/src/transformers/models/megatron_gpt_bigcode/convert_megatron_gpt2_checkpoint.py @@ -32,6 +32,8 @@ # in your path, i.e., /path/to/Megatron-DeepSpeed/ # +# TODO: Update this file + import argparse import os import re From e92b63f3e73b2741ccca1db04cee1fb19a6d759e Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 7 Feb 2023 15:13:47 -0500 Subject: [PATCH 08/10] fix --- .../gpt_bigcode/modeling_gpt_bigcode.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index 8625e77759..942cea744b 100644 --- a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -520,7 +520,7 @@ class GPTBigCodeDoubleHeadsModelOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None -GPTBigCode_START_DOCSTRING = r""" +GPT_BIGCODE_START_DOCSTRING = r""" This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads @@ -536,7 +536,7 @@ class GPTBigCodeDoubleHeadsModelOutput(ModelOutput): configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ -GPTBigCode_INPUTS_DOCSTRING = r""" +GPT_BIGCODE_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): `input_ids_length` = `sequence_length` if `past_key_values` is `None` else @@ -657,7 +657,7 @@ class GPTBigCodeDoubleHeadsModelOutput(ModelOutput): @add_start_docstrings( "The bare GPTBigCode Model transformer outputting raw hidden-states without any specific head on top.", - GPTBigCode_START_DOCSTRING, + GPT_BIGCODE_START_DOCSTRING, ) class GPTBigCodeModel(GPTBigCodePreTrainedModel): _keys_to_ignore_on_load_missing = ["attn.masked_bias"] @@ -728,7 +728,7 @@ def _prune_heads(self, heads_to_prune): for layer, heads in heads_to_prune.items(): self.h[layer].attn.prune_heads(heads) - @add_start_docstrings_to_model_forward(GPTBigCode_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(GPT_BIGCODE_INPUTS_DOCSTRING) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, output_type=BaseModelOutputWithPastAndCrossAttentions, @@ -934,7 +934,7 @@ def custom_forward(*inputs): The GPTBigCode Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, - GPTBigCode_START_DOCSTRING, + GPT_BIGCODE_START_DOCSTRING, ) class GPTBigCodeLMHeadModel(GPTBigCodePreTrainedModel): _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"] @@ -1005,7 +1005,7 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg "token_type_ids": token_type_ids, } - @add_start_docstrings_to_model_forward(GPTBigCode_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(GPT_BIGCODE_INPUTS_DOCSTRING) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, output_type=CausalLMOutputWithCrossAttentions, @@ -1102,7 +1102,7 @@ def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> input embeddings, the classification head takes as input the input of a specified classification token index in the input sequence). """, - GPTBigCode_START_DOCSTRING, + GPT_BIGCODE_START_DOCSTRING, ) class GPTBigCodeDoubleHeadsModel(GPTBigCodePreTrainedModel): _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"] @@ -1178,7 +1178,7 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg "token_type_ids": token_type_ids, } - @add_start_docstrings_to_model_forward(GPTBigCode_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(GPT_BIGCODE_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=GPTBigCodeDoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, @@ -1316,7 +1316,7 @@ def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in each row of the batch). """, - GPTBigCode_START_DOCSTRING, + GPT_BIGCODE_START_DOCSTRING, ) class GPTBigCodeForSequenceClassification(GPTBigCodePreTrainedModel): _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"] @@ -1334,7 +1334,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() - @add_start_docstrings_to_model_forward(GPTBigCode_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(GPT_BIGCODE_INPUTS_DOCSTRING) @add_code_sample_docstrings( checkpoint="microsoft/DialogRPT-updown", output_type=SequenceClassifierOutputWithPast, @@ -1441,7 +1441,7 @@ def forward( GPTBigCode Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - GPTBigCode_START_DOCSTRING, + GPT_BIGCODE_START_DOCSTRING, ) class GPTBigCodeForTokenClassification(GPTBigCodePreTrainedModel): def __init__(self, config): @@ -1465,7 +1465,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() - @add_start_docstrings_to_model_forward(GPTBigCode_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(GPT_BIGCODE_INPUTS_DOCSTRING) # fmt: off @add_code_sample_docstrings( checkpoint="brad1141/gpt2-finetuned-comp2", From 2f32703ef4047617e88eada0c338c0b996f6d304 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 7 Feb 2023 15:38:20 -0500 Subject: [PATCH 09/10] format --- .../models/gpt_bigcode/__init__.py | 18 +++++++++++------- .../gpt_bigcode/test_modeling_gpt_bigcode.py | 14 +++++++++++--- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/gpt_bigcode/__init__.py b/src/transformers/models/gpt_bigcode/__init__.py index 5b585a8908..2af1863d70 100644 --- a/src/transformers/models/gpt_bigcode/__init__.py +++ b/src/transformers/models/gpt_bigcode/__init__.py @@ -18,15 +18,15 @@ from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_torch_available, -) +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available _import_structure = { - "configuration_gpt_bigcode": ["GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTBigCodeConfig", "GPTBigCodeOnnxConfig"], + "configuration_gpt_bigcode": [ + "GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP", + "GPTBigCodeConfig", + "GPTBigCodeOnnxConfig", + ], } try: @@ -47,7 +47,11 @@ ] if TYPE_CHECKING: - from .configuration_gpt_bigcode import GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTBigCodeConfig, GPTBigCodeOnnxConfig + from .configuration_gpt_bigcode import ( + GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP, + GPTBigCodeConfig, + GPTBigCodeOnnxConfig, + ) try: if not is_torch_available(): diff --git a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py index 936d4c9f76..f5f31ce05f 100644 --- a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py +++ b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py @@ -31,12 +31,12 @@ from transformers import ( GPT_BIGCODE_PRETRAINED_MODEL_ARCHIVE_LIST, + GPT2Tokenizer, GPTBigCodeDoubleHeadsModel, GPTBigCodeForSequenceClassification, GPTBigCodeForTokenClassification, GPTBigCodeLMHeadModel, GPTBigCodeModel, - GPT2Tokenizer, ) @@ -434,12 +434,20 @@ class GPTBigCodeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test # TODO: Update the tests to use valid pretrained models. all_model_classes = ( - (GPTBigCodeModel, GPTBigCodeLMHeadModel, GPTBigCodeDoubleHeadsModel, GPTBigCodeForSequenceClassification, GPTBigCodeForTokenClassification) + ( + GPTBigCodeModel, + GPTBigCodeLMHeadModel, + GPTBigCodeDoubleHeadsModel, + GPTBigCodeForSequenceClassification, + GPTBigCodeForTokenClassification, + ) if is_torch_available() else () ) all_generative_model_classes = (GPTBigCodeLMHeadModel, GPTBigCodeDoubleHeadsModel) if is_torch_available() else () - all_parallelizable_model_classes = (GPTBigCodeLMHeadModel, GPTBigCodeDoubleHeadsModel) if is_torch_available() else () + all_parallelizable_model_classes = ( + (GPTBigCodeLMHeadModel, GPTBigCodeDoubleHeadsModel) if is_torch_available() else () + ) fx_compatible = True test_missing_keys = False test_model_parallel = True From 035c73c89c72744baf0587d0f916811040331349 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 7 Feb 2023 15:55:47 -0500 Subject: [PATCH 10/10] fix --- ...2_checkpoint.py => convert_megatron_gpt_bigcode_checkpoint.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/transformers/models/megatron_gpt_bigcode/{convert_megatron_gpt2_checkpoint.py => convert_megatron_gpt_bigcode_checkpoint.py} (100%) diff --git a/src/transformers/models/megatron_gpt_bigcode/convert_megatron_gpt2_checkpoint.py b/src/transformers/models/megatron_gpt_bigcode/convert_megatron_gpt_bigcode_checkpoint.py similarity index 100% rename from src/transformers/models/megatron_gpt_bigcode/convert_megatron_gpt2_checkpoint.py rename to src/transformers/models/megatron_gpt_bigcode/convert_megatron_gpt_bigcode_checkpoint.py