diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py
index 477ccff95052..ec3ae674300d 100755
--- a/examples/pytorch/language-modeling/run_mlm.py
+++ b/examples/pytorch/language-modeling/run_mlm.py
@@ -329,6 +329,7 @@ def main():
         "cache_dir": model_args.cache_dir,
         "revision": model_args.model_revision,
         "use_auth_token": True if model_args.use_auth_token else None,
+        "ort": True if training_args.ort else None,
     }
     if model_args.config_name:
         config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py
index 242e83427389..f18a935d3917 100755
--- a/examples/pytorch/question-answering/run_qa.py
+++ b/examples/pytorch/question-answering/run_qa.py
@@ -299,6 +299,7 @@ def main():
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
+        ort=training_args.ort,
     )
     tokenizer = AutoTokenizer.from_pretrained(
         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py
index c35b636d7dd9..322e6c45f2a8 100755
--- a/examples/pytorch/summarization/run_summarization.py
+++ b/examples/pytorch/summarization/run_summarization.py
@@ -376,6 +376,7 @@ def main():
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
+        ort=True if training_args.ort else None,
     )
     tokenizer = AutoTokenizer.from_pretrained(
         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py
index b15a0378ca7d..226a48fdd1e1 100755
--- a/examples/pytorch/text-classification/run_glue.py
+++ b/examples/pytorch/text-classification/run_glue.py
@@ -337,6 +337,7 @@ def main():
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
+        ort=True if training_args.ort else None,
     )
     tokenizer = AutoTokenizer.from_pretrained(
         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py
index 6f2630104f7e..af5697d0cb8a 100755
--- a/examples/pytorch/translation/run_translation.py
+++ b/examples/pytorch/translation/run_translation.py
@@ -341,6 +341,7 @@ def main():
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
+        ort=True if training_args.ort else None,
     )
     tokenizer = AutoTokenizer.from_pretrained(
         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index f66b5734bd98..c9a05d38ad0a 100755
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -236,6 +236,10 @@ class PretrainedConfig(PushToHubMixin):
         use_bfloat16 (`bool`, *optional*, defaults to `False`):
             Whether or not the model should use BFloat16 scalars (only used by some TensorFlow models).
+
+        ONNX Runtime specific parameters
+
+        ort (`bool`, *optional*, defaults to `False`): Whether or not the model should use ONNX Runtime (ORT).
     """

     model_type: str = ""
     is_composition: bool = False
@@ -260,6 +264,7 @@ def __init__(self, **kwargs):
         self.torchscript = kwargs.pop("torchscript", False)  # Only used by PyTorch models
         self.torch_dtype = kwargs.pop("torch_dtype", None)  # Only used by PyTorch models
         self.use_bfloat16 = kwargs.pop("use_bfloat16", False)
+        self.ort = kwargs.pop("ort", False)
         self.pruned_heads = kwargs.pop("pruned_heads", {})
         self.tie_word_embeddings = kwargs.pop(
             "tie_word_embeddings", True
diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py
index c779267b7b38..996305081ed2 100644
--- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py
@@ -35,6 +35,7 @@
 from ...pytorch_utils import softmax_backward_data
 from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_deberta_v2 import DebertaV2Config
+from .jit_tracing import traceable


 logger = logging.get_logger(__name__)
@@ -56,7 +57,10 @@ class ContextPooler(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.dense = nn.Linear(config.pooler_hidden_size, config.pooler_hidden_size)
-        self.dropout = StableDropout(config.pooler_dropout)
+        if config.ort:
+            self.dropout = TorchNNDropout(config.pooler_dropout)
+        else:
+            self.dropout = StableDropout(config.pooler_dropout)
         self.config = config

     def forward(self, hidden_states):
@@ -74,6 +78,7 @@ def output_dim(self):
         return self.config.hidden_size


+@traceable
 # Copied from transformers.models.deberta.modeling_deberta.XSoftmax with deberta->deberta_v2
 class XSoftmax(torch.autograd.Function):
     """
@@ -164,7 +169,7 @@ def get_mask(input, local_context):

     return mask, dropout

-
+@traceable
 # Copied from transformers.models.deberta.modeling_deberta.XDropout
 class XDropout(torch.autograd.Function):
     """Optimized dropout function to save computation and memory by using mask operation instead of multiplication."""
@@ -187,6 +192,9 @@ def backward(ctx, grad_output):
         else:
             return grad_output, None

+class TorchNNDropout(torch.nn.Dropout):
+    def __init__(self, drop_prob):
+        super().__init__(drop_prob)

 # Copied from transformers.models.deberta.modeling_deberta.StableDropout
 class StableDropout(nn.Module):
@@ -244,7 +252,10 @@ def __init__(self, config):
         super().__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
         self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
-        self.dropout = StableDropout(config.hidden_dropout_prob)
+        if config.ort:
+            self.dropout = TorchNNDropout(config.hidden_dropout_prob)
+        else:
+            self.dropout = StableDropout(config.hidden_dropout_prob)

     def forward(self, hidden_states, input_tensor):
         hidden_states = self.dense(hidden_states)
@@ -312,7 +323,10 @@ def __init__(self, config):
         super().__init__()
         self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
         self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
-        self.dropout = StableDropout(config.hidden_dropout_prob)
+        if config.ort:
+            self.dropout = TorchNNDropout(config.hidden_dropout_prob)
+        else:
+            self.dropout = StableDropout(config.hidden_dropout_prob)
         self.config = config

     def forward(self, hidden_states, input_tensor):
@@ -367,7 +381,10 @@ def __init__(self, config):
             config.hidden_size, config.hidden_size, kernel_size, padding=(kernel_size - 1) // 2, groups=groups
         )
         self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
-        self.dropout = StableDropout(config.hidden_dropout_prob)
+        if config.ort:
+            self.dropout = TorchNNDropout(config.hidden_dropout_prob)
+        else:
+            self.dropout = StableDropout(config.hidden_dropout_prob)
         self.config = config

     def forward(self, hidden_states, residual_states, input_mask):
@@ -624,8 +641,10 @@ def __init__(self, config):
             self.pos_ebd_size = self.max_relative_positions
             if self.position_buckets > 0:
                 self.pos_ebd_size = self.position_buckets
-
-            self.pos_dropout = StableDropout(config.hidden_dropout_prob)
+            if config.ort:
+                self.pos_dropout = TorchNNDropout(config.hidden_dropout_prob)
+            else:
+                self.pos_dropout = StableDropout(config.hidden_dropout_prob)

         if not self.share_att_key:
             if "c2p" in self.pos_att_type:
@@ -633,7 +652,10 @@ def __init__(self, config):
             if "p2c" in self.pos_att_type:
                 self.pos_query_proj = nn.Linear(config.hidden_size, self.all_head_size)

-        self.dropout = StableDropout(config.attention_probs_dropout_prob)
+        if config.ort:
+            self.dropout = TorchNNDropout(config.attention_probs_dropout_prob)
+        else:
+            self.dropout = StableDropout(config.attention_probs_dropout_prob)

     def transpose_for_scores(self, x, attention_heads):
         new_x_shape = x.size()[:-1] + (attention_heads, -1)
@@ -824,7 +846,10 @@ def __init__(self, config):
         if self.embedding_size != config.hidden_size:
             self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False)
         self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
-        self.dropout = StableDropout(config.hidden_dropout_prob)
+        if config.ort:
+            self.dropout = TorchNNDropout(config.hidden_dropout_prob)
+        else:
+            self.dropout = StableDropout(config.hidden_dropout_prob)
         self.config = config

         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
@@ -1237,7 +1262,10 @@ def __init__(self, config):
         self.classifier = nn.Linear(output_dim, num_labels)
         drop_out = getattr(config, "cls_dropout", None)
         drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
-        self.dropout = StableDropout(drop_out)
+        if config.ort:
+            self.dropout = TorchNNDropout(drop_out)
+        else:
+            self.dropout = StableDropout(drop_out)

         # Initialize weights and apply final processing
         self.post_init()
diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py
index 3b5f6a9a6ba3..f792906a116c 100644
--- a/src/transformers/models/roberta/modeling_roberta.py
+++ b/src/transformers/models/roberta/modeling_roberta.py
@@ -1467,6 +1467,7 @@ class RobertaForQuestionAnswering(RobertaPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
+        self.ort = config.ort

         self.roberta = RobertaModel(config, add_pooling_layer=False)
         self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
@@ -1536,7 +1537,7 @@ def forward(
             if len(end_positions.size()) > 1:
                 end_positions = end_positions.squeeze(-1)
             # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1)
+            ignored_index = start_logits.size(1) if not self.ort else 344
             start_positions = start_positions.clamp(0, ignored_index)
             end_positions = end_positions.clamp(0, ignored_index)

diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py
index 630e9dd17aa5..6091f3e6cc4b 100644
--- a/src/transformers/models/t5/modeling_t5.py
+++ b/src/transformers/models/t5/modeling_t5.py
@@ -276,12 +276,37 @@ def forward(self, hidden_states):
         pass


+class T5ClampedDropout(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.ort = config.ort
+        self.dropout = nn.Dropout(config.dropout_rate)
+        self.dropout_rate = config.dropout_rate
+
+    def forward(self, hidden_states):
+        # clamp inf values to enable fp16 training
+        if self.ort:
+            # Remove data-based control flow for static graph
+            if hidden_states.dtype == torch.float16:
+                clamp_value = torch.where(torch.isinf(hidden_states).any(), torch.finfo(hidden_states.dtype).max - 1000,
+                                          torch.finfo(hidden_states.dtype).max)
+                clamp_value = (1.0 - self.dropout_rate) * clamp_value
+                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+        else:
+            if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
+                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
 class T5DenseReluDense(nn.Module):
     def __init__(self, config: T5Config):
         super().__init__()
         self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
         self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
-        self.dropout = nn.Dropout(config.dropout_rate)
+        self.dropout = T5ClampedDropout(config)
         self.relu_act = ACT2FN["relu"]

     def forward(self, hidden_states):
@@ -298,7 +323,7 @@ def __init__(self, config: T5Config):
         self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
         self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
         self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
-        self.dropout = nn.Dropout(config.dropout_rate)
+        self.dropout = T5ClampedDropout(config)
         self.gelu_act = ACT2FN["gelu_new"]

     def forward(self, hidden_states):
@@ -323,7 +348,7 @@ def __init__(self, config: T5Config):
             )

         self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
-        self.dropout = nn.Dropout(config.dropout_rate)
+        self.dropout = T5ClampedDropout(config)

     def forward(self, hidden_states):
         forwarded_states = self.layer_norm(hidden_states)
@@ -560,7 +585,7 @@ def __init__(self, config, has_relative_attention_bias=False):
         super().__init__()
         self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
         self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
-        self.dropout = nn.Dropout(config.dropout_rate)
+        self.dropout = T5ClampedDropout(config)

     def forward(
         self,
@@ -592,7 +617,7 @@ def __init__(self, config):
         super().__init__()
         self.EncDecAttention = T5Attention(config, has_relative_attention_bias=False)
         self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
-        self.dropout = nn.Dropout(config.dropout_rate)
+        self.dropout = T5ClampedDropout(config)

     def forward(
         self,
@@ -627,6 +652,7 @@ class T5Block(nn.Module):
     def __init__(self, config, has_relative_attention_bias=False):
         super().__init__()
         self.is_decoder = config.is_decoder
+        self.ort = config.ort
         self.layer = nn.ModuleList()
         self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))
         if self.is_decoder:
@@ -680,9 +706,16 @@ def forward(
         attention_outputs = self_attention_outputs[2:]  # Keep self-attention outputs and relative position weights

         # clamp inf values to enable fp16 training
-        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
-            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
-            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+        if self.ort:
+            # Remove data-based control flow for static graph
+            if hidden_states.dtype == torch.float16:
+                clamp_value = torch.where(torch.isinf(hidden_states).any(), torch.finfo(hidden_states.dtype).max - 1000,
+                                          torch.finfo(hidden_states.dtype).max)
+                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+        else:
+            if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
+                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

         do_cross_attention = self.is_decoder and encoder_hidden_states is not None
         if do_cross_attention:
@@ -707,9 +740,16 @@ def forward(
             hidden_states = cross_attention_outputs[0]

             # clamp inf values to enable fp16 training
-            if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
-                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
-                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+            if self.ort:
+                # Remove data-based control flow for static graph
+                if hidden_states.dtype == torch.float16:
+                    clamp_value = torch.where(torch.isinf(hidden_states).any(), torch.finfo(hidden_states.dtype).max - 1000,
+                                              torch.finfo(hidden_states.dtype).max)
+                    hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+            else:
+                if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
+                    clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+                    hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

             # Combine self attn and cross attn key value states
             if present_key_value_state is not None:
@@ -722,9 +762,16 @@ def forward(
         hidden_states = self.layer[-1](hidden_states)

         # clamp inf values to enable fp16 training
-        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
-            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
-            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+        if self.ort:
+            # Remove data-based control flow for static graph
+            if hidden_states.dtype == torch.float16:
+                clamp_value = torch.where(torch.isinf(hidden_states).any(), torch.finfo(hidden_states.dtype).max - 1000,
+                                          torch.finfo(hidden_states.dtype).max)
+                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+        else:
+            if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
+                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

         outputs = (hidden_states,)

@@ -843,7 +890,7 @@ def __init__(self, config, embed_tokens=None):
         self.block = nn.ModuleList(
             [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]
         )
         self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
-        self.dropout = nn.Dropout(config.dropout_rate)
+        self.dropout = T5ClampedDropout(config)

         # Initialize weights and apply final processing
         self.post_init()
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 1a8cac0722e8..32c51927db21 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -1068,7 +1068,12 @@ def _wrap_model(self, model, training=True):
         # train/eval could be run multiple-times - if already wrapped, don't re-wrap it again
         if unwrap_model(model) is not model:
-            return model
+            if self.args.ort:
+                from torch_ort import ORTModule
+                if type(model) is not ORTModule:
+                    return model
+            else:
+                return model

         # Mixed precision training with apex (torch < 1.6)
         if self.use_apex and training:
@@ -1255,7 +1260,14 @@ def train(
         delay_optimizer_creation = (
             self.sharded_ddp is not None and self.sharded_ddp != ShardedDDPOption.SIMPLE or is_sagemaker_mp_enabled()
         )
+        if args.ort:
+            from torch_ort import ORTModule
+            logger.info("Converting to ORTModule ...")
+            model = ORTModule(self.model)
+            self.model_wrapped = model
         if args.deepspeed:
+            if args.ort:
+                self.model = model
             deepspeed_engine, optimizer, lr_scheduler = deepspeed_init(
                 self, num_training_steps=max_steps, resume_from_checkpoint=resume_from_checkpoint
             )
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index cc0a5ec83570..dc46637436ad 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -335,6 +335,8 @@ class TrainingArguments:
             Use [Deepspeed](https://github.com/microsoft/deepspeed). This is an experimental feature and its API may
             evolve in the future. The value is either the location of DeepSpeed json config file (e.g.,
             `ds_config.json`) or an already loaded json file as a `dict`"
+        ort (`bool`, *optional*, defaults to `False`):
+            Whether or not to wrap the model in `ORTModule` and train it with ONNX Runtime (ORT).
         label_smoothing_factor (`float`, *optional*, defaults to 0.0):
             The label smoothing factor to use. Zero means no label smoothing, otherwise the underlying onehot-encoded
             labels are changed from 0s and 1s to `label_smoothing_factor/num_labels` and `1 - label_smoothing_factor +
@@ -684,6 +686,10 @@ class TrainingArguments:
             "help": "Enable deepspeed and pass the path to deepspeed json config file (e.g. ds_config.json) or an already loaded json file as a dict"
         },
     )
+    ort: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Whether or not to train with ONNX Runtime (ORT) by wrapping the model in ORTModule"},
+    )
     label_smoothing_factor: float = field(
         default=0.0, metadata={"help": "The label smoothing epsilon to apply (zero means no label smoothing)."}
    )
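
Usage sketch (not part of the patch above; a minimal example under the assumption that this branch is installed and that the `torch_ort` package providing `ORTModule` is available). The new flag starts life as `TrainingArguments.ort`, is copied into the model config by the example scripts, switches the DeBERTa-v2/T5 dropout and clamping code paths shown above, and triggers the `ORTModule` wrapping in `Trainer.train()`:

    # Verify that ort=True reaches the config and the model; the checkpoint name is only illustrative.
    from transformers import AutoConfig, AutoModelForSequenceClassification

    config = AutoConfig.from_pretrained("microsoft/deberta-v2-xlarge", ort=True)
    assert config.ort is True

    model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v2-xlarge", config=config)
    # With config.ort set, ContextPooler uses the plain TorchNNDropout wrapper instead of StableDropout
    print(type(model.pooler.dropout).__name__)

The updated example scripts expose the same switch on the command line, e.g. `python examples/pytorch/text-classification/run_glue.py ... --ort true`, which sets `training_args.ort` and therefore both the config flag above and the `ORTModule` conversion during `Trainer.train()`.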