diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py
index 477ccff95052..ec3ae674300d 100755
--- a/examples/pytorch/language-modeling/run_mlm.py
+++ b/examples/pytorch/language-modeling/run_mlm.py
@@ -329,6 +329,7 @@ def main():
         "cache_dir": model_args.cache_dir,
         "revision": model_args.model_revision,
         "use_auth_token": True if model_args.use_auth_token else None,
+        "ort": True if training_args.ort else None,
     }
     if model_args.config_name:
         config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py
index 242e83427389..f18a935d3917 100755
--- a/examples/pytorch/question-answering/run_qa.py
+++ b/examples/pytorch/question-answering/run_qa.py
@@ -299,6 +299,7 @@ def main():
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
+        ort=training_args.ort,
     )
     tokenizer = AutoTokenizer.from_pretrained(
         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py
index c35b636d7dd9..322e6c45f2a8 100755
--- a/examples/pytorch/summarization/run_summarization.py
+++ b/examples/pytorch/summarization/run_summarization.py
@@ -376,6 +376,7 @@ def main():
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
+        ort=True if training_args.ort else None,
     )
     tokenizer = AutoTokenizer.from_pretrained(
         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py
index b15a0378ca7d..226a48fdd1e1 100755
--- a/examples/pytorch/text-classification/run_glue.py
+++ b/examples/pytorch/text-classification/run_glue.py
@@ -337,6 +337,7 @@ def main():
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
+        ort=True if training_args.ort else None,
     )
     tokenizer = AutoTokenizer.from_pretrained(
         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py
index 6f2630104f7e..af5697d0cb8a 100755
--- a/examples/pytorch/translation/run_translation.py
+++ b/examples/pytorch/translation/run_translation.py
@@ -341,6 +341,7 @@ def main():
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
+        ort=True if training_args.ort else None,
     )
     tokenizer = AutoTokenizer.from_pretrained(
         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index f66b5734bd98..c9a05d38ad0a 100755
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -236,6 +236,10 @@ class PretrainedConfig(PushToHubMixin):
         use_bfloat16 (`bool`, *optional*, defaults to `False`):
             Whether or not the model should use BFloat16 scalars (only used by some TensorFlow models).
+
+        ONNX Runtime specific parameters
+
+        ort (`bool`, *optional*, defaults to `False`): Whether or not the model should use ONNX Runtime (ORT).
     """

     model_type: str = ""
     is_composition: bool = False
@@ -260,6 +264,7 @@ def __init__(self, **kwargs):
         self.torchscript = kwargs.pop("torchscript", False)  # Only used by PyTorch models
         self.torch_dtype = kwargs.pop("torch_dtype", None)  # Only used by PyTorch models
         self.use_bfloat16 = kwargs.pop("use_bfloat16", False)
+        self.ort = kwargs.pop("ort", False)
         self.pruned_heads = kwargs.pop("pruned_heads", {})
         self.tie_word_embeddings = kwargs.pop(
             "tie_word_embeddings", True
diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py
index c779267b7b38..996305081ed2 100644
--- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py
@@ -35,6 +35,7 @@
 from ...pytorch_utils import softmax_backward_data
 from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_deberta_v2 import DebertaV2Config
+from .jit_tracing import traceable


 logger = logging.get_logger(__name__)
@@ -56,7 +57,10 @@ class ContextPooler(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.dense = nn.Linear(config.pooler_hidden_size, config.pooler_hidden_size)
-        self.dropout = StableDropout(config.pooler_dropout)
+        if config.ort:
+            self.dropout = TorchNNDropout(config.pooler_dropout)
+        else:
+            self.dropout = StableDropout(config.pooler_dropout)
         self.config = config

     def forward(self, hidden_states):
@@ -74,6 +78,7 @@ def output_dim(self):
         return self.config.hidden_size


+@traceable
 # Copied from transformers.models.deberta.modeling_deberta.XSoftmax with deberta->deberta_v2
 class XSoftmax(torch.autograd.Function):
     """
@@ -164,7 +169,7 @@ def get_mask(input, local_context):

     return mask, dropout

-
+@traceable
 # Copied from transformers.models.deberta.modeling_deberta.XDropout
 class XDropout(torch.autograd.Function):
     """Optimized dropout function to save computation and memory by using mask operation instead of multiplication."""
@@ -187,6 +192,9 @@ def backward(ctx, grad_output):
         else:
             return grad_output, None

+class TorchNNDropout(torch.nn.Dropout):
+    def __init__(self, drop_prob):
+        super().__init__(drop_prob)

 # Copied from transformers.models.deberta.modeling_deberta.StableDropout
 class StableDropout(nn.Module):
@@ -244,7 +252,10 @@ def __init__(self, config):
         super().__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
         self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
-        self.dropout = StableDropout(config.hidden_dropout_prob)
+        if config.ort:
+            self.dropout = TorchNNDropout(config.hidden_dropout_prob)
+        else:
+            self.dropout = StableDropout(config.hidden_dropout_prob)

     def forward(self, hidden_states, input_tensor):
         hidden_states = self.dense(hidden_states)
@@ -312,7 +323,10 @@ def __init__(self, config):
         super().__init__()
         self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
         self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
-        self.dropout = StableDropout(config.hidden_dropout_prob)
+        if config.ort:
+            self.dropout = TorchNNDropout(config.hidden_dropout_prob)
+        else:
+            self.dropout = StableDropout(config.hidden_dropout_prob)
         self.config = config

     def forward(self, hidden_states, input_tensor):
@@ -367,7 +381,10 @@ def __init__(self, config):
             config.hidden_size, config.hidden_size, kernel_size, padding=(kernel_size - 1) // 2, groups=groups
         )
         self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
-        self.dropout = StableDropout(config.hidden_dropout_prob)
+        if config.ort:
+            self.dropout = TorchNNDropout(config.hidden_dropout_prob)
+        else:
+            self.dropout = StableDropout(config.hidden_dropout_prob)
         self.config = config

     def forward(self, hidden_states, residual_states, input_mask):
@@ -624,8 +641,10 @@ def __init__(self, config):
             self.pos_ebd_size = self.max_relative_positions
             if self.position_buckets > 0:
                 self.pos_ebd_size = self.position_buckets
-
-            self.pos_dropout = StableDropout(config.hidden_dropout_prob)
+            if config.ort:
+                self.pos_dropout = TorchNNDropout(config.hidden_dropout_prob)
+            else:
+                self.pos_dropout = StableDropout(config.hidden_dropout_prob)

         if not self.share_att_key:
             if "c2p" in self.pos_att_type:
@@ -633,7 +652,10 @@ def __init__(self, config):
             if "p2c" in self.pos_att_type:
                 self.pos_query_proj = nn.Linear(config.hidden_size, self.all_head_size)

-        self.dropout = StableDropout(config.attention_probs_dropout_prob)
+        if config.ort:
+            self.dropout = TorchNNDropout(config.attention_probs_dropout_prob)
+        else:
+            self.dropout = StableDropout(config.attention_probs_dropout_prob)

     def transpose_for_scores(self, x, attention_heads):
         new_x_shape = x.size()[:-1] + (attention_heads, -1)
@@ -824,7 +846,10 @@ def __init__(self, config):
         if self.embedding_size != config.hidden_size:
             self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False)
         self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
-        self.dropout = StableDropout(config.hidden_dropout_prob)
+        if config.ort:
+            self.dropout = TorchNNDropout(config.hidden_dropout_prob)
+        else:
+            self.dropout = StableDropout(config.hidden_dropout_prob)
         self.config = config

         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
@@ -1237,7 +1262,10 @@ def __init__(self, config):
         self.classifier = nn.Linear(output_dim, num_labels)
         drop_out = getattr(config, "cls_dropout", None)
         drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
-        self.dropout = StableDropout(drop_out)
+        if config.ort:
+            self.dropout = TorchNNDropout(drop_out)
+        else:
+            self.dropout = StableDropout(drop_out)

         # Initialize weights and apply final processing
         self.post_init()
diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py
index 3b5f6a9a6ba3..f792906a116c 100644
--- a/src/transformers/models/roberta/modeling_roberta.py
+++ b/src/transformers/models/roberta/modeling_roberta.py
@@ -1467,6 +1467,7 @@ class RobertaForQuestionAnswering(RobertaPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
+        self.ort = config.ort

         self.roberta = RobertaModel(config, add_pooling_layer=False)
         self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
@@ -1536,7 +1537,7 @@ def forward(
             if len(end_positions.size()) > 1:
                 end_positions = end_positions.squeeze(-1)
             # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1)
+            ignored_index = start_logits.size(1) if not self.ort else 344
             start_positions = start_positions.clamp(0, ignored_index)
             end_positions = end_positions.clamp(0, ignored_index)

diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py
index 630e9dd17aa5..6091f3e6cc4b 100644
--- a/src/transformers/models/t5/modeling_t5.py
+++ b/src/transformers/models/t5/modeling_t5.py
@@ -276,12 +276,37 @@ def forward(self, hidden_states):
         pass


+class T5ClampedDropout(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.ort = config.ort
+        self.dropout = nn.Dropout(config.dropout_rate)
+        self.dropout_rate = config.dropout_rate
+
+    def forward(self, hidden_states):
+        # clamp inf values to enable fp16 training
+        if self.ort:
+            # Remove data-based control flow for static graph
+            if hidden_states.dtype == torch.float16:
+                clamp_value = torch.where(torch.isinf(hidden_states).any(), torch.finfo(hidden_states.dtype).max - 1000,
+                                          torch.finfo(hidden_states.dtype).max)
+                clamp_value = (1.0 - self.dropout_rate) * clamp_value
+                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+        else:
+            if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
+                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
 class T5DenseReluDense(nn.Module):
     def __init__(self, config: T5Config):
         super().__init__()
         self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
         self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
-        self.dropout = nn.Dropout(config.dropout_rate)
+        self.dropout = T5ClampedDropout(config)
         self.relu_act = ACT2FN["relu"]

     def forward(self, hidden_states):
@@ -298,7 +323,7 @@ def __init__(self, config: T5Config):
         self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
         self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
         self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
-        self.dropout = nn.Dropout(config.dropout_rate)
+        self.dropout = T5ClampedDropout(config)
         self.gelu_act = ACT2FN["gelu_new"]

     def forward(self, hidden_states):
@@ -323,7 +348,7 @@ def __init__(self, config: T5Config):
             )

         self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
-        self.dropout = nn.Dropout(config.dropout_rate)
+        self.dropout = T5ClampedDropout(config)

     def forward(self, hidden_states):
         forwarded_states = self.layer_norm(hidden_states)
@@ -560,7 +585,7 @@ def __init__(self, config, has_relative_attention_bias=False):
         super().__init__()
         self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
         self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
-        self.dropout = nn.Dropout(config.dropout_rate)
+        self.dropout = T5ClampedDropout(config)

     def forward(
         self,
@@ -592,7 +617,7 @@ def __init__(self, config):
         super().__init__()
         self.EncDecAttention = T5Attention(config, has_relative_attention_bias=False)
         self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
-        self.dropout = nn.Dropout(config.dropout_rate)
+        self.dropout = T5ClampedDropout(config)

     def forward(
         self,
@@ -627,6 +652,7 @@ class T5Block(nn.Module):
     def __init__(self, config, has_relative_attention_bias=False):
         super().__init__()
         self.is_decoder = config.is_decoder
+        self.ort = config.ort
         self.layer = nn.ModuleList()
         self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))
         if self.is_decoder:
@@ -680,9 +706,16 @@ def forward(
         attention_outputs = self_attention_outputs[2:]  # Keep self-attention outputs and relative position weights

         # clamp inf values to enable fp16 training
-        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
-            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
-            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+        if self.ort:
+            # Remove data-based control flow for static graph
+            if hidden_states.dtype == torch.float16:
+                clamp_value = torch.where(torch.isinf(hidden_states).any(), torch.finfo(hidden_states.dtype).max - 1000,
+                                          torch.finfo(hidden_states.dtype).max)
+                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+        else:
+            if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
+                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

         do_cross_attention = self.is_decoder and encoder_hidden_states is not None
         if do_cross_attention:
@@ -707,9 +740,16 @@ def forward(
             hidden_states = cross_attention_outputs[0]

             # clamp inf values to enable fp16 training
-            if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
-                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
-                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+            if self.ort:
+                # Remove data-based control flow for static graph
+                if hidden_states.dtype == torch.float16:
+                    clamp_value = torch.where(torch.isinf(hidden_states).any(), torch.finfo(hidden_states.dtype).max - 1000,
+                                              torch.finfo(hidden_states.dtype).max)
+                    hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+            else:
+                if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
+                    clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+                    hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

             # Combine self attn and cross attn key value states
             if present_key_value_state is not None:
@@ -722,9 +762,16 @@ def forward(
         hidden_states = self.layer[-1](hidden_states)

         # clamp inf values to enable fp16 training
-        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
-            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
-            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+        if self.ort:
+            # Remove data-based control flow for static graph
+            if hidden_states.dtype == torch.float16:
+                clamp_value = torch.where(torch.isinf(hidden_states).any(), torch.finfo(hidden_states.dtype).max - 1000,
+                                          torch.finfo(hidden_states.dtype).max)
+                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+        else:
+            if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
+                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

         outputs = (hidden_states,)

@@ -843,7 +890,7 @@ def __init__(self, config, embed_tokens=None):
         self.block = nn.ModuleList(
             [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]
         )
         self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
-        self.dropout = nn.Dropout(config.dropout_rate)
+        self.dropout = T5ClampedDropout(config)

         # Initialize weights and apply final processing
         self.post_init()
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 1a8cac0722e8..32c51927db21 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -1068,7 +1068,12 @@ def _wrap_model(self, model, training=True):
         # train/eval could be run multiple-times - if already wrapped, don't re-wrap it again
         if unwrap_model(model) is not model:
-            return model
+            if self.args.ort:
+                from torch_ort import ORTModule
+                if type(model) is not ORTModule:
+                    return model
+            else:
+                return model

         # Mixed precision training with apex (torch < 1.6)
         if self.use_apex and training:
@@ -1255,7 +1260,14 @@ def train(
         delay_optimizer_creation = (
             self.sharded_ddp is not None and self.sharded_ddp != ShardedDDPOption.SIMPLE or is_sagemaker_mp_enabled()
         )
+        if args.ort:
+            from torch_ort import ORTModule
+            logger.info("Converting to ORTModule ...")
+            model = ORTModule(self.model)
+            self.model_wrapped = model
         if args.deepspeed:
+            if args.ort:
+                self.model = model
             deepspeed_engine, optimizer, lr_scheduler = deepspeed_init(
                 self, num_training_steps=max_steps, resume_from_checkpoint=resume_from_checkpoint
             )
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index cc0a5ec83570..dc46637436ad 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -335,6 +335,8 @@ class TrainingArguments:
             Use [Deepspeed](https://github.com/microsoft/deepspeed). This is an experimental feature and its API may
             evolve in the future. The value is either the location of DeepSpeed json config file (e.g.,
             `ds_config.json`) or an already loaded json file as a `dict`"
+        ort (`bool`, *optional*, defaults to `False`):
+            Whether or not to wrap the model in `ORTModule` and train it with ONNX Runtime (ORT).
         label_smoothing_factor (`float`, *optional*, defaults to 0.0):
             The label smoothing factor to use. Zero means no label smoothing, otherwise the underlying onehot-encoded
             labels are changed from 0s and 1s to `label_smoothing_factor/num_labels` and `1 - label_smoothing_factor +
@@ -684,6 +686,10 @@ class TrainingArguments:
             "help": "Enable deepspeed and pass the path to deepspeed json config file (e.g. ds_config.json) or an already loaded json file as a dict"
         },
     )
+    ort: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Whether or not to train with ONNX Runtime (ORT) by wrapping the model in ORTModule"},
+    )
     label_smoothing_factor: float = field(
         default=0.0, metadata={"help": "The label smoothing epsilon to apply (zero means no label smoothing)."}
    )
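
Usage sketch (not part of the patch above; a minimal example under the assumption that this branch is installed and that the `torch_ort` package providing `ORTModule` is available). The new flag starts life as `TrainingArguments.ort`, is copied into the model config by the example scripts, switches the DeBERTa-v2/T5 dropout and clamping code paths shown above, and triggers the `ORTModule` wrapping in `Trainer.train()`:

    # Verify that ort=True reaches the config and the model; the checkpoint name is only illustrative.
    from transformers import AutoConfig, AutoModelForSequenceClassification

    config = AutoConfig.from_pretrained("microsoft/deberta-v2-xlarge", ort=True)
    assert config.ort is True

    model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v2-xlarge", config=config)
    # With config.ort set, ContextPooler uses the plain TorchNNDropout wrapper instead of StableDropout
    print(type(model.pooler.dropout).__name__)

The updated example scripts expose the same switch on the command line, e.g. `python examples/pytorch/text-classification/run_glue.py ... --ort true`, which sets `training_args.ort` and therefore both the config flag above and the `ORTModule` conversion during `Trainer.train()`.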