179 commits
c8cf718
First version - OPT model
younesbelkada May 4, 2022
9ee623d
Final changes
younesbelkada May 4, 2022
0484ca1
few changes
younesbelkada May 4, 2022
b931db8
few changes
younesbelkada May 4, 2022
681dfc5
fix style issues
younesbelkada May 4, 2022
1e21983
few changes
younesbelkada May 4, 2022
1363221
Update src/transformers/models/auto/tokenization_auto.py
younesbelkada May 4, 2022
8427279
add gen tests
younesbelkada May 4, 2022
5e8e2f5
few changes
younesbelkada May 4, 2022
be0e434
few changes
younesbelkada May 4, 2022
51db79e
some changes
younesbelkada May 5, 2022
99001d3
fix code quality
younesbelkada May 5, 2022
a777bbc
major changes
younesbelkada May 6, 2022
38f7463
rm useless classes
younesbelkada May 6, 2022
c6f3a69
Removed autodoc calls to non-existent classes
ArthurZucker May 6, 2022
30d3db2
Update src/transformers/__init__.py
younesbelkada May 6, 2022
f903445
Update src/transformers/__init__.py
younesbelkada May 6, 2022
bb4ab4a
Update src/transformers/models/auto/modeling_tf_auto.py
younesbelkada May 6, 2022
2a6e288
Replaced OPTTokenizer with GPT2 tokenizer
ArthurZucker May 6, 2022
cb853fd
added GPT2Tokenizer.from_pretrained("patrickvonplaten/opt_gpt2_tokeni…
ArthurZucker May 6, 2022
337e71f
Removed OPTTokenizer
ArthurZucker May 6, 2022
0d9130f
make style
ArthurZucker May 6, 2022
290b7f0
Make style replaces
ArthurZucker May 6, 2022
096eb74
make repo consistency
ArthurZucker May 6, 2022
020843a
Removed PretrainedOPTModel
ArthurZucker May 6, 2022
c63d9f8
fix opt.mdx, removed other heads
ArthurZucker May 6, 2022
8b6e496
fix init, removed 3 heads
ArthurZucker May 6, 2022
0303f2b
removed heads
ArthurZucker May 6, 2022
2c0327d
finished cleaning head
ArthurZucker May 6, 2022
4aa6ab2
removed sequence classification and question answering
ArthurZucker May 6, 2022
752f512
removed unused imports
ArthurZucker May 6, 2022
14eeb13
removed useless dummy object for QA, SC and CG
ArthurZucker May 6, 2022
9c96f09
removed tests for removed useless dummy object for QA, SC and CG
ArthurZucker May 6, 2022
54fc962
Removed head_mask for encoder layers, which don't exist
ArthurZucker May 6, 2022
06f42ca
fixed test
ArthurZucker May 6, 2022
76e52ac
fix line
ArthurZucker May 6, 2022
556c2f4
added OPT to toctree
ArthurZucker May 6, 2022
1460025
Updated model path with pushed weights
ArthurZucker May 6, 2022
db100a5
fix model path
ArthurZucker May 6, 2022
d16d40d
Merge branch 'main' of https://github.com/huggingface/transformers in…
ArthurZucker May 6, 2022
c10f347
fixed code quality
ArthurZucker May 6, 2022
f1fe820
fixed embeddings and generation tests
ArthurZucker May 6, 2022
9b9c65b
update paths
ArthurZucker May 7, 2022
4fb9608
clean comments
ArthurZucker May 7, 2022
ab57047
removed OPTClassificationHead for sentence classification
ArthurZucker May 8, 2022
0c1c791
renamed hidden layer
ArthurZucker May 9, 2022
ac50b44
renamed num layers to standard num_hidden_layers
ArthurZucker May 9, 2022
1505de5
num_attention_heads fix
ArthurZucker May 9, 2022
8ace67b
changes for 125m
younesbelkada May 9, 2022
80296cb
Merge branch 'opt-350-m' of https://github.com/younesbelkada/transfor…
younesbelkada May 9, 2022
752c1d2
add first version for 125m
younesbelkada May 9, 2022
77e6e04
add first version - flax
younesbelkada May 9, 2022
1564dac
Merge branch 'opt-350-m' of https://github.com/younesbelkada/transfor…
younesbelkada May 9, 2022
abd1f3c
add new version
younesbelkada May 9, 2022
23ff89c
Merge branch 'opt-350-m' of https://github.com/younesbelkada/transfor…
younesbelkada May 9, 2022
5c5c858
causal LM output
ArthurZucker May 9, 2022
41fad01
Merge branch 'opt-350-m' of github.com:younesbelkada/transformers int…
ArthurZucker May 9, 2022
27b55c9
replace output type with BaseModelOutputWithPastAndCrossAttentions
ArthurZucker May 9, 2022
aebd19e
revert working config from 150m to 350m
ArthurZucker May 9, 2022
d0723aa
clean
ArthurZucker May 9, 2022
7575749
removed decoder input ids
ArthurZucker May 9, 2022
66e8298
fixed embed dim
ArthurZucker May 9, 2022
8d4920e
more embed_dim issues
ArthurZucker May 9, 2022
c005840
make style + removed enc_dec test
ArthurZucker May 9, 2022
84eb497
update flax model
ArthurZucker May 9, 2022
043a109
removed troublesome copy
ArthurZucker May 9, 2022
8ba7cbc
added is_encoder_decoder=False to config
ArthurZucker May 9, 2022
2099b5f
added set_input_embeddings function to model class
ArthurZucker May 9, 2022
1c9580f
requires torch on embed test
ArthurZucker May 9, 2022
9f6291d
use head mask instead of decoder head mask input param; solves a test
ArthurZucker May 9, 2022
740fcf5
8 tests remaining, update
ArthurZucker May 9, 2022
f8c276b
Updated create_and_check_decoder_model_past_large_inputs
ArthurZucker May 9, 2022
fff035f
Make style
ArthurZucker May 9, 2022
30ed9f6
update opt tokenizer with condition
ArthurZucker May 9, 2022
69c7ae6
make style
ArthurZucker May 9, 2022
ff09958
See if I can push
patrickvonplaten May 10, 2022
0555b92
some clean up
patrickvonplaten May 10, 2022
5491431
remove linear head hack
patrickvonplaten May 10, 2022
521822f
save intermediate
patrickvonplaten May 10, 2022
61e8023
save correct attention
patrickvonplaten May 10, 2022
7b27a91
add copied from from bart
patrickvonplaten May 10, 2022
26729d7
Merge branch 'main' of https://github.com/huggingface/transformers in…
patrickvonplaten May 10, 2022
7661453
Update src/transformers/models/opt/modeling_opt.py
ArthurZucker May 11, 2022
25a40b1
fix part of the reviews
ArthurZucker May 11, 2022
aefa63d
Merge pull request #2 from younesbelkada/opt_branch/opt-350-m
ArthurZucker May 11, 2022
f3b5e24
same changes in naming / conversion
patrickvonplaten May 11, 2022
0365e27
correct mask
patrickvonplaten May 11, 2022
929be23
more fixes
patrickvonplaten May 11, 2022
85ce8e8
fix mask
patrickvonplaten May 11, 2022
72c30c0
get 125m, 6.7b to work
patrickvonplaten May 11, 2022
c969fbf
fixed positional embedding length when past key value is provided
ArthurZucker May 11, 2022
8a066fa
Added do_layer_norm
ArthurZucker May 11, 2022
c2f79c7
solved mismatch in load dictionary
ArthurZucker May 11, 2022
b01b78c
clean up prepare opt input dict
ArthurZucker May 11, 2022
94ebdce
fixed past key value as bool
ArthurZucker May 11, 2022
dda1063
fix previous
ArthurZucker May 11, 2022
2c7ccae
fixed return_dict=False tuple issue
ArthurZucker May 11, 2022
372378f
All tests are passing
ArthurZucker May 11, 2022
bc0f722
Make style
ArthurZucker May 11, 2022
7344338
Ignore non-tested OPTDecoder
ArthurZucker May 11, 2022
ff32af1
make fix-copies
ArthurZucker May 11, 2022
50702e3
make repo consistency
ArthurZucker May 11, 2022
1127c90
small fix
ArthurZucker May 11, 2022
70c977e
removed useless @torch.no_grad decorator
ArthurZucker May 11, 2022
6c911c8
make style
ArthurZucker May 11, 2022
1443b19
fix previous opt test
ArthurZucker May 11, 2022
9d0986d
style
ArthurZucker May 11, 2022
c5189b8
make style
ArthurZucker May 11, 2022
55eac1d
added opt documentation
ArthurZucker May 11, 2022
6cc44d1
update OPT_PRETRAINED_MODEL_ARCHIVE_LIST
ArthurZucker May 11, 2022
e9fb2d8
up
patrickvonplaten May 11, 2022
3cdac3b
more fixes
patrickvonplaten May 11, 2022
e9dae8e
model & config work
patrickvonplaten May 11, 2022
d274b01
Update src/transformers/models/opt/modeling_opt.py
ArthurZucker May 11, 2022
2d64db2
Update src/transformers/models/opt/modeling_opt.py
ArthurZucker May 11, 2022
51d0817
Update src/transformers/models/opt/modeling_opt.py
ArthurZucker May 11, 2022
ba9cc6c
added comment on padding hack (+2)
ArthurZucker May 11, 2022
81a6bd2
cleanup
ArthurZucker May 11, 2022
b62cb76
review update
ArthurZucker May 11, 2022
451793e
docstring for missing arg
ArthurZucker May 11, 2022
a7b5200
update pretrained map
ArthurZucker May 11, 2022
e81d5e9
Update docs/source/en/model_doc/opt.mdx
ArthurZucker May 11, 2022
9bc53b1
Update docs/source/en/model_doc/opt.mdx
ArthurZucker May 11, 2022
fcc77f4
Update docs/source/en/model_doc/opt.mdx
ArthurZucker May 11, 2022
3486a22
Update src/transformers/models/opt/__init__.py
ArthurZucker May 11, 2022
787825d
update path and tests
ArthurZucker May 11, 2022
28c68ea
make style
ArthurZucker May 11, 2022
648b3b8
styling
ArthurZucker May 11, 2022
2bc543d
make consistency
ArthurZucker May 11, 2022
bc136fb
Update based on reviews
ArthurZucker May 11, 2022
68e1f79
add gpt2 tok new
patrickvonplaten May 11, 2022
2089155
more tok fixes
patrickvonplaten May 11, 2022
3d4ece4
Update docs/source/en/model_doc/opt.mdx
ArthurZucker May 11, 2022
1dd7006
Update docs/source/en/model_doc/opt.mdx
ArthurZucker May 11, 2022
3fecdd5
Update docs/source/en/model_doc/opt.mdx
ArthurZucker May 11, 2022
1ab56a3
Update src/transformers/models/opt/modeling_opt.py
ArthurZucker May 11, 2022
b7fffb7
Update tests/models/opt/test_modeling_opt.py
ArthurZucker May 11, 2022
2ef13f2
Update src/transformers/models/opt/modeling_opt.py
ArthurZucker May 11, 2022
a3549a0
Update src/transformers/models/opt/modeling_opt.py
ArthurZucker May 11, 2022
98a55a1
Update src/transformers/models/opt/modeling_opt.py
ArthurZucker May 11, 2022
136ac08
Update src/transformers/models/opt/modeling_opt.py
ArthurZucker May 11, 2022
d37e397
Update src/transformers/models/opt/modeling_opt.py
ArthurZucker May 11, 2022
63657bd
Apply suggestions from code review
patrickvonplaten May 12, 2022
74efd29
make style
patrickvonplaten May 12, 2022
9d852a8
make tokenizer auto tests pass
patrickvonplaten May 12, 2022
de1a94d
apply Lysandre suggestion
patrickvonplaten May 12, 2022
72018c2
[trainer] sharded _load_best_model (#17150)
stas00 May 10, 2022
b81e148
[Deepspeed] add many more models to the model zoo test (#12695)
stas00 May 10, 2022
b08145b
Fixing the output of code examples in the preprocessing chapter (#17162)
HallerPatrick May 10, 2022
b310357
missing file (#17164)
stas00 May 10, 2022
1f4ed0f
Add MLFLOW_FLATTEN_PARAMS support in MLflowCallback (#17148)
orieg May 10, 2022
5544db3
Fix template init (#17163)
sgugger May 10, 2022
4bf7b9d
Add DebertaV2ForMultipleChoice (#17135)
zphang May 10, 2022
2b75f63
MobileBERT tokenizer tests (#16896)
leondz May 10, 2022
6e1942c
[M2M100 doc] remove duplicate example (#17175)
patil-suraj May 11, 2022
f537df3
Extend Transformers Trainer Class to Enable PyTorch SGD/Adagrad Optim…
jianan-gu May 11, 2022
5ca32d7
propagate "attention_mask" dtype for "use_past" in OnnxConfig.generat…
arampacha May 11, 2022
6d54aef
Fix repo consistency
sgugger May 11, 2022
cd5d51c
Convert image to rgb for clip model (#17101)
hengkuanwee May 11, 2022
7de0278
Add missing RetriBERT tokenizer tests (#17017)
mpoemsl May 11, 2022
aef4e0c
[WIP] Enable reproducibility for distributed trainings (#16907)
hasansalimkanmaz May 11, 2022
1b98a3e
Remove unnecessary columns for all dataset types in `Trainer` (#17166)
Yard1 May 11, 2022
5d2880a
Fix LED documentation (#17181)
manuelciosici May 11, 2022
320cc6d
Ensure tensors are at least 1d for pad and concat (#17179)
Yard1 May 11, 2022
69fde79
add shift_tokens_right in FlaxMT5 (#17188)
patil-suraj May 11, 2022
b591cfb
Remove columns before passing to data collator (#17187)
Yard1 May 11, 2022
682995b
[feat] Add FLAVA model (#16654)
apsdehal May 11, 2022
6c22b82
Remove duplicated os.path.join (#17192)
shijie-wu May 12, 2022
bf5a316
Spanish translation of philosophy.mdx #15947 (#16922)
jkmg May 12, 2022
54fb499
Added es version of language_modeling.mdx doc (#17021)
jQuinRivero May 12, 2022
afddbb2
Documentation: Spanish translation of fast_tokenizers.mdx (#16882)
jloayza10 May 12, 2022
01dc139
Translate index.mdx (to ES) and add Spanish models to quicktour.mdx e…
omarespejel May 12, 2022
47e9918
Fix style error in Spanish docs (#17197)
osanseviero May 12, 2022
5518389
Fix contents in index.mdx to match docs' sidebar (#17198)
omarespejel May 12, 2022
c48413d
finish tests
patrickvonplaten May 12, 2022
1341b5f
add some good tokenizer tests
patrickvonplaten May 12, 2022
4384047
update flax code
ArthurZucker May 13, 2022
02b2500
update and clean
ArthurZucker May 13, 2022
e2cbaf0
Merge branch 'main' of https://github.com/huggingface/transformers in…
ArthurZucker May 13, 2022
23 changes: 23 additions & 0 deletions docs/source/en/model_doc/opt.mdx
@@ -45,3 +45,26 @@ The original code can be found [here](https://github.com/facebookresearch/metase
[[autodoc]] OPTForCausalLM
- forward

## TFOPTModel

[[autodoc]] TFOPTModel
- call

## TFOPTPretrainedModel

[[autodoc]] TFOPTPretrainedModel
- call


## FlaxOPTModel

[[autodoc]] FlaxOPTModel
- __call__
- encode
- decode


## FlaxOPTForCausalLM

[[autodoc]] FlaxOPTForCausalLM
- __call__
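
For orientation, here is a minimal usage sketch for the Flax class documented above. It is illustrative only: the checkpoint name `facebook/opt-350m` and the use of `GPT2Tokenizer` are assumptions drawn from this PR's commit messages, not from the diff itself.

```python
# Hedged sketch — checkpoint name and tokenizer choice are assumptions
# based on this PR's commits, not guaranteed by the diff above.
from transformers import GPT2Tokenizer, FlaxOPTForCausalLM

tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-350m")
model = FlaxOPTForCausalLM.from_pretrained("facebook/opt-350m")

inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
outputs = model(**inputs)
print(outputs.logits.shape)  # (batch_size, sequence_length, vocab_size)
```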
11 changes: 11 additions & 0 deletions src/transformers/__init__.py
@@ -2147,6 +2147,7 @@
"TFOpenAIGPTPreTrainedModel",
]
)
_import_structure["models.opt"].extend(["TFOPTModel", "TFOPTPretrainedModel"])
_import_structure["models.pegasus"].extend(
["TFPegasusForConditionalGeneration", "TFPegasusModel", "TFPegasusPreTrainedModel"]
)
@@ -2485,6 +2486,14 @@
]
)
_import_structure["models.mt5"].extend(["FlaxMT5ForConditionalGeneration", "FlaxMT5Model"])
_import_structure["models.opt"].extend(
[
"FlaxOPTDecoderPreTrainedModel",
"FlaxOPTForCausalLM",
"FlaxOPTModel",
"FlaxOPTPreTrainedModel",
]
)
_import_structure["models.pegasus"].extend(
[
"FlaxPegasusForConditionalGeneration",
@@ -4319,6 +4328,7 @@
TFOpenAIGPTModel,
TFOpenAIGPTPreTrainedModel,
)
from .models.opt import TFOPTModel, TFOPTPretrainedModel
from .models.pegasus import TFPegasusForConditionalGeneration, TFPegasusModel, TFPegasusPreTrainedModel
from .models.rag import TFRagModel, TFRagPreTrainedModel, TFRagSequenceForGeneration, TFRagTokenForGeneration
from .models.rembert import (
@@ -4581,6 +4591,7 @@
FlaxMBartPreTrainedModel,
)
from .models.mt5 import FlaxMT5ForConditionalGeneration, FlaxMT5Model
from .models.opt import FlaxOPTDecoderPreTrainedModel, FlaxOPTForCausalLM, FlaxOPTModel, FlaxOPTPreTrainedModel
from .models.pegasus import FlaxPegasusForConditionalGeneration, FlaxPegasusModel, FlaxPegasusPreTrainedModel
from .models.roberta import (
FlaxRobertaForCausalLM,
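
With these `_import_structure` entries and their `TYPE_CHECKING` counterparts in place, the new classes become importable from the top-level package. A quick sanity check (assuming TensorFlow and Flax are installed):

```python
# These are exactly the names registered above; nothing else is assumed.
from transformers import TFOPTModel, FlaxOPTForCausalLM, FlaxOPTModel
```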
32 changes: 32 additions & 0 deletions src/transformers/modeling_flax_outputs.py
@@ -302,6 +302,38 @@ class FlaxCausalLMOutputWithCrossAttentions(ModelOutput):
attentions: Optional[Tuple[jnp.ndarray]] = None
cross_attentions: Optional[Tuple[jnp.ndarray]] = None

@flax.struct.dataclass
class FlaxCausalLMOutputWithPast(ModelOutput):
"""
Base class for causal language model (or autoregressive) outputs.

Args:
logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `jnp.ndarray` tuples of length `config.n_layers`, with each tuple containing the cached key, value
states of the self-attention and the cross-attention layers if the model is used in an encoder-decoder setting.
Only relevant if `config.is_decoder = True`.

Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
"""

logits: jnp.ndarray = None
past_key_values: Optional[Tuple[Tuple[jnp.ndarray]]] = None
hidden_states: Optional[Tuple[jnp.ndarray]] = None
attentions: Optional[Tuple[jnp.ndarray]] = None

@flax.struct.dataclass
class FlaxMaskedLMOutput(ModelOutput):
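
A minimal sketch of the new output class in isolation (constructed directly here for illustration; in practice a model's forward pass returns it). The shapes are arbitrary examples:

```python
import jax.numpy as jnp

from transformers.modeling_flax_outputs import FlaxCausalLMOutputWithPast

# Direct construction for illustration only — real models fill these fields.
out = FlaxCausalLMOutputWithPast(
    logits=jnp.zeros((1, 5, 50272)),  # (batch_size, sequence_length, vocab_size)
    past_key_values=None,  # populated when use_cache=True
)
print(out.logits.shape)  # (1, 5, 50272)
```

Because the class is a `flax.struct.dataclass`, instances are immutable pytrees, so they pass through `jax.jit`-compiled functions unchanged.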
2 changes: 2 additions & 0 deletions src/transformers/models/auto/modeling_flax_auto.py
@@ -40,6 +40,7 @@
("beit", "FlaxBeitModel"),
("big_bird", "FlaxBigBirdModel"),
("bart", "FlaxBartModel"),
("opt", "FlaxOPTModel"),
("gpt2", "FlaxGPT2Model"),
("gpt_neo", "FlaxGPTNeoModel"),
("gptj", "FlaxGPTJModel"),
@@ -127,6 +128,7 @@
("gptj", "FlaxGPTJForCausalLM"),
("xglm", "FlaxXGLMForCausalLM"),
("bart", "FlaxBartForCausalLM"),
("opt", "FlaxOPTForCausalLM"),
("bert", "FlaxBertForCausalLM"),
("roberta", "FlaxRobertaForCausalLM"),
("big_bird", "FlaxBigBirdForCausalLM"),
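
These mapping entries are what let the auto classes dispatch an OPT config to the new Flax implementation. A hedged sketch (the checkpoint name `facebook/opt-125m` is assumed from the commit messages):

```python
from transformers import FlaxAutoModelForCausalLM

# The ("opt", "FlaxOPTForCausalLM") entry above routes OPT checkpoints here.
model = FlaxAutoModelForCausalLM.from_pretrained("facebook/opt-125m")
print(type(model).__name__)  # FlaxOPTForCausalLM
```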
1 change: 1 addition & 0 deletions src/transformers/models/auto/modeling_tf_auto.py
@@ -45,6 +45,7 @@
("distilbert", "TFDistilBertModel"),
("albert", "TFAlbertModel"),
("bart", "TFBartModel"),
("opt", "TFOPTModel"),
("camembert", "TFCamembertModel"),
("xlm-roberta", "TFXLMRobertaModel"),
("longformer", "TFLongformerModel"),
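
The TF mapping works the same way; again a sketch with an assumed checkpoint name:

```python
from transformers import TFAutoModel

# The ("opt", "TFOPTModel") entry above routes OPT checkpoints here.
model = TFAutoModel.from_pretrained("facebook/opt-350m")
print(type(model).__name__)  # TFOPTModel
```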
23 changes: 22 additions & 1 deletion src/transformers/models/opt/__init__.py
@@ -17,7 +17,7 @@
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import _LazyModule, is_tokenizers_available, is_torch_available
from ...utils import _LazyModule, is_tokenizers_available, is_torch_available, is_tf_available, is_flax_available


_import_structure = {
@@ -33,13 +33,34 @@
"OPTPreTrainedModel",
]

if is_tf_available():
_import_structure["modeling_tf_opt"] = ["TFOPTModel", "TFOPTPretrainedModel"]

if is_flax_available():
_import_structure["modeling_flax_opt"] = [
"FlaxOPTDecoderPreTrainedModel",
"FlaxOPTForCausalLM",
"FlaxOPTModel",
"FlaxOPTPreTrainedModel",
]

if TYPE_CHECKING:
from .configuration_opt import OPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OPTConfig

if is_torch_available():
from .modeling_opt import OPT_PRETRAINED_MODEL_ARCHIVE_LIST, OPTForCausalLM, OPTModel, OPTPreTrainedModel

if is_tf_available():
from .modeling_tf_opt import TFOPTModel, TFOPTPretrainedModel

if is_flax_available():
from .modeling_flax_opt import (
FlaxOPTDecoderPreTrainedModel,
FlaxOPTForCausalLM,
FlaxOPTModel,
FlaxOPTPreTrainedModel,
)

else:
import sys

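
The `_import_structure` / `TYPE_CHECKING` split above is the library's lazy-import pattern: submodules (and their heavy TF/Flax dependencies) are imported only when one of their names is first accessed. A simplified, runnable reduction of the idea — not the actual `_LazyModule` implementation:

```python
import importlib


class LazyModule:
    """Illustrative reduction of transformers.utils._LazyModule."""

    def __init__(self, import_structure):
        # Invert {submodule: [names]} into {name: submodule}.
        self._class_to_module = {
            name: mod for mod, names in import_structure.items() for name in names
        }

    def __getattr__(self, attr):
        # Called only when normal attribute lookup fails, i.e. on first access.
        if attr not in self._class_to_module:
            raise AttributeError(attr)
        module = importlib.import_module(self._class_to_module[attr])
        return getattr(module, attr)


# Demo with a stdlib module standing in for e.g. "modeling_flax_opt":
lazy = LazyModule({"json": ["JSONDecoder"]})
print(lazy.JSONDecoder)  # `import json` happens only on this line
```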