diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx
index 17c04376780a..ed04cad3dd9b 100644
--- a/docs/source/en/index.mdx
+++ b/docs/source/en/index.mdx
@@ -38,7 +38,7 @@ The documentation is organized in five parts:
- **GET STARTED** contains a quick tour and installation instructions to get up and running with 🤗 Transformers.
- **TUTORIALS** are a great place to begin if you are new to our library. This section will help you gain the basic skills you need to start using 🤗 Transformers.
- **HOW-TO GUIDES** will show you how to achieve a specific goal like fine-tuning a pretrained model for language modeling or how to create a custom model head.
-- **CONCEPTUAL GUIDES** provides more discussion and explanation of the underlying concepts and ideas behind models, tasks, and the design philosophy of 🤗 Transformers.
+- **CONCEPTUAL GUIDES** provides more discussion and explanation of the underlying concepts and ideas behind models, tasks, and the design philosophy of 🤗 Transformers.
- **API** describes each class and function, grouped in:
- **MAIN CLASSES** for the main classes exposing the important APIs of the library.
@@ -245,7 +245,7 @@ Flax), PyTorch, and/or TensorFlow.
| ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ |
| LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ |
| LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ |
-| LayoutLMv3 | ✅ | ✅ | ✅ | ❌ | ❌ |
+| LayoutLMv3 | ✅ | ✅ | ✅ | ✅ | ❌ |
| LED | ✅ | ✅ | ✅ | ✅ | ❌ |
| LeViT | ❌ | ❌ | ✅ | ❌ | ❌ |
| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ |
diff --git a/docs/source/en/model_doc/layoutlmv3.mdx b/docs/source/en/model_doc/layoutlmv3.mdx
index 8f115cf96ea5..37fb0dc30446 100644
--- a/docs/source/en/model_doc/layoutlmv3.mdx
+++ b/docs/source/en/model_doc/layoutlmv3.mdx
@@ -26,18 +26,18 @@ Tips:
- In terms of data processing, LayoutLMv3 is identical to its predecessor [LayoutLMv2](layoutlmv2), except that:
- images need to be resized and normalized with channels in regular RGB format. LayoutLMv2 on the other hand normalizes the images internally and expects the channels in BGR format.
- - text is tokenized using byte-pair encoding (BPE), as opposed to WordPiece.
+ - text is tokenized using byte-pair encoding (BPE), as opposed to WordPiece.
Due to these differences in data preprocessing, one can use [`LayoutLMv3Processor`] which internally combines a [`LayoutLMv3FeatureExtractor`] (for the image modality) and a [`LayoutLMv3Tokenizer`]/[`LayoutLMv3TokenizerFast`] (for the text modality) to prepare all data for the model.
-- Regarding usage of [`LayoutLMv3Processor`], we refer to the [usage guide](layoutlmv2#usage-layoutlmv2processor) of its predecessor.
+- Regarding usage of [`LayoutLMv3Processor`], we refer to the [usage guide](layoutlmv2#usage-layoutlmv2processor) of its predecessor.
- Demo notebooks for LayoutLMv3 can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/LayoutLMv3).
- Demo scripts can be found [here](https://github.com/huggingface/transformers/tree/main/examples/research_projects/layoutlmv3).
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/layoutlmv3_architecture.png"
alt="drawing" width="600"/>

<small> LayoutLMv3 architecture. Taken from the <a href="https://arxiv.org/abs/2204.08387">original paper</a>. </small>
-This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/microsoft/unilm/tree/master/layoutlmv3).
+This model was contributed by [nielsr](https://huggingface.co/nielsr). The TensorFlow version of this model was added by [chriskoo](https://huggingface.co/chriskoo), [tokec](https://huggingface.co/tokec), and [lre](https://huggingface.co/lre). The original code can be found [here](https://github.com/microsoft/unilm/tree/master/layoutlmv3).
## LayoutLMv3Config
@@ -84,3 +84,23 @@ This model was contributed by [nielsr](https://huggingface.co/nielsr). The origi
[[autodoc]] LayoutLMv3ForQuestionAnswering
- forward
+
+## TFLayoutLMv3Model
+
+[[autodoc]] TFLayoutLMv3Model
+ - call
+
+## TFLayoutLMv3ForSequenceClassification
+
+[[autodoc]] TFLayoutLMv3ForSequenceClassification
+ - call
+
+## TFLayoutLMv3ForTokenClassification
+
+[[autodoc]] TFLayoutLMv3ForTokenClassification
+ - call
+
+## TFLayoutLMv3ForQuestionAnswering
+
+[[autodoc]] TFLayoutLMv3ForQuestionAnswering
+ - call
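To show what the documented TF classes look like in use, here is a minimal sketch for token classification; the checkpoint is real, but the image path, words, boxes, and `num_labels` are illustrative placeholders, not part of this PR.

```python
from PIL import Image
from transformers import AutoProcessor, TFLayoutLMv3ForTokenClassification

processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
model = TFLayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7)

image = Image.open("form.png").convert("RGB")  # hypothetical document image
words = ["Invoice", "Total"]                   # hypothetical OCR output
boxes = [[10, 10, 120, 40], [10, 50, 90, 80]]  # boxes on a 0-1000 normalized grid

encoding = processor(image, words, boxes=boxes, return_tensors="tf")
outputs = model(**encoding)
print(outputs.logits.shape)  # (batch_size, sequence_length, num_labels)
```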
diff --git a/docs/source/it/index.mdx b/docs/source/it/index.mdx
index d5e10b7c4983..3ee8da15ed2d 100644
--- a/docs/source/it/index.mdx
+++ b/docs/source/it/index.mdx
@@ -221,7 +221,7 @@ tokenizer (chiamato "slow"). Un tokenizer "fast" supportato dalla libreria 🤗
| ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ |
| LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ |
| LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ |
-| LayoutLMv3 | ✅ | ✅ | ✅ | ❌ | ❌ |
+| LayoutLMv3 | ✅ | ✅ | ✅ | ✅ | ❌ |
| LED | ✅ | ✅ | ✅ | ✅ | ❌ |
| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ |
| LUKE | ✅ | ❌ | ✅ | ❌ | ❌ |
@@ -288,4 +288,4 @@ tokenizer (chiamato "slow"). Un tokenizer "fast" supportato dalla libreria 🤗
| YOLOS | ❌ | ❌ | ✅ | ❌ | ❌ |
| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ |
-
\ No newline at end of file
+
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 3281d266a2f3..bb64fe9295da 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -2343,6 +2343,16 @@
"TFLayoutLMPreTrainedModel",
]
)
+ _import_structure["models.layoutlmv3"].extend(
+ [
+ "TF_LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST",
+ "TFLayoutLMv3ForQuestionAnswering",
+ "TFLayoutLMv3ForSequenceClassification",
+ "TFLayoutLMv3ForTokenClassification",
+ "TFLayoutLMv3Model",
+ "TFLayoutLMv3PreTrainedModel",
+ ]
+ )
_import_structure["models.led"].extend(["TFLEDForConditionalGeneration", "TFLEDModel", "TFLEDPreTrainedModel"])
_import_structure["models.longformer"].extend(
[
@@ -4801,6 +4811,14 @@
TFHubertModel,
TFHubertPreTrainedModel,
)
+ from .models.layoutlmv3 import (
+ TF_LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST,
+ TFLayoutLMv3ForQuestionAnswering,
+ TFLayoutLMv3ForSequenceClassification,
+ TFLayoutLMv3ForTokenClassification,
+ TFLayoutLMv3Model,
+ TFLayoutLMv3PreTrainedModel,
+ )
from .models.led import TFLEDForConditionalGeneration, TFLEDModel, TFLEDPreTrainedModel
from .models.longformer import (
TF_LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
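For context on the lazy-import plumbing above: when TensorFlow is missing, `transformers` exposes dummy placeholders that only raise on use. A small guard sketch (the branching logic is illustrative):

```python
from transformers.utils import is_tf_available

if is_tf_available():
    # Resolved lazily through the _import_structure entry added above.
    from transformers import TFLayoutLMv3Model

    model = TFLayoutLMv3Model.from_pretrained("microsoft/layoutlmv3-base")
else:
    # Without TensorFlow, the name still imports as a dummy object, but
    # using it raises an error naming the missing backend.
    model = None
```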
diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py
index 359e1f05c47b..991bb79a6b37 100644
--- a/src/transformers/models/auto/modeling_tf_auto.py
+++ b/src/transformers/models/auto/modeling_tf_auto.py
@@ -52,6 +52,7 @@
("gptj", "TFGPTJModel"),
("hubert", "TFHubertModel"),
("layoutlm", "TFLayoutLMModel"),
+ ("layoutlmv3", "TFLayoutLMv3Model"),
("led", "TFLEDModel"),
("longformer", "TFLongformerModel"),
("lxmert", "TFLxmertModel"),
@@ -268,6 +269,7 @@
("gpt2", "TFGPT2ForSequenceClassification"),
("gptj", "TFGPTJForSequenceClassification"),
("layoutlm", "TFLayoutLMForSequenceClassification"),
+ ("layoutlmv3", "TFLayoutLMv3ForSequenceClassification"),
("longformer", "TFLongformerForSequenceClassification"),
("mobilebert", "TFMobileBertForSequenceClassification"),
("mpnet", "TFMPNetForSequenceClassification"),
@@ -297,6 +299,7 @@
("flaubert", "TFFlaubertForQuestionAnsweringSimple"),
("funnel", "TFFunnelForQuestionAnswering"),
("gptj", "TFGPTJForQuestionAnswering"),
+ ("layoutlmv3", "TFLayoutLMv3ForQuestionAnswering"),
("longformer", "TFLongformerForQuestionAnswering"),
("mobilebert", "TFMobileBertForQuestionAnswering"),
("mpnet", "TFMPNetForQuestionAnswering"),
@@ -316,7 +319,6 @@
]
)
-
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
[
# Model for Token Classification mapping
@@ -331,6 +333,7 @@
("flaubert", "TFFlaubertForTokenClassification"),
("funnel", "TFFunnelForTokenClassification"),
("layoutlm", "TFLayoutLMForTokenClassification"),
+ ("layoutlmv3", "TFLayoutLMv3ForTokenClassification"),
("longformer", "TFLongformerForTokenClassification"),
("mobilebert", "TFMobileBertForTokenClassification"),
("mpnet", "TFMPNetForTokenClassification"),
@@ -373,7 +376,6 @@
]
)
-
TF_MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_MAPPING_NAMES)
TF_MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_FOR_PRETRAINING_MAPPING_NAMES)
TF_MODEL_WITH_LM_HEAD_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_WITH_LM_HEAD_MAPPING_NAMES)
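With these mapping entries in place, the TF auto classes dispatch on `config.model_type`. A minimal sketch of the lookup (building from config, so no weights are downloaded):

```python
from transformers import AutoConfig, TFAutoModel

config = AutoConfig.from_pretrained("microsoft/layoutlmv3-base")
print(config.model_type)  # "layoutlmv3"

# TF_MODEL_MAPPING maps the config's model_type to the class name registered above.
model = TFAutoModel.from_config(config)
print(type(model).__name__)  # "TFLayoutLMv3Model"
```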
diff --git a/src/transformers/models/layoutlmv3/__init__.py b/src/transformers/models/layoutlmv3/__init__.py
index cfa26057e87b..68a07362dc41 100644
--- a/src/transformers/models/layoutlmv3/__init__.py
+++ b/src/transformers/models/layoutlmv3/__init__.py
@@ -21,6 +21,7 @@
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
+ is_tf_available,
is_tokenizers_available,
is_torch_available,
is_vision_available,
@@ -60,6 +61,21 @@
"LayoutLMv3PreTrainedModel",
]
+try:
+ if not is_tf_available():
+ raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+ pass
+else:
+ _import_structure["modeling_tf_layoutlmv3"] = [
+ "TF_LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST",
+ "TFLayoutLMv3ForQuestionAnswering",
+ "TFLayoutLMv3ForSequenceClassification",
+ "TFLayoutLMv3ForTokenClassification",
+ "TFLayoutLMv3Model",
+ "TFLayoutLMv3PreTrainedModel",
+ ]
+
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
@@ -101,6 +117,21 @@
LayoutLMv3PreTrainedModel,
)
+ try:
+ if not is_tf_available():
+ raise OptionalDependencyNotAvailable()
+ except OptionalDependencyNotAvailable:
+ pass
+ else:
+ from .modeling_tf_layoutlmv3 import (
+ TF_LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST,
+ TFLayoutLMv3ForQuestionAnswering,
+ TFLayoutLMv3ForSequenceClassification,
+ TFLayoutLMv3ForTokenClassification,
+ TFLayoutLMv3Model,
+ TFLayoutLMv3PreTrainedModel,
+ )
+
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
diff --git a/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py b/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py
new file mode 100644
index 000000000000..85a44e4ff52a
--- /dev/null
+++ b/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py
@@ -0,0 +1,1610 @@
+# coding=utf-8
+# Copyright 2022 Microsoft Research and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF 2.0 LayoutLMv3 model."""
+
+import collections
+import math
+from typing import Dict, List, Optional, Tuple, Union
+
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...modeling_tf_outputs import (
+ TFBaseModelOutput,
+ TFQuestionAnsweringModelOutput,
+ TFSequenceClassifierOutput,
+ TFTokenClassifierOutput,
+)
+from ...modeling_tf_utils import (
+ TFPreTrainedModel,
+ TFQuestionAnsweringLoss,
+ TFSequenceClassificationLoss,
+ TFTokenClassificationLoss,
+ get_initializer,
+ keras_serializable,
+ unpack_inputs,
+)
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
+from .configuration_layoutlmv3 import LayoutLMv3Config
+
+
+_CONFIG_FOR_DOC = "LayoutLMv3Config"
+
+_DUMMY_INPUT_IDS = [
+ [7, 6, 1],
+ [1, 2, 0],
+]
+
+_DUMMY_BBOX = [
+ [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
+ [[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]],
+]
+
+TF_LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST = [
+ "microsoft/layoutlmv3-base",
+ "microsoft/layoutlmv3-large",
+ # See all LayoutLMv3 models at https://huggingface.co/models?filter=layoutlmv3
+]
+
+LARGE_NEGATIVE = -1e8
+
+
+class TFLayoutLMv3PatchEmbeddings(tf.keras.layers.Layer):
+ """LayoutLMv3 image (patch) embeddings."""
+
+ def __init__(self, config: LayoutLMv3Config, **kwargs):
+ super().__init__(**kwargs)
+ patch_sizes = (
+ config.patch_size
+ if isinstance(config.patch_size, collections.abc.Iterable)
+ else (config.patch_size, config.patch_size)
+ )
+ self.proj = tf.keras.layers.Conv2D(
+ filters=config.hidden_size,
+ kernel_size=patch_sizes,
+ strides=patch_sizes,
+ padding="valid",
+ data_format="channels_last",
+ use_bias=True,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="proj",
+ )
+ self.hidden_size = config.hidden_size
+ self.num_patches = (config.input_size**2) // (patch_sizes[0] * patch_sizes[1])
+
+ def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
+ # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format.
+ # So change the input format from `NCHW` to `NHWC`.
+ pixel_values = tf.transpose(pixel_values, perm=[0, 2, 3, 1])
+
+ embeddings = self.proj(pixel_values)
+ embeddings = tf.reshape(embeddings, (-1, self.num_patches, self.hidden_size))
+ return embeddings
+
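A worked check of the patch arithmetic in `TFLayoutLMv3PatchEmbeddings`, assuming the default config values (`input_size=224`, `patch_size=16`, `hidden_size=768`); this is an illustration, not part of the model file:

```python
# Default LayoutLMv3Config values (an assumption of this sketch).
input_size, patch_size, hidden_size = 224, 16, 768

num_patches = (input_size**2) // (patch_size * patch_size)
print(num_patches)  # 196, i.e. (224 // 16) ** 2

# So a (batch, 3, 224, 224) pixel_values tensor leaves call() with
# shape (batch, 196, 768).
```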
+
+class TFLayoutLMv3TextEmbeddings(tf.keras.layers.Layer):
+ """
+ LayoutLMv3 text embeddings. Same as `RobertaEmbeddings` but with added spatial (layout) embeddings.
+ """
+
+ def __init__(self, config: LayoutLMv3Config, **kwargs):
+ super().__init__(**kwargs)
+ self.word_embeddings = tf.keras.layers.Embedding(
+ config.vocab_size,
+ config.hidden_size,
+ embeddings_initializer=get_initializer(config.initializer_range),
+ name="word_embeddings",
+ )
+ self.token_type_embeddings = tf.keras.layers.Embedding(
+ config.type_vocab_size,
+ config.hidden_size,
+ embeddings_initializer=get_initializer(config.initializer_range),
+ name="token_type_embeddings",
+ )
+ self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+ self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+ self.padding_token_index = config.pad_token_id
+ self.position_embeddings = tf.keras.layers.Embedding(
+ config.max_position_embeddings,
+ config.hidden_size,
+ embeddings_initializer=get_initializer(config.initializer_range),
+ name="position_embeddings",
+ )
+ self.x_position_embeddings = tf.keras.layers.Embedding(
+ config.max_2d_position_embeddings,
+ config.coordinate_size,
+ embeddings_initializer=get_initializer(config.initializer_range),
+ name="x_position_embeddings",
+ )
+ self.y_position_embeddings = tf.keras.layers.Embedding(
+ config.max_2d_position_embeddings,
+ config.coordinate_size,
+ embeddings_initializer=get_initializer(config.initializer_range),
+ name="y_position_embeddings",
+ )
+ self.h_position_embeddings = tf.keras.layers.Embedding(
+ config.max_2d_position_embeddings,
+ config.shape_size,
+ embeddings_initializer=get_initializer(config.initializer_range),
+ name="h_position_embeddings",
+ )
+ self.w_position_embeddings = tf.keras.layers.Embedding(
+ config.max_2d_position_embeddings,
+ config.shape_size,
+ embeddings_initializer=get_initializer(config.initializer_range),
+ name="w_position_embeddings",
+ )
+ self.max_2d_positions = config.max_2d_position_embeddings
+
+ def calculate_spatial_position_embeddings(self, bbox: tf.Tensor) -> tf.Tensor:
+ try:
+ left_position_ids = bbox[:, :, 0]
+ upper_position_ids = bbox[:, :, 1]
+ right_position_ids = bbox[:, :, 2]
+ lower_position_ids = bbox[:, :, 3]
+ except IndexError as exception:
+ raise IndexError("Bounding box is not of shape (batch_size, seq_length, 4).") from exception
+
+ try:
+ left_position_embeddings = self.x_position_embeddings(left_position_ids)
+ upper_position_embeddings = self.y_position_embeddings(upper_position_ids)
+ right_position_embeddings = self.x_position_embeddings(right_position_ids)
+ lower_position_embeddings = self.y_position_embeddings(lower_position_ids)
+ except IndexError as exception:
+ raise IndexError(
+ f"The `bbox` coordinate values should be within 0-{self.max_2d_positions} range."
+ ) from exception
+
+ max_position_id = self.max_2d_positions - 1
+ h_position_embeddings = self.h_position_embeddings(
+ tf.clip_by_value(bbox[:, :, 3] - bbox[:, :, 1], 0, max_position_id)
+ )
+ w_position_embeddings = self.w_position_embeddings(
+ tf.clip_by_value(bbox[:, :, 2] - bbox[:, :, 0], 0, max_position_id)
+ )
+
+ # LayoutLMv1 sums the spatial embeddings, but LayoutLMv3 concatenates them.
+ spatial_position_embeddings = tf.concat(
+ [
+ left_position_embeddings,
+ upper_position_embeddings,
+ right_position_embeddings,
+ lower_position_embeddings,
+ h_position_embeddings,
+ w_position_embeddings,
+ ],
+ axis=-1,
+ )
+ return spatial_position_embeddings
+
+ def create_position_ids_from_inputs_embeds(self, inputs_embeds: tf.Tensor) -> tf.Tensor:
+ """
+ We are provided embeddings directly. We cannot infer which are padded, so just generate sequential position
+ ids.
+ """
+ input_shape = tf.shape(inputs_embeds)
+ sequence_length = input_shape[1]
+ start_index = self.padding_token_index + 1
+ end_index = self.padding_token_index + sequence_length + 1
+ position_ids = tf.range(start_index, end_index, dtype=tf.int32)
+ batch_size = input_shape[0]
+ position_ids = tf.reshape(position_ids, (1, sequence_length))
+ position_ids = tf.tile(position_ids, (batch_size, 1))
+ return position_ids
+
+ def create_position_ids_from_input_ids(self, input_ids: tf.Tensor) -> tf.Tensor:
+ """
+ Replace non-padding symbols with their position numbers. Position numbers begin at padding_token_index + 1.
+ """
+ mask = tf.cast(tf.not_equal(input_ids, self.padding_token_index), input_ids.dtype)
+ position_ids = tf.cumsum(mask, axis=1) * mask
+ position_ids = position_ids + self.padding_token_index
+ return position_ids
+
+ def create_position_ids(self, input_ids: tf.Tensor, inputs_embeds: tf.Tensor) -> tf.Tensor:
+ if input_ids is None:
+ return self.create_position_ids_from_inputs_embeds(inputs_embeds)
+ else:
+ return self.create_position_ids_from_input_ids(input_ids)
+
+ def call(
+ self,
+ input_ids: Optional[tf.Tensor] = None,
+ bbox: tf.Tensor = None,
+ token_type_ids: Optional[tf.Tensor] = None,
+ position_ids: Optional[tf.Tensor] = None,
+ inputs_embeds: Optional[tf.Tensor] = None,
+ training: bool = False,
+ ) -> tf.Tensor:
+ if position_ids is None:
+ position_ids = self.create_position_ids(input_ids, inputs_embeds)
+
+ if input_ids is not None:
+ input_shape = tf.shape(input_ids)
+ else:
+ input_shape = tf.shape(inputs_embeds)[:-1]
+
+ if token_type_ids is None:
+ token_type_ids = tf.zeros(input_shape, dtype=position_ids.dtype)
+
+ if inputs_embeds is None:
+ inputs_embeds = self.word_embeddings(input_ids)
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+ embeddings = inputs_embeds + token_type_embeddings
+ position_embeddings = self.position_embeddings(position_ids)
+ embeddings += position_embeddings
+
+ spatial_position_embeddings = self.calculate_spatial_position_embeddings(bbox)
+
+ embeddings += spatial_position_embeddings
+
+ embeddings = self.LayerNorm(embeddings)
+ embeddings = self.dropout(embeddings, training=training)
+ return embeddings
+
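A standalone illustration of the padding-aware position ids produced by `create_position_ids_from_input_ids` above, assuming `pad_token_id=1` (the RoBERTa-style default LayoutLMv3 inherits):

```python
import tensorflow as tf

padding_token_index = 1  # assumed pad_token_id
input_ids = tf.constant([[5, 6, 7, 1, 1]])  # trailing 1s are <pad>

mask = tf.cast(tf.not_equal(input_ids, padding_token_index), input_ids.dtype)
position_ids = tf.cumsum(mask, axis=1) * mask + padding_token_index
print(position_ids.numpy())  # [[2 3 4 1 1]] -- real tokens count up, pads stay put
```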
+
+class TFLayoutLMv3SelfAttention(tf.keras.layers.Layer):
+ def __init__(self, config: LayoutLMv3Config, **kwargs):
+ super().__init__(**kwargs)
+ if config.hidden_size % config.num_attention_heads != 0:
+ raise ValueError(
+ f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+ f"heads ({config.num_attention_heads})"
+ )
+
+ self.num_attention_heads = config.num_attention_heads
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+ self.attention_score_normaliser = math.sqrt(self.attention_head_size)
+
+ self.query = tf.keras.layers.Dense(
+ self.all_head_size,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="query",
+ )
+ self.key = tf.keras.layers.Dense(
+ self.all_head_size,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="key",
+ )
+ self.value = tf.keras.layers.Dense(
+ self.all_head_size,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="value",
+ )
+
+ self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
+ self.has_relative_attention_bias = config.has_relative_attention_bias
+ self.has_spatial_attention_bias = config.has_spatial_attention_bias
+
+ def transpose_for_scores(self, x: tf.Tensor):
+ shape = tf.shape(x)
+ new_shape = (
+ shape[0], # batch_size
+ shape[1], # seq_length
+ self.num_attention_heads,
+ self.attention_head_size,
+ )
+ x = tf.reshape(x, new_shape)
+ return tf.transpose(x, perm=[0, 2, 1, 3]) # batch_size, num_heads, seq_length, attention_head_size
+
+ def cogview_attention(self, attention_scores: tf.Tensor, alpha: Union[float, int] = 32):
+ """
+ https://arxiv.org/abs/2105.13290 Section 2.4 Stabilization of training: Precision Bottleneck Relaxation
+ (PB-Relax). A drop-in replacement for the original tf.keras.layers.Softmax(axis=-1)(attention_scores). The new
+ attention_probs are slightly slower to compute and introduce a small numerical bias. The two can be compared with
+ tf.debugging.assert_near(standard_attention_probs, cogview_attention_probs, atol=1e-08); the smaller the atol
+ (e.g., 1e-08), the closer the match.
+ """
+ scaled_attention_scores = attention_scores / alpha
+ max_value = tf.expand_dims(tf.reduce_max(scaled_attention_scores, axis=-1), axis=-1)
+ new_attention_scores = (scaled_attention_scores - max_value) * alpha
+ return tf.math.softmax(new_attention_scores, axis=-1)
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ attention_mask: Optional[tf.Tensor],
+ head_mask: Optional[tf.Tensor],
+ output_attentions: bool,
+ rel_pos: Optional[tf.Tensor] = None,
+ rel_2d_pos: Optional[tf.Tensor] = None,
+ training: bool = False,
+ ) -> Union[Tuple[tf.Tensor], Tuple[tf.Tensor, tf.Tensor]]:
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+ query_layer = self.transpose_for_scores(self.query(hidden_states))
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ normalised_query_layer = query_layer / self.attention_score_normaliser
+ transposed_key_layer = tf.transpose(
+ key_layer, perm=[0, 1, 3, 2]
+ ) # batch_size, num_heads, attention_head_size, seq_length
+ attention_scores = tf.matmul(normalised_query_layer, transposed_key_layer)
+
+ if self.has_relative_attention_bias and self.has_spatial_attention_bias:
+ attention_scores += (rel_pos + rel_2d_pos) / self.attention_score_normaliser
+ elif self.has_relative_attention_bias:
+ attention_scores += rel_pos / self.attention_score_normaliser
+
+ if attention_mask is not None:
+ # Apply the attention mask (is precomputed for all layers in TFLayoutLMv3Model call() function)
+ attention_scores += attention_mask
+
+ # Normalize the attention scores to probabilities.
+ # Use the trick of CogView paper to stabilize training.
+ attention_probs = self.cogview_attention(attention_scores)
+
+ attention_probs = self.dropout(attention_probs, training=training)
+
+ # Mask heads if we want to.
+ if head_mask is not None:
+ attention_probs = attention_probs * head_mask
+
+ context_layer = tf.matmul(attention_probs, value_layer)
+ context_layer = tf.transpose(
+ context_layer, perm=[0, 2, 1, 3]
+ ) # batch_size, seq_length, num_heads, attention_head_size
+ shape = tf.shape(context_layer)
+ context_layer = tf.reshape(
+ context_layer, (shape[0], shape[1], self.all_head_size)
+ ) # batch_size, seq_length, num_heads * attention_head_size
+
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+ return outputs
+
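The PB-Relax trick in `cogview_attention` is mathematically a no-op: softmax is invariant to subtracting a per-row constant, and `(scores / alpha - max) * alpha` only removes `alpha * max` from each row. A quick numerical check along the lines the docstring suggests:

```python
import tensorflow as tf

scores = tf.random.normal((2, 4, 8, 8))  # batch, heads, seq, seq
alpha = 32.0

scaled = scores / alpha
max_value = tf.expand_dims(tf.reduce_max(scaled, axis=-1), axis=-1)
cogview_probs = tf.math.softmax((scaled - max_value) * alpha, axis=-1)

standard_probs = tf.math.softmax(scores, axis=-1)
tf.debugging.assert_near(standard_probs, cogview_probs, atol=1e-08)
```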
+
+# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaSelfOutput
+class TFLayoutLMv3SelfOutput(tf.keras.layers.Layer):
+ def __init__(self, config: LayoutLMv3Config, **kwargs):
+ super().__init__(**kwargs)
+
+ self.dense = tf.keras.layers.Dense(
+ units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+ )
+ self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+ self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+
+ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
+ hidden_states = self.dense(inputs=hidden_states)
+ hidden_states = self.dropout(inputs=hidden_states, training=training)
+ hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
+
+ return hidden_states
+
+
+class TFLayoutLMv3Attention(tf.keras.layers.Layer):
+ def __init__(self, config: LayoutLMv3Config, **kwargs):
+ super().__init__(**kwargs)
+ self.self_attention = TFLayoutLMv3SelfAttention(config, name="self")
+ self.self_output = TFLayoutLMv3SelfOutput(config, name="output")
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ attention_mask: Optional[tf.Tensor],
+ head_mask: Optional[tf.Tensor],
+ output_attentions: bool,
+ rel_pos: Optional[tf.Tensor] = None,
+ rel_2d_pos: Optional[tf.Tensor] = None,
+ training: bool = False,
+ ) -> Union[Tuple[tf.Tensor], Tuple[tf.Tensor, tf.Tensor]]:
+ self_outputs = self.self_attention(
+ hidden_states,
+ attention_mask,
+ head_mask,
+ output_attentions,
+ rel_pos,
+ rel_2d_pos,
+ training=training,
+ )
+ attention_output = self.self_output(self_outputs[0], hidden_states, training=training)
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
+ return outputs
+
+
+# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaIntermediate
+class TFLayoutLMv3Intermediate(tf.keras.layers.Layer):
+ def __init__(self, config: LayoutLMv3Config, **kwargs):
+ super().__init__(**kwargs)
+
+ self.dense = tf.keras.layers.Dense(
+ units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+ )
+
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = get_tf_activation(config.hidden_act)
+ else:
+ self.intermediate_act_fn = config.hidden_act
+
+ def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+ hidden_states = self.dense(inputs=hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+
+ return hidden_states
+
+
+# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaOutput
+class TFLayoutLMv3Output(tf.keras.layers.Layer):
+ def __init__(self, config: LayoutLMv3Config, **kwargs):
+ super().__init__(**kwargs)
+
+ self.dense = tf.keras.layers.Dense(
+ units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+ )
+ self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+ self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+
+ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
+ hidden_states = self.dense(inputs=hidden_states)
+ hidden_states = self.dropout(inputs=hidden_states, training=training)
+ hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
+
+ return hidden_states
+
+
+class TFLayoutLMv3Layer(tf.keras.layers.Layer):
+ def __init__(self, config: LayoutLMv3Config, **kwargs):
+ super().__init__(**kwargs)
+ self.attention = TFLayoutLMv3Attention(config, name="attention")
+ self.intermediate = TFLayoutLMv3Intermediate(config, name="intermediate")
+ self.bert_output = TFLayoutLMv3Output(config, name="output")
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ attention_mask: Optional[tf.Tensor],
+ head_mask: Optional[tf.Tensor],
+ output_attentions: bool,
+ rel_pos: Optional[tf.Tensor] = None,
+ rel_2d_pos: Optional[tf.Tensor] = None,
+ training: bool = False,
+ ) -> Union[Tuple[tf.Tensor], Tuple[tf.Tensor, tf.Tensor]]:
+ self_attention_outputs = self.attention(
+ hidden_states,
+ attention_mask,
+ head_mask,
+ output_attentions=output_attentions,
+ rel_pos=rel_pos,
+ rel_2d_pos=rel_2d_pos,
+ training=training,
+ )
+ attention_output = self_attention_outputs[0]
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
+ intermediate_output = self.intermediate(attention_output)
+ layer_output = self.bert_output(intermediate_output, attention_output, training=training)
+ outputs = (layer_output,) + outputs
+ return outputs
+
+
+class TFLayoutLMv3Encoder(tf.keras.layers.Layer):
+ def __init__(self, config: LayoutLMv3Config, **kwargs):
+ super().__init__(**kwargs)
+ self.config = config
+ self.layer = [TFLayoutLMv3Layer(config, name=f"layer.{i}") for i in range(config.num_hidden_layers)]
+
+ self.has_relative_attention_bias = config.has_relative_attention_bias
+ self.has_spatial_attention_bias = config.has_spatial_attention_bias
+
+ if self.has_relative_attention_bias:
+ self.rel_pos_bins = config.rel_pos_bins
+ self.max_rel_pos = config.max_rel_pos
+ self.rel_pos_bias = tf.keras.layers.Dense(
+ units=config.num_attention_heads,
+ kernel_initializer=get_initializer(config.initializer_range),
+ use_bias=False,
+ name="rel_pos_bias",
+ )
+
+ if self.has_spatial_attention_bias:
+ self.max_rel_2d_pos = config.max_rel_2d_pos
+ self.rel_2d_pos_bins = config.rel_2d_pos_bins
+ self.rel_pos_x_bias = tf.keras.layers.Dense(
+ units=config.num_attention_heads,
+ kernel_initializer=get_initializer(config.initializer_range),
+ use_bias=False,
+ name="rel_pos_x_bias",
+ )
+ self.rel_pos_y_bias = tf.keras.layers.Dense(
+ units=config.num_attention_heads,
+ kernel_initializer=get_initializer(config.initializer_range),
+ use_bias=False,
+ name="rel_pos_y_bias",
+ )
+
+ def relative_position_bucket(self, relative_positions: tf.Tensor, num_buckets: int, max_distance: int):
+ # the negative relative positions are assigned to the interval [0, num_buckets / 2]
+ # we deal with this by assigning absolute relative positions to the interval [0, num_buckets / 2]
+ # and then offsetting the positive relative positions by num_buckets / 2 at the end
+ num_buckets = num_buckets // 2
+ buckets = tf.abs(relative_positions)
+
+ # half of the buckets are for exact increments in positions
+ max_exact_buckets = num_buckets // 2
+ is_small = buckets < max_exact_buckets
+
+ # the other half of the buckets are for logarithmically bigger bins in positions up to max_distance
+ buckets_log_ratio = tf.math.log(tf.cast(buckets, tf.float32) / max_exact_buckets)
+ distance_log_ratio = math.log(max_distance / max_exact_buckets)
+ buckets_big_offset = (
+ buckets_log_ratio / distance_log_ratio * (num_buckets - max_exact_buckets)
+ ) # scale is [0, num_buckets - max_exact_buckets]
+ buckets_big = max_exact_buckets + buckets_big_offset # scale is [max_exact_buckets, num_buckets]
+ buckets_big = tf.cast(buckets_big, buckets.dtype)
+ buckets_big = tf.minimum(buckets_big, num_buckets - 1)
+
+ return (tf.cast(relative_positions > 0, buckets.dtype) * num_buckets) + tf.where(
+ is_small, buckets, buckets_big
+ )
+
+ def _cal_pos_emb(
+ self,
+ dense_layer: tf.keras.layers.Dense,
+ position_ids: tf.Tensor,
+ num_buckets: int,
+ max_distance: int,
+ ):
+ rel_pos_matrix = tf.expand_dims(position_ids, axis=-2) - tf.expand_dims(position_ids, axis=-1)
+ rel_pos = self.relative_position_bucket(rel_pos_matrix, num_buckets, max_distance)
+ rel_pos_one_hot = tf.one_hot(rel_pos, depth=num_buckets, dtype=self.compute_dtype)
+ embedding = dense_layer(rel_pos_one_hot)
+ # batch_size, seq_length, seq_length, num_heads --> batch_size, num_heads, seq_length, seq_length
+ embedding = tf.transpose(embedding, [0, 3, 1, 2])
+ embedding = tf.cast(embedding, dtype=self.compute_dtype)
+ return embedding
+
+ def _cal_1d_pos_emb(self, position_ids: tf.Tensor):
+ return self._cal_pos_emb(self.rel_pos_bias, position_ids, self.rel_pos_bins, self.max_rel_pos)
+
+ def _cal_2d_pos_emb(self, bbox: tf.Tensor):
+ position_coord_x = bbox[:, :, 0] # left
+ position_coord_y = bbox[:, :, 3] # bottom
+ rel_pos_x = self._cal_pos_emb(
+ self.rel_pos_x_bias,
+ position_coord_x,
+ self.rel_2d_pos_bins,
+ self.max_rel_2d_pos,
+ )
+ rel_pos_y = self._cal_pos_emb(
+ self.rel_pos_y_bias,
+ position_coord_y,
+ self.rel_2d_pos_bins,
+ self.max_rel_2d_pos,
+ )
+ rel_2d_pos = rel_pos_x + rel_pos_y
+ return rel_2d_pos
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ bbox: Optional[tf.Tensor] = None,
+ attention_mask: Optional[tf.Tensor] = None,
+ head_mask: Optional[tf.Tensor] = None,
+ output_attentions: bool = False,
+ output_hidden_states: bool = False,
+ return_dict: bool = True,
+ position_ids: Optional[tf.Tensor] = None,
+ training: bool = False,
+ ) -> Union[
+ TFBaseModelOutput,
+ Tuple[tf.Tensor],
+ Tuple[tf.Tensor, tf.Tensor],
+ Tuple[tf.Tensor, tf.Tensor, tf.Tensor],
+ ]:
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+
+ rel_pos = self._cal_1d_pos_emb(position_ids) if self.has_relative_attention_bias else None
+ rel_2d_pos = self._cal_2d_pos_emb(bbox) if self.has_spatial_attention_bias else None
+
+ for i, layer_module in enumerate(self.layer):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ layer_head_mask = head_mask[i] if head_mask is not None else None
+
+ layer_outputs = layer_module(
+ hidden_states,
+ attention_mask,
+ layer_head_mask,
+ output_attentions,
+ rel_pos=rel_pos,
+ rel_2d_pos=rel_2d_pos,
+ training=training,
+ )
+
+ hidden_states = layer_outputs[0]
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if return_dict:
+ return TFBaseModelOutput(
+ last_hidden_state=hidden_states,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ )
+ else:
+ return tuple(
+ value for value in [hidden_states, all_hidden_states, all_self_attentions] if value is not None
+ )
+
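To make `relative_position_bucket` concrete, here is a self-contained copy of its logic run on a few offsets with the default `rel_pos_bins=32` / `max_rel_pos=128` (assumed here): small offsets get exact buckets, larger ones share log-spaced buckets, and positive offsets are shifted by `num_buckets // 2` to preserve the sign.

```python
import math

import tensorflow as tf


def relative_position_bucket(relative_positions, num_buckets=32, max_distance=128):
    # Mirrors TFLayoutLMv3Encoder.relative_position_bucket above.
    num_buckets = num_buckets // 2
    buckets = tf.abs(relative_positions)
    max_exact = num_buckets // 2
    is_small = buckets < max_exact
    buckets_big = max_exact + tf.cast(
        tf.math.log(tf.cast(buckets, tf.float32) / max_exact)
        / math.log(max_distance / max_exact)
        * (num_buckets - max_exact),
        buckets.dtype,
    )
    buckets_big = tf.minimum(buckets_big, num_buckets - 1)
    return tf.cast(relative_positions > 0, buckets.dtype) * num_buckets + tf.where(is_small, buckets, buckets_big)


print(relative_position_bucket(tf.constant([-64, -3, 0, 3, 64])).numpy())  # [14  3  0 19 30]
```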
+
+@keras_serializable
+class TFLayoutLMv3MainLayer(tf.keras.layers.Layer):
+ config_class = LayoutLMv3Config
+
+ def __init__(self, config: LayoutLMv3Config, **kwargs):
+ super().__init__(**kwargs)
+
+ self.config = config
+
+ if config.text_embed:
+ self.embeddings = TFLayoutLMv3TextEmbeddings(config, name="embeddings")
+
+ if config.visual_embed:
+ self.patch_embed = TFLayoutLMv3PatchEmbeddings(config, name="patch_embed")
+ self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+ self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
+
+ if config.has_relative_attention_bias or config.has_spatial_attention_bias:
+ image_size = config.input_size // config.patch_size
+ self.init_visual_bbox(image_size=(image_size, image_size))
+
+ self.norm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="norm")
+
+ self.encoder = TFLayoutLMv3Encoder(config, name="encoder")
+
+ def build(self, input_shape: tf.TensorShape):
+ if self.config.visual_embed:
+ image_size = self.config.input_size // self.config.patch_size
+ self.cls_token = self.add_weight(
+ shape=(1, 1, self.config.hidden_size),
+ initializer="zeros",
+ trainable=True,
+ dtype=tf.float32,
+ name="cls_token",
+ )
+ self.pos_embed = self.add_weight(
+ shape=(1, image_size * image_size + 1, self.config.hidden_size),
+ initializer="zeros",
+ trainable=True,
+ dtype=tf.float32,
+ name="pos_embed",
+ )
+
+ super().build(input_shape)
+
+ def get_input_embeddings(self) -> tf.keras.layers.Layer:
+ return self.embeddings.word_embeddings
+
+ def set_input_embeddings(self, value: tf.Variable):
+ self.embeddings.word_embeddings.weight = value
+
+ # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+ class PreTrainedModel
+ """
+ raise NotImplementedError
+
+ def init_visual_bbox(self, image_size: Tuple[int, int], max_len: int = 1000):
+ # We should not hardcode max_len to 1000, but it is done by the reference implementation,
+ # so we keep it for compatibility with the pretrained weights. The more correct approach
+ # would have been to pass max_len=config.max_2d_position_embeddings - 1.
+ height, width = image_size
+
+ visual_bbox_x = tf.range(0, max_len * (width + 1), max_len) // width
+ visual_bbox_x = tf.expand_dims(visual_bbox_x, axis=0)
+ visual_bbox_x = tf.tile(visual_bbox_x, [width, 1]) # (width, width + 1)
+
+ visual_bbox_y = tf.range(0, max_len * (height + 1), max_len) // height
+ visual_bbox_y = tf.expand_dims(visual_bbox_y, axis=1)
+ visual_bbox_y = tf.tile(visual_bbox_y, [1, height]) # (height + 1, height)
+
+ visual_bbox = tf.stack(
+ [visual_bbox_x[:, :-1], visual_bbox_y[:-1], visual_bbox_x[:, 1:], visual_bbox_y[1:]],
+ axis=-1,
+ )
+ visual_bbox = tf.reshape(visual_bbox, [-1, 4])
+
+ cls_token_box = tf.constant([[1, 1, max_len - 1, max_len - 1]], dtype=tf.int32)
+ self.visual_bbox = tf.concat([cls_token_box, visual_bbox], axis=0)
+
+ def calculate_visual_bbox(self, batch_size: int, dtype: tf.DType):
+ visual_bbox = tf.expand_dims(self.visual_bbox, axis=0)
+ visual_bbox = tf.tile(visual_bbox, [batch_size, 1, 1])
+ visual_bbox = tf.cast(visual_bbox, dtype=dtype)
+ return visual_bbox
+
+ def embed_image(self, pixel_values: tf.Tensor) -> tf.Tensor:
+ embeddings = self.patch_embed(pixel_values)
+
+ # add [CLS] token
+ batch_size = tf.shape(embeddings)[0]
+ cls_tokens = tf.tile(self.cls_token, [batch_size, 1, 1])
+ embeddings = tf.concat([cls_tokens, embeddings], axis=1)
+
+ # add position embeddings
+ if getattr(self, "pos_embed", None) is not None:
+ embeddings += self.pos_embed
+
+ embeddings = self.norm(embeddings)
+ return embeddings
+
+ def get_extended_attention_mask(self, attention_mask: tf.Tensor) -> tf.Tensor:
+ # Adapted from transformers.modelling_utils.ModuleUtilsMixin.get_extended_attention_mask
+
+ n_dims = len(attention_mask.shape)
+
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+ # ourselves in which case we just need to make it broadcastable to all heads.
+ if n_dims == 3:
+ extended_attention_mask = tf.expand_dims(attention_mask, axis=1)
+ elif n_dims == 2:
+ # Provided a padding mask of dimensions [batch_size, seq_length].
+ # Make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length].
+ extended_attention_mask = tf.expand_dims(attention_mask, axis=1) # (batch_size, 1, seq_length)
+ extended_attention_mask = tf.expand_dims(extended_attention_mask, axis=1) # (batch_size, 1, 1, seq_length)
+ else:
+ raise ValueError(f"Wrong shape for attention_mask (shape {attention_mask.shape}).")
+
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+ # masked positions, this operation will create a tensor which is 0.0 for
+ # positions we want to attend and LARGE_NEGATIVE (-1e8) for masked positions.
+ # Since we are adding it to the raw scores before the softmax, this is
+ # effectively the same as removing these entirely.
+ extended_attention_mask = tf.cast(extended_attention_mask, self.compute_dtype)
+ extended_attention_mask = (1.0 - extended_attention_mask) * LARGE_NEGATIVE
+
+ return extended_attention_mask
+
+ def get_head_mask(self, head_mask: Optional[tf.Tensor]) -> Union[tf.Tensor, List[Optional[tf.Tensor]]]:
+ if head_mask is None:
+ return [None] * self.config.num_hidden_layers
+
+ n_dims = tf.rank(head_mask)
+ if n_dims == 1:
+ # Gets a tensor with masks for each head (H).
+ head_mask = tf.expand_dims(head_mask, axis=0) # 1, num_heads
+ head_mask = tf.expand_dims(head_mask, axis=0) # 1, 1, num_heads
+ head_mask = tf.expand_dims(head_mask, axis=-1) # 1, 1, num_heads, 1
+ head_mask = tf.expand_dims(head_mask, axis=-1) # 1, 1, num_heads, 1, 1
+ head_mask = tf.tile(
+ head_mask, [self.config.num_hidden_layers, 1, 1, 1, 1]
+ ) # num_hidden_layers, 1, num_heads, 1, 1
+ elif n_dims == 2:
+ # Gets a tensor with masks for each layer (L) and head (H).
+ head_mask = tf.expand_dims(head_mask, axis=1) # num_hidden_layers, 1, num_heads
+ head_mask = tf.expand_dims(head_mask, axis=-1) # num_hidden_layers, 1, num_heads, 1
+ head_mask = tf.expand_dims(head_mask, axis=-1) # num_hidden_layers, 1, num_heads, 1, 1
+ elif n_dims != 5:
+ raise ValueError(f"Wrong shape for head_mask (shape {head_mask.shape}).")
+ assert tf.rank(head_mask) == 5, f"Got head_mask rank of {tf.rank(head_mask)}, but require 5."
+ head_mask = tf.cast(head_mask, self.compute_dtype)
+ return head_mask
+
+ @unpack_inputs
+ def call(
+ self,
+ input_ids: Optional[tf.Tensor] = None,
+ bbox: Optional[tf.Tensor] = None,
+ attention_mask: Optional[tf.Tensor] = None,
+ token_type_ids: Optional[tf.Tensor] = None,
+ position_ids: Optional[tf.Tensor] = None,
+ head_mask: Optional[tf.Tensor] = None,
+ inputs_embeds: Optional[tf.Tensor] = None,
+ pixel_values: Optional[tf.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> Union[
+ TFBaseModelOutput,
+ Tuple[tf.Tensor],
+ Tuple[tf.Tensor, tf.Tensor],
+ Tuple[tf.Tensor, tf.Tensor, tf.Tensor],
+ ]:
+ # This method can be called with a variety of modalities:
+ # 1. text + layout
+ # 2. text + layout + image
+ # 3. image
+ # The complexity of this method is mostly just due to handling of these different modalities.
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+ if input_ids is not None:
+ input_shape = tf.shape(input_ids)
+ batch_size = input_shape[0]
+ seq_length = input_shape[1]
+ elif inputs_embeds is not None:
+ input_shape = tf.shape(inputs_embeds)
+ batch_size = input_shape[0]
+ seq_length = input_shape[1]
+ elif pixel_values is not None:
+ batch_size = tf.shape(pixel_values)[0]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds or pixel_values")
+
+ # Determine which integer dtype to use.
+ if input_ids is not None:
+ int_dtype = input_ids.dtype
+ elif bbox is not None:
+ int_dtype = bbox.dtype
+ elif attention_mask is not None:
+ int_dtype = attention_mask.dtype
+ elif token_type_ids is not None:
+ int_dtype = token_type_ids.dtype
+ else:
+ int_dtype = tf.int32
+
+ if input_ids is not None or inputs_embeds is not None:
+ if attention_mask is None:
+ attention_mask = tf.ones((batch_size, seq_length), dtype=int_dtype)
+ if token_type_ids is None:
+ token_type_ids = tf.zeros((batch_size, seq_length), dtype=int_dtype)
+ if bbox is None:
+ bbox = tf.zeros((batch_size, seq_length, 4), dtype=int_dtype)
+
+ embedding_output = self.embeddings(
+ input_ids=input_ids,
+ bbox=bbox,
+ position_ids=position_ids,
+ token_type_ids=token_type_ids,
+ inputs_embeds=inputs_embeds,
+ training=training,
+ )
+
+ final_bbox = None
+ final_position_ids = None
+ if pixel_values is not None:
+ # embed image
+ visual_embeddings = self.embed_image(pixel_values)
+
+ # calculate attention mask
+ visual_attention_mask = tf.ones((batch_size, tf.shape(visual_embeddings)[1]), dtype=int_dtype)
+ if attention_mask is None:
+ attention_mask = visual_attention_mask
+ else:
+ attention_mask = tf.concat([attention_mask, visual_attention_mask], axis=1)
+
+ # calculate bounding boxes
+ if self.config.has_spatial_attention_bias:
+ visual_bbox = self.calculate_visual_bbox(batch_size, int_dtype)
+ if bbox is None:
+ final_bbox = visual_bbox
+ else:
+ final_bbox = tf.concat([bbox, visual_bbox], axis=1)
+
+ # calculate position IDs
+ if self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
+ visual_position_ids = tf.range(0, tf.shape(visual_embeddings)[1], dtype=int_dtype)
+ visual_position_ids = tf.expand_dims(visual_position_ids, axis=0)
+ visual_position_ids = tf.tile(visual_position_ids, [batch_size, 1])
+
+ if input_ids is not None or inputs_embeds is not None:
+ position_ids = tf.expand_dims(tf.range(0, seq_length, dtype=int_dtype), axis=0)
+ position_ids = tf.tile(position_ids, [batch_size, 1])
+ final_position_ids = tf.concat([position_ids, visual_position_ids], axis=1)
+ else:
+ final_position_ids = visual_position_ids
+
+ # calculate embeddings
+ if input_ids is None and inputs_embeds is None:
+ embedding_output = visual_embeddings
+ else:
+ embedding_output = tf.concat([embedding_output, visual_embeddings], axis=1)
+ embedding_output = self.LayerNorm(embedding_output)
+ embedding_output = self.dropout(embedding_output, training=training)
+
+ elif self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
+ if self.config.has_relative_attention_bias:
+ position_ids = tf.expand_dims(tf.range(0, seq_length, dtype=int_dtype), axis=0)
+ position_ids = tf.tile(position_ids, [batch_size, 1])
+ final_position_ids = position_ids
+
+ if self.config.has_spatial_attention_bias:
+ final_bbox = bbox
+
+ extended_attention_mask = self.get_extended_attention_mask(attention_mask)
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicate we keep the head
+ # attention_probs has shape batch_size x num_heads x seq_length x seq_length
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+ head_mask = self.get_head_mask(head_mask)
+
+ encoder_outputs = self.encoder(
+ embedding_output,
+ bbox=final_bbox,
+ position_ids=final_position_ids,
+ attention_mask=extended_attention_mask,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = encoder_outputs[0]
+
+ if not return_dict:
+ return (sequence_output,) + encoder_outputs[1:]
+
+ return TFBaseModelOutput(
+ last_hidden_state=sequence_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
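For reference, a tiny sketch of what `get_extended_attention_mask` produces for a 2D padding mask: ones become 0.0 (attend) and zeros become `LARGE_NEGATIVE` (masked), in a shape broadcastable over heads and query positions.

```python
import tensorflow as tf

LARGE_NEGATIVE = -1e8
attention_mask = tf.constant([[1.0, 1.0, 0.0]])  # (batch_size, seq_length)

extended = attention_mask[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_length)
extended = (1.0 - extended) * LARGE_NEGATIVE
print(extended.numpy())  # zeros where attended, -1e8 at the masked position
```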
+
+class TFLayoutLMv3PreTrainedModel(TFPreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = LayoutLMv3Config
+ base_model_prefix = "layoutlmv3"
+
+ @property
+ def dummy_inputs(self) -> Dict[str, tf.Tensor]:
+ size = self.config.input_size
+ image_shape = (2, self.config.num_channels, size, size)
+ pixel_values = tf.random.uniform(shape=image_shape, minval=-1, maxval=1)
+ return {
+ "input_ids": tf.constant(_DUMMY_INPUT_IDS, dtype=tf.int32),
+ "bbox": tf.constant(_DUMMY_BBOX, dtype=tf.int32),
+ "pixel_values": pixel_values,
+ }
+
+ @tf.function(
+ input_signature=[
+ {
+ "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"),
+ "bbox": tf.TensorSpec((None, None, 4), tf.int32, name="bbox"),
+ "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"),
+ "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"),
+ }
+ ]
+ )
+ def serving(self, inputs):
+ """
+ Method used for serving the model.
+
+ Args:
+ inputs (`Dict[str, tf.Tensor]`):
+ The input of the saved model as a dictionary of tensors.
+ """
+ output = self.call(inputs)
+
+ return self.serving_output(output)
+
+
+LAYOUTLMV3_START_DOCSTRING = r"""
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads,
+ etc.).
+
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+ as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+ behavior.
+
+ <Tip>
+
+ TF 2.0 models accept two formats as input:
+
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional argument.
+
+ This second option is useful when using the [`tf.keras.Model.fit`] method, which currently requires having all the
+ tensors in the first argument of the model call function: `model(inputs)`.
+
+ </Tip>
+
+ Parameters:
+ config ([`LayoutLMv3Config`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+LAYOUTLMV3_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
+ token. See `pixel_values` for `patch_sequence_length`.
+
+ Indices can be obtained using [`LayoutLMv3Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+
+ bbox (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
+ Bounding boxes of each input sequence token. Selected in the range `[0,
+ config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
+ format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
+ y1) represents the position of the lower right corner.
+
+ Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
+ token. See `pixel_values` for `patch_sequence_length`.
+
+ pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
+ Batch of document images. Each image is divided into patches of shape `(num_channels, config.patch_size,
+ config.patch_size)` and the total number of patches (=`patch_sequence_length`) equals `((height /
+ config.patch_size) * (width / config.patch_size))`.
+
+ attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
+ token. See `pixel_values` for `patch_sequence_length`.
+
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+ 1]`:
+
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
+
+ Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
+ token. See `pixel_values` for `patch_sequence_length`.
+
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
+ token. See `pixel_values` for `patch_sequence_length`.
+
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare LayoutLMv3 Model transformer outputting raw hidden-states without any specific head on top.",
+ LAYOUTLMV3_START_DOCSTRING,
+)
+class TFLayoutLMv3Model(TFLayoutLMv3PreTrainedModel):
+ # names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
+ _keys_to_ignore_on_load_unexpected = [r"position_ids"]
+
+ def __init__(self, config, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+ self.layoutlmv3 = TFLayoutLMv3MainLayer(config, name="layoutlmv3")
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=TFBaseModelOutput, config_class=_CONFIG_FOR_DOC)
+ def call(
+ self,
+ input_ids: Optional[tf.Tensor] = None,
+ bbox: Optional[tf.Tensor] = None,
+ attention_mask: Optional[tf.Tensor] = None,
+ token_type_ids: Optional[tf.Tensor] = None,
+ position_ids: Optional[tf.Tensor] = None,
+ head_mask: Optional[tf.Tensor] = None,
+ inputs_embeds: Optional[tf.Tensor] = None,
+ pixel_values: Optional[tf.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> Union[
+ TFBaseModelOutput,
+ Tuple[tf.Tensor],
+ Tuple[tf.Tensor, tf.Tensor],
+ Tuple[tf.Tensor, tf.Tensor, tf.Tensor],
+ ]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoProcessor, TFAutoModel
+ >>> from datasets import load_dataset
+
+ >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
+ >>> model = TFAutoModel.from_pretrained("microsoft/layoutlmv3-base")
+
+ >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+ >>> example = dataset[0]
+ >>> image = example["image"]
+ >>> words = example["tokens"]
+ >>> boxes = example["bboxes"]
+
+ >>> encoding = processor(image, words, boxes=boxes, return_tensors="tf")
+
+ >>> outputs = model(**encoding)
+ >>> last_hidden_states = outputs.last_hidden_state
+ ```"""
+
+ outputs = self.layoutlmv3(
+ input_ids=input_ids,
+ bbox=bbox,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ return outputs
+
+ def serving_output(self, output: TFBaseModelOutput) -> TFBaseModelOutput:
+ hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
+ attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
+
+ return TFBaseModelOutput(
+ last_hidden_state=output.last_hidden_state,
+ hidden_states=hs,
+ attentions=attns,
+ )
+
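Because `TFLayoutLMv3PreTrainedModel.serving` pins a concrete input signature, the model can be exported as a TensorFlow SavedModel. A minimal export sketch (the output directory name is arbitrary):

```python
import tensorflow as tf
from transformers import TFLayoutLMv3Model

model = TFLayoutLMv3Model.from_pretrained("microsoft/layoutlmv3-base")
tf.saved_model.save(
    model, "layoutlmv3_saved_model", signatures={"serving_default": model.serving}
)

reloaded = tf.saved_model.load("layoutlmv3_saved_model")
print(list(reloaded.signatures))  # ["serving_default"]
```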
+
+class TFLayoutLMv3ClassificationHead(tf.keras.layers.Layer):
+ """
+ Head for sentence-level classification tasks. Reference: RobertaClassificationHead
+ """
+
+ def __init__(self, config: LayoutLMv3Config, **kwargs):
+ super().__init__(**kwargs)
+ self.dense = tf.keras.layers.Dense(
+ config.hidden_size,
+ activation="tanh",
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="dense",
+ )
+ classifier_dropout = (
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+ )
+ self.dropout = tf.keras.layers.Dropout(
+ classifier_dropout,
+ name="dropout",
+ )
+ self.out_proj = tf.keras.layers.Dense(
+ config.num_labels,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="out_proj",
+ )
+
+ def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor:
+ outputs = self.dropout(inputs, training=training)
+ outputs = self.dense(outputs)
+ outputs = self.dropout(outputs, training=training)
+ outputs = self.out_proj(outputs)
+ return outputs
+
+
+@add_start_docstrings(
+ """
+ LayoutLMv3 Model with a sequence classification head on top (a linear layer on top of the final hidden state of the
+ [CLS] token) e.g. for document image classification tasks such as the
+ [RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/) dataset.
+ """,
+ LAYOUTLMV3_START_DOCSTRING,
+)
+class TFLayoutLMv3ForSequenceClassification(TFLayoutLMv3PreTrainedModel, TFSequenceClassificationLoss):
+ # names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
+ _keys_to_ignore_on_load_unexpected = [r"position_ids"]
+
+ def __init__(self, config: LayoutLMv3Config, **kwargs):
+ super().__init__(config, **kwargs)
+ self.config = config
+ self.layoutlmv3 = TFLayoutLMv3MainLayer(config, name="layoutlmv3")
+ self.classifier = TFLayoutLMv3ClassificationHead(config, name="classifier")
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
+ def call(
+ self,
+ input_ids: Optional[tf.Tensor] = None,
+ attention_mask: Optional[tf.Tensor] = None,
+ token_type_ids: Optional[tf.Tensor] = None,
+ position_ids: Optional[tf.Tensor] = None,
+ head_mask: Optional[tf.Tensor] = None,
+ inputs_embeds: Optional[tf.Tensor] = None,
+ labels: Optional[tf.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ bbox: Optional[tf.Tensor] = None,
+ pixel_values: Optional[tf.Tensor] = None,
+ training: Optional[bool] = False,
+ ) -> Union[
+ TFSequenceClassifierOutput,
+ Tuple[tf.Tensor],
+ Tuple[tf.Tensor, tf.Tensor],
+ Tuple[tf.Tensor, tf.Tensor, tf.Tensor],
+ Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor],
+ ]:
+ """
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoProcessor, TFAutoModelForSequenceClassification
+ >>> from datasets import load_dataset
+ >>> import tensorflow as tf
+
+ >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
+ >>> model = TFAutoModelForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")
+
+ >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+ >>> example = dataset[0]
+ >>> image = example["image"]
+ >>> words = example["tokens"]
+ >>> boxes = example["bboxes"]
+
+ >>> encoding = processor(image, words, boxes=boxes, return_tensors="tf")
+ >>> sequence_label = tf.convert_to_tensor([1])
+
+ >>> outputs = model(**encoding, labels=sequence_label)
+ >>> loss = outputs.loss
+ >>> logits = outputs.logits
+ ```"""
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.layoutlmv3(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ bbox=bbox,
+ pixel_values=pixel_values,
+ training=training,
+ )
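+        # pool the sequence by taking the final hidden state of the [CLS] token (the first token)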
+ sequence_output = outputs[0][:, 0, :]
+ logits = self.classifier(sequence_output, training=training)
+
+ loss = None if labels is None else self.hf_compute_loss(labels, logits)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TFSequenceClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output
+ def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput:
+ hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
+ attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
+
+ return TFSequenceClassifierOutput(logits=output.logits, hidden_states=hs, attentions=attns)
+
+
+@add_start_docstrings(
+ """
+ LayoutLMv3 Model with a token classification head on top (a linear layer on top of the final hidden states) e.g.
+ for sequence labeling (information extraction) tasks such as [FUNSD](https://guillaumejaume.github.io/FUNSD/),
+ [SROIE](https://rrc.cvc.uab.es/?ch=13), [CORD](https://github.com/clovaai/cord) and
+ [Kleister-NDA](https://github.com/applicaai/kleister-nda).
+ """,
+ LAYOUTLMV3_START_DOCSTRING,
+)
+class TFLayoutLMv3ForTokenClassification(TFLayoutLMv3PreTrainedModel, TFTokenClassificationLoss):
+    # names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
+ _keys_to_ignore_on_load_unexpected = [r"position_ids"]
+
+ def __init__(self, config: LayoutLMv3Config, **kwargs):
+ super().__init__(config, **kwargs)
+ self.num_labels = config.num_labels
+
+ self.layoutlmv3 = TFLayoutLMv3MainLayer(config, name="layoutlmv3")
+ self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
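+        # as in the PyTorch implementation, use a plain linear classifier for small label counts
+        # and the Roberta-style two-layer head otherwise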
+ if config.num_labels < 10:
+ self.classifier = tf.keras.layers.Dense(
+ config.num_labels,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="classifier",
+ )
+ else:
+ self.classifier = TFLayoutLMv3ClassificationHead(config, name="classifier")
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=TFTokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
+ def call(
+ self,
+ input_ids: Optional[tf.Tensor] = None,
+ bbox: Optional[tf.Tensor] = None,
+ attention_mask: Optional[tf.Tensor] = None,
+ token_type_ids: Optional[tf.Tensor] = None,
+ position_ids: Optional[tf.Tensor] = None,
+ head_mask: Optional[tf.Tensor] = None,
+ inputs_embeds: Optional[tf.Tensor] = None,
+ labels: Optional[tf.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ pixel_values: Optional[tf.Tensor] = None,
+ training: Optional[bool] = False,
+ ) -> Union[
+ TFTokenClassifierOutput,
+ Tuple[tf.Tensor],
+ Tuple[tf.Tensor, tf.Tensor],
+ Tuple[tf.Tensor, tf.Tensor, tf.Tensor],
+ Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor],
+ ]:
+ r"""
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoProcessor, TFAutoModelForTokenClassification
+ >>> from datasets import load_dataset
+
+ >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
+ >>> model = TFAutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7)
+
+ >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+ >>> example = dataset[0]
+ >>> image = example["image"]
+ >>> words = example["tokens"]
+ >>> boxes = example["bboxes"]
+ >>> word_labels = example["ner_tags"]
+
+ >>> encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="tf")
+
+ >>> outputs = model(**encoding)
+ >>> loss = outputs.loss
+ >>> logits = outputs.logits
+ ```"""
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.layoutlmv3(
+ input_ids,
+ bbox=bbox,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ pixel_values=pixel_values,
+ training=training,
+ )
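+        # recover the text sequence length from whichever text input was provided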
+ if input_ids is not None:
+ input_shape = tf.shape(input_ids)
+ else:
+ input_shape = tf.shape(inputs_embeds)[:-1]
+
+ seq_length = input_shape[1]
+ # only take the text part of the output representations
+ sequence_output = outputs[0][:, :seq_length]
+ sequence_output = self.dropout(sequence_output, training=training)
+ logits = self.classifier(sequence_output)
+
+ loss = None if labels is None else self.hf_compute_loss(labels, logits)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TFTokenClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output
+ def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput:
+ hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
+ attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
+
+ return TFTokenClassifierOutput(logits=output.logits, hidden_states=hs, attentions=attns)
+
+
+@add_start_docstrings(
+ """
+ LayoutLMv3 Model with a span classification head on top for extractive question-answering tasks such as
+ [DocVQA](https://rrc.cvc.uab.es/?ch=17) (a linear layer on top of the text part of the hidden-states output to
+ compute `span start logits` and `span end logits`).
+ """,
+ LAYOUTLMV3_START_DOCSTRING,
+)
+class TFLayoutLMv3ForQuestionAnswering(TFLayoutLMv3PreTrainedModel, TFQuestionAnsweringLoss):
+    # names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
+ _keys_to_ignore_on_load_unexpected = [r"position_ids"]
+
+ def __init__(self, config: LayoutLMv3Config, **kwargs):
+ super().__init__(config, **kwargs)
+
+ self.num_labels = config.num_labels
+
+ self.layoutlmv3 = TFLayoutLMv3MainLayer(config, name="layoutlmv3")
+ self.qa_outputs = TFLayoutLMv3ClassificationHead(config, name="qa_outputs")
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=TFQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
+ def call(
+ self,
+ input_ids: Optional[tf.Tensor] = None,
+ attention_mask: Optional[tf.Tensor] = None,
+ token_type_ids: Optional[tf.Tensor] = None,
+ position_ids: Optional[tf.Tensor] = None,
+ head_mask: Optional[tf.Tensor] = None,
+ inputs_embeds: Optional[tf.Tensor] = None,
+ start_positions: Optional[tf.Tensor] = None,
+ end_positions: Optional[tf.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ bbox: Optional[tf.Tensor] = None,
+ pixel_values: Optional[tf.Tensor] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> Union[
+ TFQuestionAnsweringModelOutput,
+ Tuple[tf.Tensor],
+ Tuple[tf.Tensor, tf.Tensor],
+ Tuple[tf.Tensor, tf.Tensor, tf.Tensor],
+ Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor],
+ ]:
+ r"""
+ start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
+            sequence are not taken into account for computing the loss.
+ end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
+            sequence are not taken into account for computing the loss.
+
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoProcessor, TFAutoModelForQuestionAnswering
+ >>> from datasets import load_dataset
+ >>> import tensorflow as tf
+
+ >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
+ >>> model = TFAutoModelForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base")
+
+ >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+ >>> example = dataset[0]
+ >>> image = example["image"]
+ >>> question = "what's his name?"
+ >>> words = example["tokens"]
+ >>> boxes = example["bboxes"]
+
+ >>> encoding = processor(image, question, words, boxes=boxes, return_tensors="tf")
+ >>> start_positions = tf.convert_to_tensor([1])
+ >>> end_positions = tf.convert_to_tensor([3])
+
+ >>> outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions)
+ >>> loss = outputs.loss
+ >>> start_scores = outputs.start_logits
+ >>> end_scores = outputs.end_logits
+ ```"""
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.layoutlmv3(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ bbox=bbox,
+ pixel_values=pixel_values,
+ training=training,
+ )
+
+ sequence_output = outputs[0]
+
+ logits = self.qa_outputs(sequence_output, training=training)
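+        # the QA head emits two logits per token; split them into start and end span logits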
+ start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1)
+ start_logits = tf.squeeze(input=start_logits, axis=-1)
+ end_logits = tf.squeeze(input=end_logits, axis=-1)
+
+ loss = None
+
+ if start_positions is not None and end_positions is not None:
+ labels = {"start_position": start_positions, "end_position": end_positions}
+ loss = self.hf_compute_loss(labels, logits=(start_logits, end_logits))
+
+ if not return_dict:
+ output = (start_logits, end_logits) + outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TFQuestionAnsweringModelOutput(
+ loss=loss,
+ start_logits=start_logits,
+ end_logits=end_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output
+ def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput:
+ hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
+ attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
+
+ return TFQuestionAnsweringModelOutput(
+ start_logits=output.start_logits, end_logits=output.end_logits, hidden_states=hs, attentions=attns
+ )
diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py
index 5f8124ae5584..e77a414cdce4 100644
--- a/src/transformers/utils/dummy_tf_objects.py
+++ b/src/transformers/utils/dummy_tf_objects.py
@@ -1316,6 +1316,44 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
+TF_LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class TFLayoutLMv3ForQuestionAnswering(metaclass=DummyObject):
+ _backends = ["tf"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["tf"])
+
+
+class TFLayoutLMv3ForSequenceClassification(metaclass=DummyObject):
+ _backends = ["tf"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["tf"])
+
+
+class TFLayoutLMv3ForTokenClassification(metaclass=DummyObject):
+ _backends = ["tf"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["tf"])
+
+
+class TFLayoutLMv3Model(metaclass=DummyObject):
+ _backends = ["tf"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["tf"])
+
+
+class TFLayoutLMv3PreTrainedModel(metaclass=DummyObject):
+ _backends = ["tf"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["tf"])
+
+
class TFLEDForConditionalGeneration(metaclass=DummyObject):
_backends = ["tf"]
diff --git a/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py b/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py
new file mode 100644
index 000000000000..f71aeb0aefb4
--- /dev/null
+++ b/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py
@@ -0,0 +1,497 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the TensorFlow LayoutLMv3 model. """
+
+import copy
+import inspect
+import unittest
+
+import numpy as np
+
+from transformers import is_tf_available, is_vision_available
+from transformers.models.auto import get_values
+from transformers.testing_utils import require_tf, slow
+from transformers.utils import cached_property
+
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
+
+
+if is_tf_available():
+ import tensorflow as tf
+
+ from transformers import (
+ TF_LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST,
+ TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
+ TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
+ TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
+ TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
+ LayoutLMv3Config,
+ TFLayoutLMv3ForQuestionAnswering,
+ TFLayoutLMv3ForSequenceClassification,
+ TFLayoutLMv3ForTokenClassification,
+ TFLayoutLMv3Model,
+ )
+
+if is_vision_available():
+ from PIL import Image
+
+ from transformers import LayoutLMv3FeatureExtractor
+
+
+class TFLayoutLMv3ModelTester:
+ def __init__(
+ self,
+ parent,
+ batch_size=2,
+ num_channels=3,
+ image_size=4,
+ patch_size=2,
+ text_seq_length=7,
+ is_training=True,
+ use_input_mask=True,
+ use_token_type_ids=True,
+ use_labels=True,
+ vocab_size=99,
+ hidden_size=36,
+ num_hidden_layers=3,
+ num_attention_heads=4,
+ intermediate_size=37,
+ hidden_act="gelu",
+ hidden_dropout_prob=0.1,
+ attention_probs_dropout_prob=0.1,
+ max_position_embeddings=512,
+ type_vocab_size=16,
+ type_sequence_label_size=2,
+ initializer_range=0.02,
+ coordinate_size=6,
+ shape_size=6,
+ num_labels=3,
+ num_choices=4,
+ scope=None,
+ range_bbox=1000,
+ ):
+ self.parent = parent
+ self.batch_size = batch_size
+ self.num_channels = num_channels
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.is_training = is_training
+ self.use_input_mask = use_input_mask
+ self.use_token_type_ids = use_token_type_ids
+ self.use_labels = use_labels
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.intermediate_size = intermediate_size
+ self.hidden_act = hidden_act
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.max_position_embeddings = max_position_embeddings
+ self.type_vocab_size = type_vocab_size
+ self.type_sequence_label_size = type_sequence_label_size
+ self.initializer_range = initializer_range
+ self.coordinate_size = coordinate_size
+ self.shape_size = shape_size
+ self.num_labels = num_labels
+ self.num_choices = num_choices
+ self.scope = scope
+ self.range_bbox = range_bbox
+
+ # LayoutLMv3's sequence length equals the number of text tokens + number of patches + 1 (we add 1 for the CLS token)
+ self.text_seq_length = text_seq_length
+ self.image_seq_length = (image_size // patch_size) ** 2 + 1
+ self.seq_length = self.text_seq_length + self.image_seq_length
+
+ def prepare_config_and_inputs(self):
+ input_ids = ids_tensor([self.batch_size, self.text_seq_length], self.vocab_size)
+
+ bbox = ids_tensor([self.batch_size, self.text_seq_length, 4], self.range_bbox)
+ bbox = bbox.numpy()
+ # Ensure that bbox is legal
+ for i in range(bbox.shape[0]):
+ for j in range(bbox.shape[1]):
+ if bbox[i, j, 3] < bbox[i, j, 1]:
+ tmp_coordinate = bbox[i, j, 3]
+ bbox[i, j, 3] = bbox[i, j, 1]
+ bbox[i, j, 1] = tmp_coordinate
+ if bbox[i, j, 2] < bbox[i, j, 0]:
+ tmp_coordinate = bbox[i, j, 2]
+ bbox[i, j, 2] = bbox[i, j, 0]
+ bbox[i, j, 0] = tmp_coordinate
+ bbox = tf.constant(bbox)
+
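+        # pixel values are channels-first, matching the (num_channels, height, width) layout the model expects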
+ pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
+
+ input_mask = None
+ if self.use_input_mask:
+ input_mask = random_attention_mask([self.batch_size, self.text_seq_length])
+
+ token_type_ids = None
+ if self.use_token_type_ids:
+ token_type_ids = ids_tensor([self.batch_size, self.text_seq_length], self.type_vocab_size)
+
+ sequence_labels = None
+ token_labels = None
+ if self.use_labels:
+ sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+ token_labels = ids_tensor([self.batch_size, self.text_seq_length], self.num_labels)
+
+ config = LayoutLMv3Config(
+ vocab_size=self.vocab_size,
+ hidden_size=self.hidden_size,
+ num_hidden_layers=self.num_hidden_layers,
+ num_attention_heads=self.num_attention_heads,
+ intermediate_size=self.intermediate_size,
+ hidden_act=self.hidden_act,
+ hidden_dropout_prob=self.hidden_dropout_prob,
+ attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+ max_position_embeddings=self.max_position_embeddings,
+ type_vocab_size=self.type_vocab_size,
+ initializer_range=self.initializer_range,
+ coordinate_size=self.coordinate_size,
+ shape_size=self.shape_size,
+ input_size=self.image_size,
+ patch_size=self.patch_size,
+ )
+
+ return config, input_ids, bbox, pixel_values, token_type_ids, input_mask, sequence_labels, token_labels
+
+ def create_and_check_model(self, config, input_ids, bbox, pixel_values, token_type_ids, input_mask):
+ model = TFLayoutLMv3Model(config=config)
+
+ # text + image
+ result = model(input_ids, pixel_values=pixel_values, training=False)
+ result = model(
+ input_ids,
+ bbox=bbox,
+ pixel_values=pixel_values,
+ attention_mask=input_mask,
+ token_type_ids=token_type_ids,
+ training=False,
+ )
+ result = model(input_ids, bbox=bbox, pixel_values=pixel_values, training=False)
+
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+
+ # text only
+ result = model(input_ids, training=False)
+ self.parent.assertEqual(
+ result.last_hidden_state.shape, (self.batch_size, self.text_seq_length, self.hidden_size)
+ )
+
+ # image only
+ result = model({"pixel_values": pixel_values}, training=False)
+ self.parent.assertEqual(
+ result.last_hidden_state.shape, (self.batch_size, self.image_seq_length, self.hidden_size)
+ )
+
+ def create_and_check_for_sequence_classification(
+ self, config, input_ids, bbox, pixel_values, token_type_ids, input_mask, sequence_labels
+ ):
+ config.num_labels = self.num_labels
+ model = TFLayoutLMv3ForSequenceClassification(config=config)
+ result = model(
+ input_ids,
+ bbox=bbox,
+ pixel_values=pixel_values,
+ attention_mask=input_mask,
+ token_type_ids=token_type_ids,
+ labels=sequence_labels,
+ training=False,
+ )
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
+
+ def create_and_check_for_token_classification(
+ self, config, input_ids, bbox, pixel_values, token_type_ids, input_mask, token_labels
+ ):
+ config.num_labels = self.num_labels
+ model = TFLayoutLMv3ForTokenClassification(config=config)
+ result = model(
+ input_ids,
+ bbox=bbox,
+ pixel_values=pixel_values,
+ attention_mask=input_mask,
+ token_type_ids=token_type_ids,
+ labels=token_labels,
+ training=False,
+ )
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.text_seq_length, self.num_labels))
+
+ def create_and_check_for_question_answering(
+ self, config, input_ids, bbox, pixel_values, token_type_ids, input_mask, sequence_labels
+ ):
+ config.num_labels = 2
+ model = TFLayoutLMv3ForQuestionAnswering(config=config)
+ result = model(
+ input_ids,
+ bbox=bbox,
+ pixel_values=pixel_values,
+ attention_mask=input_mask,
+ token_type_ids=token_type_ids,
+ start_positions=sequence_labels,
+ end_positions=sequence_labels,
+ training=False,
+ )
+ self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
+ self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
+
+ def prepare_config_and_inputs_for_common(self):
+ config_and_inputs = self.prepare_config_and_inputs()
+ (config, input_ids, bbox, pixel_values, token_type_ids, input_mask, _, _) = config_and_inputs
+ inputs_dict = {
+ "input_ids": input_ids,
+ "bbox": bbox,
+ "pixel_values": pixel_values,
+ "token_type_ids": token_type_ids,
+ "attention_mask": input_mask,
+ }
+ return config, inputs_dict
+
+
+@require_tf
+class TFLayoutLMv3ModelTest(TFModelTesterMixin, unittest.TestCase):
+
+ all_model_classes = (
+ (
+ TFLayoutLMv3Model,
+ TFLayoutLMv3ForQuestionAnswering,
+ TFLayoutLMv3ForSequenceClassification,
+ TFLayoutLMv3ForTokenClassification,
+ )
+ if is_tf_available()
+ else ()
+ )
+
+ test_pruning = False
+ test_resize_embeddings = False
+ test_onnx = False
+
+ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> dict:
+ inputs_dict = copy.deepcopy(inputs_dict)
+
+ if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
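+            # tile every tensor input along a new second axis, repeating each example once per choice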
+ inputs_dict = {
+ k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1))
+ if isinstance(v, tf.Tensor) and v.ndim > 0
+ else v
+ for k, v in inputs_dict.items()
+ }
+
+ if return_labels:
+ if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
+ inputs_dict["labels"] = tf.ones(self.model_tester.batch_size, dtype=tf.int32)
+ elif model_class in get_values(TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING):
+ inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
+ inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
+ elif model_class in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
+ inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
+ elif model_class in get_values(TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING):
+ inputs_dict["labels"] = tf.zeros(
+ (self.model_tester.batch_size, self.model_tester.text_seq_length), dtype=tf.int32
+ )
+
+ return inputs_dict
+
+ def setUp(self):
+ self.model_tester = TFLayoutLMv3ModelTester(self)
+ self.config_tester = ConfigTester(self, config_class=LayoutLMv3Config, hidden_size=37)
+
+ def test_config(self):
+ self.config_tester.run_common_tests()
+
+ def test_loss_computation(self):
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+ for model_class in self.all_model_classes:
+ model = model_class(config)
+ if getattr(model, "hf_compute_loss", None):
+ # The number of elements in the loss should be the same as the number of elements in the label
+ prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
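+                # reverse-sort the added label keys so "start_positions" is picked over "end_positions"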
+ added_label = prepared_for_class[
+ sorted(list(prepared_for_class.keys() - inputs_dict.keys()), reverse=True)[0]
+ ]
+ expected_loss_size = added_label.shape.as_list()[:1]
+
+                # Test that the model correctly computes the loss with kwargs
+ prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
+ input_ids = prepared_for_class.pop("input_ids")
+
+ loss = model(input_ids, **prepared_for_class)[0]
+ self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
+
+                # Test that the model correctly computes the loss when we mask some positions
+ prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
+ input_ids = prepared_for_class.pop("input_ids")
+ if "labels" in prepared_for_class:
+ labels = prepared_for_class["labels"].numpy()
+ if len(labels.shape) > 1 and labels.shape[1] != 1:
+ labels[0] = -100
+ prepared_for_class["labels"] = tf.convert_to_tensor(labels)
+ loss = model(input_ids, **prepared_for_class)[0]
+ self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
+ self.assertTrue(not np.any(np.isnan(loss.numpy())))
+
+                # Test that the model correctly computes the loss with a dict
+ prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
+ loss = model(prepared_for_class)[0]
+ self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
+
+                # Test that the model correctly computes the loss with a tuple
+ prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
+
+ # Get keys that were added with the _prepare_for_class function
+ label_keys = prepared_for_class.keys() - inputs_dict.keys()
+ signature = inspect.signature(model.call).parameters
+ signature_names = list(signature.keys())
+
+ # Create a dictionary holding the location of the tensors in the tuple
+ tuple_index_mapping = {0: "input_ids"}
+ for label_key in label_keys:
+ label_key_index = signature_names.index(label_key)
+ tuple_index_mapping[label_key_index] = label_key
+ sorted_tuple_index_mapping = sorted(tuple_index_mapping.items())
+ # Initialize a list with their default values, update the values and convert to a tuple
+ list_input = []
+
+ for name in signature_names:
+ if name != "kwargs":
+ list_input.append(signature[name].default)
+
+ for index, value in sorted_tuple_index_mapping:
+ list_input[index] = prepared_for_class[value]
+
+ tuple_input = tuple(list_input)
+
+ # Send to model
+ loss = model(tuple_input[:-1])[0]
+
+ self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
+
+ def test_model(self):
+ (
+ config,
+ input_ids,
+ bbox,
+ pixel_values,
+ token_type_ids,
+ input_mask,
+ _,
+ _,
+ ) = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_model(config, input_ids, bbox, pixel_values, token_type_ids, input_mask)
+
+ def test_model_various_embeddings(self):
+ (
+ config,
+ input_ids,
+ bbox,
+ pixel_values,
+ token_type_ids,
+ input_mask,
+ _,
+ _,
+ ) = self.model_tester.prepare_config_and_inputs()
+        for embedding_type in ["absolute", "relative_key", "relative_key_query"]:
+            config.position_embedding_type = embedding_type
+ self.model_tester.create_and_check_model(config, input_ids, bbox, pixel_values, token_type_ids, input_mask)
+
+ def test_for_sequence_classification(self):
+ (
+ config,
+ input_ids,
+ bbox,
+ pixel_values,
+ token_type_ids,
+ input_mask,
+ sequence_labels,
+ _,
+ ) = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_for_sequence_classification(
+ config, input_ids, bbox, pixel_values, token_type_ids, input_mask, sequence_labels
+ )
+
+ def test_for_token_classification(self):
+ (
+ config,
+ input_ids,
+ bbox,
+ pixel_values,
+ token_type_ids,
+ input_mask,
+ _,
+ token_labels,
+ ) = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_for_token_classification(
+ config, input_ids, bbox, pixel_values, token_type_ids, input_mask, token_labels
+ )
+
+ def test_for_question_answering(self):
+ (
+ config,
+ input_ids,
+ bbox,
+ pixel_values,
+ token_type_ids,
+ input_mask,
+ sequence_labels,
+ _,
+ ) = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_for_question_answering(
+ config, input_ids, bbox, pixel_values, token_type_ids, input_mask, sequence_labels
+ )
+
+ @slow
+ def test_model_from_pretrained(self):
+ for model_name in TF_LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
+ model = TFLayoutLMv3Model.from_pretrained(model_name)
+ self.assertIsNotNone(model)
+
+
+# We will verify our results on an image of cute cats
+def prepare_img():
+ image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+ return image
+
+
+@require_tf
+class TFLayoutLMv3ModelIntegrationTest(unittest.TestCase):
+ @cached_property
+ def default_feature_extractor(self):
+ return LayoutLMv3FeatureExtractor(apply_ocr=False) if is_vision_available() else None
+
+ @slow
+ def test_inference_no_head(self):
+ model = TFLayoutLMv3Model.from_pretrained("microsoft/layoutlmv3-base")
+
+ feature_extractor = self.default_feature_extractor
+ image = prepare_img()
+ pixel_values = feature_extractor(images=image, return_tensors="tf").pixel_values
+
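+        # minimal dummy text input: two token ids with matching (x0, y0, x1, y1) bounding boxes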
+ input_ids = tf.constant([[1, 2]])
+ bbox = tf.expand_dims(tf.constant([[1, 2, 3, 4], [5, 6, 7, 8]]), axis=0)
+
+ # forward pass
+ outputs = model(input_ids=input_ids, bbox=bbox, pixel_values=pixel_values, training=False)
+
+ # verify the logits
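+        # sequence length 199 = 2 text tokens + 14 * 14 image patches + 1 CLS token (224 image size / 16 patch size = 14)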
+ expected_shape = (1, 199, 768)
+ self.assertEqual(outputs.last_hidden_state.shape, expected_shape)
+
+ expected_slice = tf.constant(
+ [[-0.0529, 0.3618, 0.1632], [-0.1587, -0.1667, -0.0400], [-0.1557, -0.1671, -0.0505]]
+ )
+
+ self.assertTrue(np.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4))
diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt
index 0edda8ae5a4c..b03dcf511731 100644
--- a/utils/documentation_tests.txt
+++ b/utils/documentation_tests.txt
@@ -38,6 +38,7 @@ src/transformers/models/gptj/modeling_gptj.py
src/transformers/models/hubert/modeling_hubert.py
src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
src/transformers/models/layoutlmv3/modeling_layoutlmv3.py
+src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py
src/transformers/models/longformer/modeling_longformer.py
src/transformers/models/longformer/modeling_tf_longformer.py
src/transformers/models/longt5/modeling_longt5.py