diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 17c04376780a..ed04cad3dd9b 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -38,7 +38,7 @@ The documentation is organized in five parts: - **GET STARTED** contains a quick tour and installation instructions to get up and running with 🤗 Transformers. - **TUTORIALS** are a great place to begin if you are new to our library. This section will help you gain the basic skills you need to start using 🤗 Transformers. - **HOW-TO GUIDES** will show you how to achieve a specific goal like fine-tuning a pretrained model for language modeling or how to create a custom model head. -- **CONCEPTUAL GUIDES** provides more discussion and explanation of the underlying concepts and ideas behind models, tasks, and the design philosophy of 🤗 Transformers. +- **CONCEPTUAL GUIDES** provides more discussion and explanation of the underlying concepts and ideas behind models, tasks, and the design philosophy of 🤗 Transformers. - **API** describes each class and function, grouped in: - **MAIN CLASSES** for the main classes exposing the important APIs of the library. @@ -245,7 +245,7 @@ Flax), PyTorch, and/or TensorFlow. | ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ | | LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ | | LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ | -| LayoutLMv3 | ✅ | ✅ | ✅ | ❌ | ❌ | +| LayoutLMv3 | ✅ | ✅ | ✅ | ✅ | ❌ | | LED | ✅ | ✅ | ✅ | ✅ | ❌ | | LeViT | ❌ | ❌ | ✅ | ❌ | ❌ | | Longformer | ✅ | ✅ | ✅ | ✅ | ❌ | diff --git a/docs/source/en/model_doc/layoutlmv3.mdx b/docs/source/en/model_doc/layoutlmv3.mdx index 8f115cf96ea5..37fb0dc30446 100644 --- a/docs/source/en/model_doc/layoutlmv3.mdx +++ b/docs/source/en/model_doc/layoutlmv3.mdx @@ -26,18 +26,18 @@ Tips: - In terms of data processing, LayoutLMv3 is identical to its predecessor [LayoutLMv2](layoutlmv2), except that: - images need to be resized and normalized with channels in regular RGB format. LayoutLMv2 on the other hand normalizes the images internally and expects the channels in BGR format. - - text is tokenized using byte-pair encoding (BPE), as opposed to WordPiece. + - text is tokenized using byte-pair encoding (BPE), as opposed to WordPiece. Due to these differences in data preprocessing, one can use [`LayoutLMv3Processor`] which internally combines a [`LayoutLMv3FeatureExtractor`] (for the image modality) and a [`LayoutLMv3Tokenizer`]/[`LayoutLMv3TokenizerFast`] (for the text modality) to prepare all data for the model. -- Regarding usage of [`LayoutLMv3Processor`], we refer to the [usage guide](layoutlmv2#usage-layoutlmv2processor) of its predecessor. +- Regarding usage of [`LayoutLMv3Processor`], we refer to the [usage guide](layoutlmv2#usage-layoutlmv2processor) of its predecessor. - Demo notebooks for LayoutLMv3 can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/LayoutLMv3). - Demo scripts can be found [here](https://github.com/huggingface/transformers/tree/main/examples/research_projects/layoutlmv3). +alt="drawing" width="600"/> LayoutLMv3 architecture. Taken from the original paper. -This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/microsoft/unilm/tree/master/layoutlmv3). +This model was contributed by [nielsr](https://huggingface.co/nielsr). The TensorFlow version of this model was added by [chriskoo](https://huggingface.co/chriskoo), [tokec](https://huggingface.co/tokec), and [lre](https://huggingface.co/lre). 
The original code can be found [here](https://github.com/microsoft/unilm/tree/master/layoutlmv3). ## LayoutLMv3Config @@ -84,3 +84,23 @@ This model was contributed by [nielsr](https://huggingface.co/nielsr). The origi [[autodoc]] LayoutLMv3ForQuestionAnswering - forward + +## TFLayoutLMv3Model + +[[autodoc]] TFLayoutLMv3Model + - call + +## TFLayoutLMv3ForSequenceClassification + +[[autodoc]] TFLayoutLMv3ForSequenceClassification + - call + +## TFLayoutLMv3ForTokenClassification + +[[autodoc]] TFLayoutLMv3ForTokenClassification + - call + +## TFLayoutLMv3ForQuestionAnswering + +[[autodoc]] TFLayoutLMv3ForQuestionAnswering + - call diff --git a/docs/source/it/index.mdx b/docs/source/it/index.mdx index d5e10b7c4983..3ee8da15ed2d 100644 --- a/docs/source/it/index.mdx +++ b/docs/source/it/index.mdx @@ -221,7 +221,7 @@ tokenizer (chiamato "slow"). Un tokenizer "fast" supportato dalla libreria 🤗 | ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ | | LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ | | LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ | -| LayoutLMv3 | ✅ | ✅ | ✅ | ❌ | ❌ | +| LayoutLMv3 | ✅ | ✅ | ✅ | ✅ | ❌ | | LED | ✅ | ✅ | ✅ | ✅ | ❌ | | Longformer | ✅ | ✅ | ✅ | ✅ | ❌ | | LUKE | ✅ | ❌ | ✅ | ❌ | ❌ | @@ -288,4 +288,4 @@ tokenizer (chiamato "slow"). Un tokenizer "fast" supportato dalla libreria 🤗 | YOLOS | ❌ | ❌ | ✅ | ❌ | ❌ | | YOSO | ❌ | ❌ | ✅ | ❌ | ❌ | - \ No newline at end of file + diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 3281d266a2f3..bb64fe9295da 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -2343,6 +2343,16 @@ "TFLayoutLMPreTrainedModel", ] ) + _import_structure["models.layoutlmv3"].extend( + [ + "TF_LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFLayoutLMv3ForQuestionAnswering", + "TFLayoutLMv3ForSequenceClassification", + "TFLayoutLMv3ForTokenClassification", + "TFLayoutLMv3Model", + "TFLayoutLMv3PreTrainedModel", + ] + ) _import_structure["models.led"].extend(["TFLEDForConditionalGeneration", "TFLEDModel", "TFLEDPreTrainedModel"]) _import_structure["models.longformer"].extend( [ @@ -4801,6 +4811,14 @@ TFHubertModel, TFHubertPreTrainedModel, ) + from .models.layoutlmv3 import ( + TF_LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST, + TFLayoutLMv3ForQuestionAnswering, + TFLayoutLMv3ForSequenceClassification, + TFLayoutLMv3ForTokenClassification, + TFLayoutLMv3Model, + TFLayoutLMv3PreTrainedModel, + ) from .models.led import TFLEDForConditionalGeneration, TFLEDModel, TFLEDPreTrainedModel from .models.longformer import ( TF_LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py index 359e1f05c47b..991bb79a6b37 100644 --- a/src/transformers/models/auto/modeling_tf_auto.py +++ b/src/transformers/models/auto/modeling_tf_auto.py @@ -52,6 +52,7 @@ ("gptj", "TFGPTJModel"), ("hubert", "TFHubertModel"), ("layoutlm", "TFLayoutLMModel"), + ("layoutlmv3", "TFLayoutLMv3Model"), ("led", "TFLEDModel"), ("longformer", "TFLongformerModel"), ("lxmert", "TFLxmertModel"), @@ -268,6 +269,7 @@ ("gpt2", "TFGPT2ForSequenceClassification"), ("gptj", "TFGPTJForSequenceClassification"), ("layoutlm", "TFLayoutLMForSequenceClassification"), + ("layoutlmv3", "TFLayoutLMv3ForSequenceClassification"), ("longformer", "TFLongformerForSequenceClassification"), ("mobilebert", "TFMobileBertForSequenceClassification"), ("mpnet", "TFMPNetForSequenceClassification"), @@ -297,6 +299,7 @@ ("flaubert", "TFFlaubertForQuestionAnsweringSimple"), ("funnel", "TFFunnelForQuestionAnswering"), ("gptj", 
"TFGPTJForQuestionAnswering"), + ("layoutlmv3", "TFLayoutLMv3ForQuestionAnswering"), ("longformer", "TFLongformerForQuestionAnswering"), ("mobilebert", "TFMobileBertForQuestionAnswering"), ("mpnet", "TFMPNetForQuestionAnswering"), @@ -316,7 +319,6 @@ ] ) - TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Token Classification mapping @@ -331,6 +333,7 @@ ("flaubert", "TFFlaubertForTokenClassification"), ("funnel", "TFFunnelForTokenClassification"), ("layoutlm", "TFLayoutLMForTokenClassification"), + ("layoutlmv3", "TFLayoutLMv3ForTokenClassification"), ("longformer", "TFLongformerForTokenClassification"), ("mobilebert", "TFMobileBertForTokenClassification"), ("mpnet", "TFMPNetForTokenClassification"), @@ -373,7 +376,6 @@ ] ) - TF_MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_MAPPING_NAMES) TF_MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_FOR_PRETRAINING_MAPPING_NAMES) TF_MODEL_WITH_LM_HEAD_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_WITH_LM_HEAD_MAPPING_NAMES) diff --git a/src/transformers/models/layoutlmv3/__init__.py b/src/transformers/models/layoutlmv3/__init__.py index cfa26057e87b..68a07362dc41 100644 --- a/src/transformers/models/layoutlmv3/__init__.py +++ b/src/transformers/models/layoutlmv3/__init__.py @@ -21,6 +21,7 @@ from ...utils import ( OptionalDependencyNotAvailable, _LazyModule, + is_tf_available, is_tokenizers_available, is_torch_available, is_vision_available, @@ -60,6 +61,21 @@ "LayoutLMv3PreTrainedModel", ] +try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_tf_layoutlmv3"] = [ + "TF_LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFLayoutLMv3ForQuestionAnswering", + "TFLayoutLMv3ForSequenceClassification", + "TFLayoutLMv3ForTokenClassification", + "TFLayoutLMv3Model", + "TFLayoutLMv3PreTrainedModel", + ] + try: if not is_vision_available(): raise OptionalDependencyNotAvailable() @@ -101,6 +117,21 @@ LayoutLMv3PreTrainedModel, ) + try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_tf_layoutlmv3 import ( + TF_LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST, + TFLayoutLMv3ForQuestionAnswering, + TFLayoutLMv3ForSequenceClassification, + TFLayoutLMv3ForTokenClassification, + TFLayoutLMv3Model, + TFLayoutLMv3PreTrainedModel, + ) + try: if not is_vision_available(): raise OptionalDependencyNotAvailable() diff --git a/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py b/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py new file mode 100644 index 000000000000..85a44e4ff52a --- /dev/null +++ b/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py @@ -0,0 +1,1610 @@ +# coding=utf-8 +# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""TF 2.0 LayoutLMv3 model.""" + +import collections +import math +from typing import Dict, List, Optional, Tuple, Union + +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import ( + TFBaseModelOutput, + TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + TFTokenClassifierOutput, +) +from ...modeling_tf_utils import ( + TFPreTrainedModel, + TFQuestionAnsweringLoss, + TFSequenceClassificationLoss, + TFTokenClassificationLoss, + get_initializer, + keras_serializable, + unpack_inputs, +) +from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings +from .configuration_layoutlmv3 import LayoutLMv3Config + + +_CONFIG_FOR_DOC = "LayoutLMv3Config" + +_DUMMY_INPUT_IDS = [ + [7, 6, 1], + [1, 2, 0], +] + +_DUMMY_BBOX = [ + [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], + [[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]], +] + +TF_LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "microsoft/layoutlmv3-base", + "microsoft/layoutlmv3-large", + # See all LayoutLMv3 models at https://huggingface.co/models?filter=layoutlmv3 +] + +LARGE_NEGATIVE = -1e8 + + +class TFLayoutLMv3PatchEmbeddings(tf.keras.layers.Layer): + """LayoutLMv3 image (patch) embeddings.""" + + def __init__(self, config: LayoutLMv3Config, **kwargs): + super().__init__(**kwargs) + patch_sizes = ( + config.patch_size + if isinstance(config.patch_size, collections.abc.Iterable) + else (config.patch_size, config.patch_size) + ) + self.proj = tf.keras.layers.Conv2D( + filters=config.hidden_size, + kernel_size=patch_sizes, + strides=patch_sizes, + padding="valid", + data_format="channels_last", + use_bias=True, + kernel_initializer=get_initializer(config.initializer_range), + name="proj", + ) + self.hidden_size = config.hidden_size + self.num_patches = (config.input_size**2) // (patch_sizes[0] * patch_sizes[1]) + + def call(self, pixel_values: tf.Tensor) -> tf.Tensor: + # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. + # So change the input format from `NCHW` to `NHWC`. + pixel_values = tf.transpose(pixel_values, perm=[0, 2, 3, 1]) + + embeddings = self.proj(pixel_values) + embeddings = tf.reshape(embeddings, (-1, self.num_patches, self.hidden_size)) + return embeddings + + +class TFLayoutLMv3TextEmbeddings(tf.keras.layers.Layer): + """ + LayoutLMv3 text embeddings. Same as `RobertaEmbeddings` but with added spatial (layout) embeddings. 
+ """ + + def __init__(self, config: LayoutLMv3Config, **kwargs): + super().__init__(**kwargs) + self.word_embeddings = tf.keras.layers.Embedding( + config.vocab_size, + config.hidden_size, + embeddings_initializer=get_initializer(config.initializer_range), + name="word_embeddings", + ) + self.token_type_embeddings = tf.keras.layers.Embedding( + config.type_vocab_size, + config.hidden_size, + embeddings_initializer=get_initializer(config.initializer_range), + name="token_type_embeddings", + ) + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.padding_token_index = config.pad_token_id + self.position_embeddings = tf.keras.layers.Embedding( + config.max_position_embeddings, + config.hidden_size, + embeddings_initializer=get_initializer(config.initializer_range), + name="position_embeddings", + ) + self.x_position_embeddings = tf.keras.layers.Embedding( + config.max_2d_position_embeddings, + config.coordinate_size, + embeddings_initializer=get_initializer(config.initializer_range), + name="x_position_embeddings", + ) + self.y_position_embeddings = tf.keras.layers.Embedding( + config.max_2d_position_embeddings, + config.coordinate_size, + embeddings_initializer=get_initializer(config.initializer_range), + name="y_position_embeddings", + ) + self.h_position_embeddings = tf.keras.layers.Embedding( + config.max_2d_position_embeddings, + config.shape_size, + embeddings_initializer=get_initializer(config.initializer_range), + name="h_position_embeddings", + ) + self.w_position_embeddings = tf.keras.layers.Embedding( + config.max_2d_position_embeddings, + config.shape_size, + embeddings_initializer=get_initializer(config.initializer_range), + name="w_position_embeddings", + ) + self.max_2d_positions = config.max_2d_position_embeddings + + def calculate_spatial_position_embeddings(self, bbox: tf.Tensor) -> tf.Tensor: + try: + left_position_ids = bbox[:, :, 0] + upper_position_ids = bbox[:, :, 1] + right_position_ids = bbox[:, :, 2] + lower_position_ids = bbox[:, :, 3] + except IndexError as exception: + raise IndexError("Bounding box is not of shape (batch_size, seq_length, 4).") from exception + + try: + left_position_embeddings = self.x_position_embeddings(left_position_ids) + upper_position_embeddings = self.y_position_embeddings(upper_position_ids) + right_position_embeddings = self.x_position_embeddings(right_position_ids) + lower_position_embeddings = self.y_position_embeddings(lower_position_ids) + except IndexError as exception: + raise IndexError( + f"The `bbox` coordinate values should be within 0-{self.max_2d_positions} range." + ) from exception + + max_position_id = self.max_2d_positions - 1 + h_position_embeddings = self.h_position_embeddings( + tf.clip_by_value(bbox[:, :, 3] - bbox[:, :, 1], 0, max_position_id) + ) + w_position_embeddings = self.w_position_embeddings( + tf.clip_by_value(bbox[:, :, 2] - bbox[:, :, 0], 0, max_position_id) + ) + + # LayoutLMv1 sums the spatial embeddings, but LayoutLMv3 concatenates them. + spatial_position_embeddings = tf.concat( + [ + left_position_embeddings, + upper_position_embeddings, + right_position_embeddings, + lower_position_embeddings, + h_position_embeddings, + w_position_embeddings, + ], + axis=-1, + ) + return spatial_position_embeddings + + def create_position_ids_from_inputs_embeds(self, inputs_embds: tf.Tensor) -> tf.Tensor: + """ + We are provided embeddings directly. 
We cannot infer which are padded, so just generate sequential position + ids. + """ + input_shape = tf.shape(inputs_embds) + sequence_length = input_shape[1] + start_index = self.padding_token_index + 1 + end_index = self.padding_token_index + sequence_length + 1 + position_ids = tf.range(start_index, end_index, dtype=tf.int32) + batch_size = input_shape[0] + position_ids = tf.reshape(position_ids, (1, sequence_length)) + position_ids = tf.tile(position_ids, (batch_size, 1)) + return position_ids + + def create_position_ids_from_input_ids(self, input_ids: tf.Tensor) -> tf.Tensor: + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_token_index + 1. + """ + mask = tf.cast(tf.not_equal(input_ids, self.padding_token_index), input_ids.dtype) + position_ids = tf.cumsum(mask, axis=1) * mask + position_ids = position_ids + self.padding_token_index + return position_ids + + def create_position_ids(self, input_ids: tf.Tensor, inputs_embeds: tf.Tensor) -> tf.Tensor: + if input_ids is None: + return self.create_position_ids_from_inputs_embeds(inputs_embeds) + else: + return self.create_position_ids_from_input_ids(input_ids) + + def call( + self, + input_ids: Optional[tf.Tensor] = None, + bbox: tf.Tensor = None, + token_type_ids: Optional[tf.Tensor] = None, + position_ids: Optional[tf.Tensor] = None, + inputs_embeds: Optional[tf.Tensor] = None, + training: bool = False, + ) -> tf.Tensor: + if position_ids is None: + position_ids = self.create_position_ids(input_ids, inputs_embeds) + + if input_ids is not None: + input_shape = tf.shape(input_ids) + else: + input_shape = tf.shape(inputs_embeds)[:-1] + + if token_type_ids is None: + token_type_ids = tf.zeros(input_shape, dtype=position_ids.dtype) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + + spatial_position_embeddings = self.calculate_spatial_position_embeddings(bbox) + + embeddings += spatial_position_embeddings + + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings, training=training) + return embeddings + + +class TFLayoutLMv3SelfAttention(tf.keras.layers.Layer): + def __init__(self, config: LayoutLMv3Config, **kwargs): + super().__init__(**kwargs) + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.attention_score_normaliser = math.sqrt(self.attention_head_size) + + self.query = tf.keras.layers.Dense( + self.all_head_size, + kernel_initializer=get_initializer(config.initializer_range), + name="query", + ) + self.key = tf.keras.layers.Dense( + self.all_head_size, + kernel_initializer=get_initializer(config.initializer_range), + name="key", + ) + self.value = tf.keras.layers.Dense( + self.all_head_size, + kernel_initializer=get_initializer(config.initializer_range), + name="value", + ) + + self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + self.has_relative_attention_bias = config.has_relative_attention_bias + 
self.has_spatial_attention_bias = config.has_spatial_attention_bias + + def transpose_for_scores(self, x: tf.Tensor): + shape = tf.shape(x) + new_shape = ( + shape[0], # batch_size + shape[1], # seq_length + self.num_attention_heads, + self.attention_head_size, + ) + x = tf.reshape(x, new_shape) + return tf.transpose(x, perm=[0, 2, 1, 3]) # batch_size, num_heads, seq_length, attention_head_size + + def cogview_attention(self, attention_scores: tf.Tensor, alpha: Union[float, int] = 32): + """ + https://arxiv.org/abs/2105.13290 Section 2.4 Stabilization of training: Precision Bottleneck Relaxation + (PB-Relax). A replacement of the original tf.keras.layers.Softmax(axis=-1)(attention_scores). Seems the new + attention_probs will result in a slower speed and a little bias. Can use + tf.debugging.assert_near(standard_attention_probs, cogview_attention_probs, atol=1e-08) for comparison. The + smaller atol (e.g., 1e-08), the better. + """ + scaled_attention_scores = attention_scores / alpha + max_value = tf.expand_dims(tf.reduce_max(scaled_attention_scores, axis=-1), axis=-1) + new_attention_scores = (scaled_attention_scores - max_value) * alpha + return tf.math.softmax(new_attention_scores, axis=-1) + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: Optional[tf.Tensor], + head_mask: Optional[tf.Tensor], + output_attentions: bool, + rel_pos: Optional[tf.Tensor] = None, + rel_2d_pos: Optional[tf.Tensor] = None, + training: bool = False, + ) -> Union[Tuple[tf.Tensor], Tuple[tf.Tensor, tf.Tensor]]: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(self.query(hidden_states)) + + # Take the dot product between "query" and "key" to get the raw attention scores. + normalised_query_layer = query_layer / self.attention_score_normaliser + transposed_key_layer = tf.transpose( + key_layer, perm=[0, 1, 3, 2] + ) # batch_size, num_heads, attention_head_size, seq_length + attention_scores = tf.matmul(normalised_query_layer, transposed_key_layer) + + if self.has_relative_attention_bias and self.has_spatial_attention_bias: + attention_scores += (rel_pos + rel_2d_pos) / self.attention_score_normaliser + elif self.has_relative_attention_bias: + attention_scores += rel_pos / self.attention_score_normaliser + + if attention_mask is not None: + # Apply the attention mask (is precomputed for all layers in TFLayoutLMv3Model call() function) + attention_scores += attention_mask + + # Normalize the attention scores to probabilities. + # Use the trick of CogView paper to stabilize training. + attention_probs = self.cogview_attention(attention_scores) + + attention_probs = self.dropout(attention_probs, training=training) + + # Mask heads if we want to. 
+ if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = tf.matmul(attention_probs, value_layer) + context_layer = tf.transpose( + context_layer, perm=[0, 2, 1, 3] + ) # batch_size, seq_length, num_heads, attention_head_size + shape = tf.shape(context_layer) + context_layer = tf.reshape( + context_layer, (shape[0], shape[1], self.all_head_size) + ) # batch_size, seq_length, num_heads * attention_head_size + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from models.roberta.modeling_tf_roberta.TFRobertaSelfOutput +class TFLayoutLMv3SelfOutput(tf.keras.layers.Layer): + def __init__(self, config: LayoutLMv3Config, **kwargs): + super().__init__(**kwargs) + + self.dense = tf.keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) + + return hidden_states + + +class TFLayoutLMv3Attention(tf.keras.layers.Layer): + def __init__(self, config: LayoutLMv3Config, **kwargs): + super().__init__(**kwargs) + self.self_attention = TFLayoutLMv3SelfAttention(config, name="self") + self.self_output = TFLayoutLMv3SelfOutput(config, name="output") + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: Optional[tf.Tensor], + head_mask: Optional[tf.Tensor], + output_attentions: bool, + rel_pos: Optional[tf.Tensor] = None, + rel_2d_pos: Optional[tf.Tensor] = None, + training: bool = False, + ) -> Union[Tuple[tf.Tensor], Tuple[tf.Tensor, tf.Tensor]]: + self_outputs = self.self_attention( + hidden_states, + attention_mask, + head_mask, + output_attentions, + rel_pos, + rel_2d_pos, + training=training, + ) + attention_output = self.self_output(self_outputs[0], hidden_states, training=training) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from models.roberta.modeling_tf_roberta.TFRobertaIntermediate +class TFLayoutLMv3Intermediate(tf.keras.layers.Layer): + def __init__(self, config: LayoutLMv3Config, **kwargs): + super().__init__(**kwargs) + + self.dense = tf.keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = get_tf_activation(config.hidden_act) + else: + self.intermediate_act_fn = config.hidden_act + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + +# Copied from models.roberta.modeling_tf_roberta.TFRobertaOutput +class TFLayoutLMv3Output(tf.keras.layers.Layer): + def __init__(self, config: LayoutLMv3Config, **kwargs): + super().__init__(**kwargs) + + self.dense = tf.keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") +
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) + + return hidden_states + + +class TFLayoutLMv3Layer(tf.keras.layers.Layer): + def __init__(self, config: LayoutLMv3Config, **kwargs): + super().__init__(**kwargs) + self.attention = TFLayoutLMv3Attention(config, name="attention") + self.intermediate = TFLayoutLMv3Intermediate(config, name="intermediate") + self.bert_output = TFLayoutLMv3Output(config, name="output") + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: Optional[tf.Tensor], + head_mask: Optional[tf.Tensor], + output_attentions: bool, + rel_pos: Optional[tf.Tensor] = None, + rel_2d_pos: Optional[tf.Tensor] = None, + training: bool = False, + ) -> Union[Tuple[tf.Tensor], Tuple[tf.Tensor, tf.Tensor]]: + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + rel_pos=rel_pos, + rel_2d_pos=rel_2d_pos, + training=training, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + intermediate_output = self.intermediate(attention_output) + layer_output = self.bert_output(intermediate_output, attention_output, training=training) + outputs = (layer_output,) + outputs + return outputs + + +class TFLayoutLMv3Encoder(tf.keras.layers.Layer): + def __init__(self, config: LayoutLMv3Config, **kwargs): + super().__init__(**kwargs) + self.config = config + self.layer = [TFLayoutLMv3Layer(config, name=f"layer.{i}") for i in range(config.num_hidden_layers)] + + self.has_relative_attention_bias = config.has_relative_attention_bias + self.has_spatial_attention_bias = config.has_spatial_attention_bias + + if self.has_relative_attention_bias: + self.rel_pos_bins = config.rel_pos_bins + self.max_rel_pos = config.max_rel_pos + self.rel_pos_bias = tf.keras.layers.Dense( + units=config.num_attention_heads, + kernel_initializer=get_initializer(config.initializer_range), + use_bias=False, + name="rel_pos_bias", + ) + + if self.has_spatial_attention_bias: + self.max_rel_2d_pos = config.max_rel_2d_pos + self.rel_2d_pos_bins = config.rel_2d_pos_bins + self.rel_pos_x_bias = tf.keras.layers.Dense( + units=config.num_attention_heads, + kernel_initializer=get_initializer(config.initializer_range), + use_bias=False, + name="rel_pos_x_bias", + ) + self.rel_pos_y_bias = tf.keras.layers.Dense( + units=config.num_attention_heads, + kernel_initializer=get_initializer(config.initializer_range), + use_bias=False, + name="rel_pos_y_bias", + ) + + def relative_position_bucket(self, relative_positions: tf.Tensor, num_buckets: int, max_distance: int): + # the negative relative positions are assigned to the interval [0, num_buckets / 2] + # we deal with this by assigning absolute relative positions to the interval [0, num_buckets / 2] + # and then offsetting the positive relative positions by num_buckets / 2 at the end + num_buckets = num_buckets // 2 + buckets = tf.abs(relative_positions) + + # half of the buckets are for exact increments in positions + max_exact_buckets = num_buckets // 2 + is_small = buckets < max_exact_buckets + + # the other half of the buckets are for logarithmically bigger bins in positions up to max_distance 
+ buckets_log_ratio = tf.math.log(tf.cast(buckets, tf.float32) / max_exact_buckets) + distance_log_ratio = math.log(max_distance / max_exact_buckets) + buckets_big_offset = ( + buckets_log_ratio / distance_log_ratio * (num_buckets - max_exact_buckets) + ) # scale is [0, num_buckets - max_exact_buckets] + buckets_big = max_exact_buckets + buckets_big_offset # scale is [max_exact_buckets, num_buckets] + buckets_big = tf.cast(buckets_big, buckets.dtype) + buckets_big = tf.minimum(buckets_big, num_buckets - 1) + + return (tf.cast(relative_positions > 0, buckets.dtype) * num_buckets) + tf.where( + is_small, buckets, buckets_big + ) + + def _cal_pos_emb( + self, + dense_layer: tf.keras.layers.Dense, + position_ids: tf.Tensor, + num_buckets: int, + max_distance: int, + ): + rel_pos_matrix = tf.expand_dims(position_ids, axis=-2) - tf.expand_dims(position_ids, axis=-1) + rel_pos = self.relative_position_bucket(rel_pos_matrix, num_buckets, max_distance) + rel_pos_one_hot = tf.one_hot(rel_pos, depth=num_buckets, dtype=self.compute_dtype) + embedding = dense_layer(rel_pos_one_hot) + # batch_size, seq_length, seq_length, num_heads --> batch_size, num_heads, seq_length, seq_length + embedding = tf.transpose(embedding, [0, 3, 1, 2]) + embedding = tf.cast(embedding, dtype=self.compute_dtype) + return embedding + + def _cal_1d_pos_emb(self, position_ids: tf.Tensor): + return self._cal_pos_emb(self.rel_pos_bias, position_ids, self.rel_pos_bins, self.max_rel_pos) + + def _cal_2d_pos_emb(self, bbox: tf.Tensor): + position_coord_x = bbox[:, :, 0] # left + position_coord_y = bbox[:, :, 3] # bottom + rel_pos_x = self._cal_pos_emb( + self.rel_pos_x_bias, + position_coord_x, + self.rel_2d_pos_bins, + self.max_rel_2d_pos, + ) + rel_pos_y = self._cal_pos_emb( + self.rel_pos_y_bias, + position_coord_y, + self.rel_2d_pos_bins, + self.max_rel_2d_pos, + ) + rel_2d_pos = rel_pos_x + rel_pos_y + return rel_2d_pos + + def call( + self, + hidden_states: tf.Tensor, + bbox: Optional[tf.Tensor] = None, + attention_mask: Optional[tf.Tensor] = None, + head_mask: Optional[tf.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + position_ids: Optional[tf.Tensor] = None, + training: bool = False, + ) -> Union[ + TFBaseModelOutput, + Tuple[tf.Tensor], + Tuple[tf.Tensor, tf.Tensor], + Tuple[tf.Tensor, tf.Tensor, tf.Tensor], + ]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + rel_pos = self._cal_1d_pos_emb(position_ids) if self.has_relative_attention_bias else None + rel_2d_pos = self._cal_2d_pos_emb(bbox) if self.has_spatial_attention_bias else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + output_attentions, + rel_pos=rel_pos, + rel_2d_pos=rel_2d_pos, + training=training, + ) + + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if return_dict: + return TFBaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + else: + return tuple( + value for value in [hidden_states, all_hidden_states, all_self_attentions] if value 
is not None + ) + + +@keras_serializable +class TFLayoutLMv3MainLayer(tf.keras.layers.Layer): + config_class = LayoutLMv3Config + + def __init__(self, config: LayoutLMv3Config, **kwargs): + super().__init__(**kwargs) + + self.config = config + + if config.text_embed: + self.embeddings = TFLayoutLMv3TextEmbeddings(config, name="embeddings") + + if config.visual_embed: + self.patch_embed = TFLayoutLMv3PatchEmbeddings(config, name="patch_embed") + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") + + if config.has_relative_attention_bias or config.has_spatial_attention_bias: + image_size = config.input_size // config.patch_size + self.init_visual_bbox(image_size=(image_size, image_size)) + + self.norm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="norm") + + self.encoder = TFLayoutLMv3Encoder(config, name="encoder") + + def build(self, input_shape: tf.TensorShape): + if self.config.visual_embed: + image_size = self.config.input_size // self.config.patch_size + self.cls_token = self.add_weight( + shape=(1, 1, self.config.hidden_size), + initializer="zeros", + trainable=True, + dtype=tf.float32, + name="cls_token", + ) + self.pos_embed = self.add_weight( + shape=(1, image_size * image_size + 1, self.config.hidden_size), + initializer="zeros", + trainable=True, + dtype=tf.float32, + name="pos_embed", + ) + + super().build(input_shape) + + def get_input_embeddings(self) -> tf.keras.layers.Layer: + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value: tf.Variable): + self.embeddings.word_embeddings.weight = value + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + raise NotImplementedError + + def init_visual_bbox(self, image_size: Tuple[int, int], max_len: int = 1000): + # We should not hardcode max_len to 1000, but it is done by the reference implementation, + # so we keep it for compatibility with the pretrained weights. The more correct approach + # would have been to pass on max_len=config.max_2d_position_embeddings - 1. 
+ height, width = image_size + + visual_bbox_x = tf.range(0, max_len * (width + 1), max_len) // width + visual_bbox_x = tf.expand_dims(visual_bbox_x, axis=0) + visual_bbox_x = tf.tile(visual_bbox_x, [width, 1]) # (width, width + 1) + + visual_bbox_y = tf.range(0, max_len * (height + 1), max_len) // height + visual_bbox_y = tf.expand_dims(visual_bbox_y, axis=1) + visual_bbox_y = tf.tile(visual_bbox_y, [1, height]) # (height + 1, height) + + visual_bbox = tf.stack( + [visual_bbox_x[:, :-1], visual_bbox_y[:-1], visual_bbox_x[:, 1:], visual_bbox_y[1:]], + axis=-1, + ) + visual_bbox = tf.reshape(visual_bbox, [-1, 4]) + + cls_token_box = tf.constant([[1, 1, max_len - 1, max_len - 1]], dtype=tf.int32) + self.visual_bbox = tf.concat([cls_token_box, visual_bbox], axis=0) + + def calculate_visual_bbox(self, batch_size: int, dtype: tf.DType): + visual_bbox = tf.expand_dims(self.visual_bbox, axis=0) + visual_bbox = tf.tile(visual_bbox, [batch_size, 1, 1]) + visual_bbox = tf.cast(visual_bbox, dtype=dtype) + return visual_bbox + + def embed_image(self, pixel_values: tf.Tensor) -> tf.Tensor: + embeddings = self.patch_embed(pixel_values) + + # add [CLS] token + batch_size = tf.shape(embeddings)[0] + cls_tokens = tf.tile(self.cls_token, [batch_size, 1, 1]) + embeddings = tf.concat([cls_tokens, embeddings], axis=1) + + # add position embeddings + if getattr(self, "pos_embed", None) is not None: + embeddings += self.pos_embed + + embeddings = self.norm(embeddings) + return embeddings + + def get_extended_attention_mask(self, attention_mask: tf.Tensor) -> tf.Tensor: + # Adapted from transformers.modelling_utils.ModuleUtilsMixin.get_extended_attention_mask + + n_dims = len(attention_mask.shape) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if n_dims == 3: + extended_attention_mask = tf.expand_dims(attention_mask, axis=1) + elif n_dims == 2: + # Provided a padding mask of dimensions [batch_size, seq_length]. + # Make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]. + extended_attention_mask = tf.expand_dims(attention_mask, axis=1) # (batch_size, 1, seq_length) + extended_attention_mask = tf.expand_dims(extended_attention_mask, axis=1) # (batch_size, 1, 1, seq_length) + else: + raise ValueError(f"Wrong shape for attention_mask (shape {attention_mask.shape}).") + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = tf.cast(extended_attention_mask, self.compute_dtype) + extended_attention_mask = (1.0 - extended_attention_mask) * LARGE_NEGATIVE + + return extended_attention_mask + + def get_head_mask(self, head_mask: Optional[tf.Tensor]) -> Union[tf.Tensor, List[Optional[tf.Tensor]]]: + if head_mask is None: + return [None] * self.config.num_hidden_layers + + n_dims = tf.rank(head_mask) + if n_dims == 1: + # Gets a tensor with masks for each head (H). 
+ head_mask = tf.expand_dims(head_mask, axis=0) # 1, num_heads + head_mask = tf.expand_dims(head_mask, axis=0) # 1, 1, num_heads + head_mask = tf.expand_dims(head_mask, axis=-1) # 1, 1, num_heads, 1 + head_mask = tf.expand_dims(head_mask, axis=-1) # 1, 1, num_heads, 1, 1 + head_mask = tf.tile( + head_mask, [self.config.num_hidden_layers, 1, 1, 1, 1] + ) # num_hidden_layers, 1, num_heads, 1, 1 + elif n_dims == 2: + # Gets a tensor with masks for each layer (L) and head (H). + head_mask = tf.expand_dims(head_mask, axis=1) # num_hidden_layers, 1, num_heads + head_mask = tf.expand_dims(head_mask, axis=-1) # num_hidden_layers, 1, num_heads, 1 + head_mask = tf.expand_dims(head_mask, axis=-1) # num_hidden_layers, 1, num_heads, 1, 1 + elif n_dims != 5: + raise ValueError(f"Wrong shape for head_mask (shape {head_mask.shape}).") + assert tf.rank(head_mask) == 5, f"Got head_mask rank of {tf.rank(head_mask)}, but require 5." + head_mask = tf.cast(head_mask, self.compute_dtype) + return head_mask + + @unpack_inputs + def call( + self, + input_ids: Optional[tf.Tensor] = None, + bbox: Optional[tf.Tensor] = None, + attention_mask: Optional[tf.Tensor] = None, + token_type_ids: Optional[tf.Tensor] = None, + position_ids: Optional[tf.Tensor] = None, + head_mask: Optional[tf.Tensor] = None, + inputs_embeds: Optional[tf.Tensor] = None, + pixel_values: Optional[tf.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[ + TFBaseModelOutput, + Tuple[tf.Tensor], + Tuple[tf.Tensor, tf.Tensor], + Tuple[tf.Tensor, tf.Tensor, tf.Tensor], + ]: + # This method can be called with a variety of modalities: + # 1. text + layout + # 2. text + layout + image + # 3. image + # The complexity of this method is mostly just due to handling of these different modalities. + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + if input_ids is not None: + input_shape = tf.shape(input_ids) + batch_size = input_shape[0] + seq_length = input_shape[1] + elif inputs_embeds is not None: + input_shape = tf.shape(inputs_embeds) + batch_size = input_shape[0] + seq_length = input_shape[1] + elif pixel_values is not None: + batch_size = tf.shape(pixel_values)[0] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds or pixel_values") + + # Determine which integer dtype to use.
+ if input_ids is not None: + int_dtype = input_ids.dtype + elif bbox is not None: + int_dtype = bbox.dtype + elif attention_mask is not None: + int_dtype = attention_mask.dtype + elif token_type_ids is not None: + int_dtype = token_type_ids.dtype + else: + int_dtype = tf.int32 + + if input_ids is not None or inputs_embeds is not None: + if attention_mask is None: + attention_mask = tf.ones((batch_size, seq_length), dtype=int_dtype) + if token_type_ids is None: + token_type_ids = tf.zeros((batch_size, seq_length), dtype=int_dtype) + if bbox is None: + bbox = tf.zeros((batch_size, seq_length, 4), dtype=int_dtype) + + embedding_output = self.embeddings( + input_ids=input_ids, + bbox=bbox, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + training=training, + ) + + final_bbox = None + final_position_ids = None + if pixel_values is not None: + # embed image + visual_embeddings = self.embed_image(pixel_values) + + # calculate attention mask + visual_attention_mask = tf.ones((batch_size, tf.shape(visual_embeddings)[1]), dtype=int_dtype) + if attention_mask is None: + attention_mask = visual_attention_mask + else: + attention_mask = tf.concat([attention_mask, visual_attention_mask], axis=1) + + # calculate bounding boxes + if self.config.has_spatial_attention_bias: + visual_bbox = self.calculate_visual_bbox(batch_size, int_dtype) + if bbox is None: + final_bbox = visual_bbox + else: + final_bbox = tf.concat([bbox, visual_bbox], axis=1) + + # calculate position IDs + if self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias: + visual_position_ids = tf.range(0, tf.shape(visual_embeddings)[1], dtype=int_dtype) + visual_position_ids = tf.expand_dims(visual_position_ids, axis=0) + visual_position_ids = tf.tile(visual_position_ids, [batch_size, 1]) + + if input_ids is not None or inputs_embeds is not None: + position_ids = tf.expand_dims(tf.range(0, seq_length, dtype=int_dtype), axis=0) + position_ids = tf.tile(position_ids, [batch_size, 1]) + final_position_ids = tf.concat([position_ids, visual_position_ids], axis=1) + else: + final_position_ids = visual_position_ids + + # calculate embeddings + if input_ids is None and inputs_embeds is None: + embedding_output = visual_embeddings + else: + embedding_output = tf.concat([embedding_output, visual_embeddings], axis=1) + embedding_output = self.LayerNorm(embedding_output) + embedding_output = self.dropout(embedding_output, training=training) + + elif self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias: + if self.config.has_relative_attention_bias: + position_ids = tf.expand_dims(tf.range(0, seq_length, dtype=int_dtype), axis=0) + position_ids = tf.tile(position_ids, [batch_size, 1]) + final_position_ids = position_ids + + if self.config.has_spatial_attention_bias: + final_bbox = bbox + + extended_attention_mask = self.get_extended_attention_mask(attention_mask) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape batch_size x num_heads x seq_length x seq_length + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask) + + encoder_outputs = self.encoder( + embedding_output, + bbox=final_bbox, + position_ids=final_position_ids, + attention_mask=extended_attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + 
output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = encoder_outputs[0] + + if not return_dict: + return (sequence_output,) + encoder_outputs[1:] + + return TFBaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class TFLayoutLMv3PreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = LayoutLMv3Config + base_model_prefix = "layoutlmv3" + + @property + def dummy_inputs(self) -> Dict[str, tf.Tensor]: + size = self.config.input_size + image_shape = (2, self.config.num_channels, size, size) + pixel_values = tf.random.uniform(shape=image_shape, minval=-1, maxval=1) + return { + "input_ids": tf.constant(_DUMMY_INPUT_IDS, dtype=tf.int32), + "bbox": tf.constant(_DUMMY_BBOX, dtype=tf.int32), + "pixel_values": pixel_values, + } + + @tf.function( + input_signature=[ + { + "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"), + "bbox": tf.TensorSpec((None, None, 4), tf.int32, name="bbox"), + "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"), + "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"), + } + ] + ) + def serving(self, inputs): + """ + Method used for serving the model. + + Args: + inputs (`Dict[str, tf.Tensor]`): + The input of the saved model as a dictionary of tensors. + """ + output = self.call(inputs) + + return self.serving_output(output) + + +LAYOUTLMV3_START_DOCSTRING = r""" + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and + behavior. + + + + TF 2.0 models accept two formats as inputs: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional arguments. + + This second option is useful when using the [`tf.keras.Model.fit`] method which currently requires having all the + tensors in the first argument of the model call function: `model(inputs)`. + + + + Parameters: + config ([`LayoutLMv3Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. +""" + +LAYOUTLMV3_INPUTS_DOCSTRING = r""" + Args: + input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS] + token. See `pixel_values` for `patch_sequence_length`. + + Indices can be obtained using [`LayoutLMv3Tokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details.
+ + [What are input IDs?](../glossary#input-ids) + + bbox (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length, 4)`, *optional*): + Bounding boxes of each input sequence tokens. Selected in the range `[0, + config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1) + format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1, + y1) represents the position of the lower right corner. + + Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS] + token. See `pixel_values` for `patch_sequence_length`. + + pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`): + Batch of document images. Each image is divided into patches of shape `(num_channels, config.patch_size, + config.patch_size)` and the total number of patches (=`patch_sequence_length`) equals to `((height / + config.patch_size) * (width / config.patch_size))`. + + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS] + token. See `pixel_values` for `patch_sequence_length`. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS] + token. See `pixel_values` for `patch_sequence_length`. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS] + token. See `pixel_values` for `patch_sequence_length`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. 
+ return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare LayoutLMv3 Model transformer outputting raw hidden-states without any specific head on top.", + LAYOUTLMV3_START_DOCSTRING, +) +class TFLayoutLMv3Model(TFLayoutLMv3PreTrainedModel): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"position_ids"] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.layoutlmv3 = TFLayoutLMv3MainLayer(config, name="layoutlmv3") + + @unpack_inputs + @add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFBaseModelOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + input_ids: Optional[tf.Tensor] = None, + bbox: Optional[tf.Tensor] = None, + attention_mask: Optional[tf.Tensor] = None, + token_type_ids: Optional[tf.Tensor] = None, + position_ids: Optional[tf.Tensor] = None, + head_mask: Optional[tf.Tensor] = None, + inputs_embeds: Optional[tf.Tensor] = None, + pixel_values: Optional[tf.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[ + TFBaseModelOutput, + Tuple[tf.Tensor], + Tuple[tf.Tensor, tf.Tensor], + Tuple[tf.Tensor, tf.Tensor, tf.Tensor], + ]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoProcessor, TFAutoModel + >>> from datasets import load_dataset + + >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False) + >>> model = TFAutoModel.from_pretrained("microsoft/layoutlmv3-base") + + >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train") + >>> example = dataset[0] + >>> image = example["image"] + >>> words = example["tokens"] + >>> boxes = example["bboxes"] + + >>> encoding = processor(image, words, boxes=boxes, return_tensors="tf") + + >>> outputs = model(**encoding) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + + outputs = self.layoutlmv3( + input_ids=input_ids, + bbox=bbox, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return outputs + + def serving_output(self, output: TFBaseModelOutput) -> TFBaseModelOutput: + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFBaseModelOutput( + last_hidden_state=output.last_hidden_state, + hidden_states=hs, + attentions=attns, + ) + + +class TFLayoutLMv3ClassificationHead(tf.keras.layers.Layer): + """ + Head for sentence-level classification tasks. 
Reference: RobertaClassificationHead + """ + + def __init__(self, config: LayoutLMv3Config, **kwargs): + super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense( + config.hidden_size, + activation="tanh", + kernel_initializer=get_initializer(config.initializer_range), + name="dense", + ) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = tf.keras.layers.Dropout( + classifier_dropout, + name="dropout", + ) + self.out_proj = tf.keras.layers.Dense( + config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name="out_proj", + ) + + def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor: + outputs = self.dropout(inputs, training=training) + outputs = self.dense(outputs) + outputs = self.dropout(outputs, training=training) + outputs = self.out_proj(outputs) + return outputs + + +@add_start_docstrings( + """ + LayoutLMv3 Model with a sequence classification head on top (a linear layer on top of the final hidden state of the + [CLS] token) e.g. for document image classification tasks such as the + [RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/) dataset. + """, + LAYOUTLMV3_START_DOCSTRING, +) +class TFLayoutLMv3ForSequenceClassification(TFLayoutLMv3PreTrainedModel, TFSequenceClassificationLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"position_ids"] + + def __init__(self, config: LayoutLMv3Config, **kwargs): + super().__init__(config, **kwargs) + self.config = config + self.layoutlmv3 = TFLayoutLMv3MainLayer(config, name="layoutlmv3") + self.classifier = TFLayoutLMv3ClassificationHead(config, name="classifier") + + @unpack_inputs + @add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + input_ids: Optional[tf.Tensor] = None, + attention_mask: Optional[tf.Tensor] = None, + token_type_ids: Optional[tf.Tensor] = None, + position_ids: Optional[tf.Tensor] = None, + head_mask: Optional[tf.Tensor] = None, + inputs_embeds: Optional[tf.Tensor] = None, + labels: Optional[tf.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + bbox: Optional[tf.Tensor] = None, + pixel_values: Optional[tf.Tensor] = None, + training: Optional[bool] = False, + ) -> Union[ + TFSequenceClassifierOutput, + Tuple[tf.Tensor], + Tuple[tf.Tensor, tf.Tensor], + Tuple[tf.Tensor, tf.Tensor, tf.Tensor], + Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor], + ]: + """ + Returns: + + Examples: + + ```python + >>> from transformers import AutoProcessor, TFAutoModelForSequenceClassification + >>> from datasets import load_dataset + >>> import tensorflow as tf + + >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False) + >>> model = TFAutoModelForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base") + + >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train") + >>> example = dataset[0] + >>> image = example["image"] + >>> words = example["tokens"] + >>> boxes = example["bboxes"] + + >>> encoding = processor(image, words, boxes=boxes, return_tensors="tf") + >>> sequence_label = tf.convert_to_tensor([1]) + + >>> outputs = model(**encoding, labels=sequence_label) + >>> loss = outputs.loss + >>> 
logits = outputs.logits + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.layoutlmv3( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + bbox=bbox, + pixel_values=pixel_values, + training=training, + ) + sequence_output = outputs[0][:, 0, :] + logits = self.classifier(sequence_output, training=training) + + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFSequenceClassifierOutput(logits=output.logits, hidden_states=hs, attentions=attns) + + +@add_start_docstrings( + """ + LayoutLMv3 Model with a token classification head on top (a linear layer on top of the final hidden states) e.g. + for sequence labeling (information extraction) tasks such as [FUNSD](https://guillaumejaume.github.io/FUNSD/), + [SROIE](https://rrc.cvc.uab.es/?ch=13), [CORD](https://github.com/clovaai/cord) and + [Kleister-NDA](https://github.com/applicaai/kleister-nda). + """, + LAYOUTLMV3_START_DOCSTRING, +) +class TFLayoutLMv3ForTokenClassification(TFLayoutLMv3PreTrainedModel, TFTokenClassificationLoss): + # names with a '.' 
represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"position_ids"] + + def __init__(self, config: LayoutLMv3Config, **kwargs): + super().__init__(config, **kwargs) + self.num_labels = config.num_labels + + self.layoutlmv3 = TFLayoutLMv3MainLayer(config, name="layoutlmv3") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") + if config.num_labels < 10: + self.classifier = tf.keras.layers.Dense( + config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name="classifier", + ) + else: + self.classifier = TFLayoutLMv3ClassificationHead(config, name="classifier") + + @unpack_inputs + @add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFTokenClassifierOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + input_ids: Optional[tf.Tensor] = None, + bbox: Optional[tf.Tensor] = None, + attention_mask: Optional[tf.Tensor] = None, + token_type_ids: Optional[tf.Tensor] = None, + position_ids: Optional[tf.Tensor] = None, + head_mask: Optional[tf.Tensor] = None, + inputs_embeds: Optional[tf.Tensor] = None, + labels: Optional[tf.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + pixel_values: Optional[tf.Tensor] = None, + training: Optional[bool] = False, + ) -> Union[ + TFTokenClassifierOutput, + Tuple[tf.Tensor], + Tuple[tf.Tensor, tf.Tensor], + Tuple[tf.Tensor, tf.Tensor, tf.Tensor], + Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor], + ]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
+ + Returns: + + Examples: + + ```python + >>> from transformers import AutoProcessor, TFAutoModelForTokenClassification + >>> from datasets import load_dataset + + >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False) + >>> model = TFAutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7) + + >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train") + >>> example = dataset[0] + >>> image = example["image"] + >>> words = example["tokens"] + >>> boxes = example["bboxes"] + >>> word_labels = example["ner_tags"] + + >>> encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="tf") + + >>> outputs = model(**encoding) + >>> loss = outputs.loss + >>> logits = outputs.logits + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.layoutlmv3( + input_ids, + bbox=bbox, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + pixel_values=pixel_values, + training=training, + ) + if input_ids is not None: + input_shape = tf.shape(input_ids) + else: + input_shape = tf.shape(inputs_embeds)[:-1] + + seq_length = input_shape[1] + # only take the text part of the output representations + sequence_output = outputs[0][:, :seq_length] + sequence_output = self.dropout(sequence_output, training=training) + logits = self.classifier(sequence_output) + + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFTokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output + def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput: + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFTokenClassifierOutput(logits=output.logits, hidden_states=hs, attentions=attns) + + +@add_start_docstrings( + """ + LayoutLMv3 Model with a span classification head on top for extractive question-answering tasks such as + [DocVQA](https://rrc.cvc.uab.es/?ch=17) (a linear layer on top of the text part of the hidden-states output to + compute `span start logits` and `span end logits`). + """, + LAYOUTLMV3_START_DOCSTRING, +) +class TFLayoutLMv3ForQuestionAnswering(TFLayoutLMv3PreTrainedModel, TFQuestionAnsweringLoss): + # names with a '.' 
represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"position_ids"] + + def __init__(self, config: LayoutLMv3Config, **kwargs): + super().__init__(config, **kwargs) + + self.num_labels = config.num_labels + + self.layoutlmv3 = TFLayoutLMv3MainLayer(config, name="layoutlmv3") + self.qa_outputs = TFLayoutLMv3ClassificationHead(config, name="qa_outputs") + + @unpack_inputs + @add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + input_ids: Optional[tf.Tensor] = None, + attention_mask: Optional[tf.Tensor] = None, + token_type_ids: Optional[tf.Tensor] = None, + position_ids: Optional[tf.Tensor] = None, + head_mask: Optional[tf.Tensor] = None, + inputs_embeds: Optional[tf.Tensor] = None, + start_positions: Optional[tf.Tensor] = None, + end_positions: Optional[tf.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + bbox: Optional[tf.Tensor] = None, + pixel_values: Optional[tf.Tensor] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[ + TFQuestionAnsweringModelOutput, + Tuple[tf.Tensor], + Tuple[tf.Tensor, tf.Tensor], + Tuple[tf.Tensor, tf.Tensor, tf.Tensor], + Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor], + ]: + r""" + start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + + Returns: + + Examples: + + ```python + >>> from transformers import AutoProcessor, TFAutoModelForQuestionAnswering + >>> from datasets import load_dataset + >>> import tensorflow as tf + + >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False) + >>> model = TFAutoModelForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base") + + >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train") + >>> example = dataset[0] + >>> image = example["image"] + >>> question = "what's his name?" 
+ >>> words = example["tokens"] + >>> boxes = example["bboxes"] + + >>> encoding = processor(image, question, words, boxes=boxes, return_tensors="tf") + >>> start_positions = tf.convert_to_tensor([1]) + >>> end_positions = tf.convert_to_tensor([3]) + + >>> outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions) + >>> loss = outputs.loss + >>> start_scores = outputs.start_logits + >>> end_scores = outputs.end_logits + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.layoutlmv3( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + bbox=bbox, + pixel_values=pixel_values, + training=training, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output, training=training) + start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1) + start_logits = tf.squeeze(input=start_logits, axis=-1) + end_logits = tf.squeeze(input=end_logits, axis=-1) + + loss = None + + if start_positions is not None and end_positions is not None: + labels = {"start_position": start_positions, "end_position": end_positions} + loss = self.hf_compute_loss(labels, logits=(start_logits, end_logits)) + + if not return_dict: + output = (start_logits, end_logits) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output + def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput: + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFQuestionAnsweringModelOutput( + start_logits=output.start_logits, end_logits=output.end_logits, hidden_states=hs, attentions=attns + ) diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py index 5f8124ae5584..e77a414cdce4 100644 --- a/src/transformers/utils/dummy_tf_objects.py +++ b/src/transformers/utils/dummy_tf_objects.py @@ -1316,6 +1316,44 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) +TF_LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class TFLayoutLMv3ForQuestionAnswering(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFLayoutLMv3ForSequenceClassification(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFLayoutLMv3ForTokenClassification(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFLayoutLMv3Model(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFLayoutLMv3PreTrainedModel(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + class 
TFLEDForConditionalGeneration(metaclass=DummyObject): _backends = ["tf"] diff --git a/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py b/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py new file mode 100644 index 000000000000..f71aeb0aefb4 --- /dev/null +++ b/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py @@ -0,0 +1,497 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the TensorFlow LayoutLMv3 model. """ + +import copy +import inspect +import unittest + +import numpy as np + +from transformers import is_tf_available, is_vision_available +from transformers.models.auto import get_values +from transformers.testing_utils import require_tf, slow +from transformers.utils import cached_property + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + + +if is_tf_available(): + import tensorflow as tf + + from transformers import ( + TF_LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST, + TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING, + TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, + TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + LayoutLMv3Config, + TFLayoutLMv3ForQuestionAnswering, + TFLayoutLMv3ForSequenceClassification, + TFLayoutLMv3ForTokenClassification, + TFLayoutLMv3Model, + ) + +if is_vision_available(): + from PIL import Image + + from transformers import LayoutLMv3FeatureExtractor + + +class TFLayoutLMv3ModelTester: + def __init__( + self, + parent, + batch_size=2, + num_channels=3, + image_size=4, + patch_size=2, + text_seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=36, + num_hidden_layers=3, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + coordinate_size=6, + shape_size=6, + num_labels=3, + num_choices=4, + scope=None, + range_bbox=1000, + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.patch_size = patch_size + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + 
self.initializer_range = initializer_range + self.coordinate_size = coordinate_size + self.shape_size = shape_size + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + self.range_bbox = range_bbox + + # LayoutLMv3's sequence length equals the number of text tokens + number of patches + 1 (we add 1 for the CLS token) + self.text_seq_length = text_seq_length + self.image_seq_length = (image_size // patch_size) ** 2 + 1 + self.seq_length = self.text_seq_length + self.image_seq_length + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.text_seq_length], self.vocab_size) + + bbox = ids_tensor([self.batch_size, self.text_seq_length, 4], self.range_bbox) + bbox = bbox.numpy() + # Ensure that bbox is legal + for i in range(bbox.shape[0]): + for j in range(bbox.shape[1]): + if bbox[i, j, 3] < bbox[i, j, 1]: + tmp_coordinate = bbox[i, j, 3] + bbox[i, j, 3] = bbox[i, j, 1] + bbox[i, j, 1] = tmp_coordinate + if bbox[i, j, 2] < bbox[i, j, 0]: + tmp_coordinate = bbox[i, j, 2] + bbox[i, j, 2] = bbox[i, j, 0] + bbox[i, j, 0] = tmp_coordinate + bbox = tf.constant(bbox) + + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.text_seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.text_seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.text_seq_length], self.num_labels) + + config = LayoutLMv3Config( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + coordinate_size=self.coordinate_size, + shape_size=self.shape_size, + input_size=self.image_size, + patch_size=self.patch_size, + ) + + return config, input_ids, bbox, pixel_values, token_type_ids, input_mask, sequence_labels, token_labels + + def create_and_check_model(self, config, input_ids, bbox, pixel_values, token_type_ids, input_mask): + model = TFLayoutLMv3Model(config=config) + + # text + image + result = model(input_ids, pixel_values=pixel_values, training=False) + result = model( + input_ids, + bbox=bbox, + pixel_values=pixel_values, + attention_mask=input_mask, + token_type_ids=token_type_ids, + training=False, + ) + result = model(input_ids, bbox=bbox, pixel_values=pixel_values, training=False) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + # text only + result = model(input_ids, training=False) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.text_seq_length, self.hidden_size) + ) + + # image only + result = model({"pixel_values": pixel_values}, training=False) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.image_seq_length, self.hidden_size) + ) + + def create_and_check_for_sequence_classification( + self, 
config, input_ids, bbox, pixel_values, token_type_ids, input_mask, sequence_labels + ): + config.num_labels = self.num_labels + model = TFLayoutLMv3ForSequenceClassification(config=config) + result = model( + input_ids, + bbox=bbox, + pixel_values=pixel_values, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=sequence_labels, + training=False, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_token_classification( + self, config, input_ids, bbox, pixel_values, token_type_ids, input_mask, token_labels + ): + config.num_labels = self.num_labels + model = TFLayoutLMv3ForTokenClassification(config=config) + result = model( + input_ids, + bbox=bbox, + pixel_values=pixel_values, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=token_labels, + training=False, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.text_seq_length, self.num_labels)) + + def create_and_check_for_question_answering( + self, config, input_ids, bbox, pixel_values, token_type_ids, input_mask, sequence_labels + ): + config.num_labels = 2 + model = TFLayoutLMv3ForQuestionAnswering(config=config) + result = model( + input_ids, + bbox=bbox, + pixel_values=pixel_values, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + training=False, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, bbox, pixel_values, token_type_ids, input_mask, _, _) = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "bbox": bbox, + "pixel_values": pixel_values, + "token_type_ids": token_type_ids, + "attention_mask": input_mask, + } + return config, inputs_dict + + +@require_tf +class TFLayoutLMv3ModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + TFLayoutLMv3Model, + TFLayoutLMv3ForQuestionAnswering, + TFLayoutLMv3ForSequenceClassification, + TFLayoutLMv3ForTokenClassification, + ) + if is_tf_available() + else () + ) + + test_pruning = False + test_resize_embeddings = False + test_onnx = False + + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> dict: + inputs_dict = copy.deepcopy(inputs_dict) + + if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): + inputs_dict = { + k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1)) + if isinstance(v, tf.Tensor) and v.ndim > 0 + else v + for k, v in inputs_dict.items() + } + + if return_labels: + if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): + inputs_dict["labels"] = tf.ones(self.model_tester.batch_size, dtype=tf.int32) + elif model_class in get_values(TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING): + inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) + inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) + elif model_class in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING): + inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) + elif model_class in get_values(TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING): + inputs_dict["labels"] = tf.zeros( + (self.model_tester.batch_size, self.model_tester.text_seq_length), 
dtype=tf.int32 + ) + + return inputs_dict + + def setUp(self): + self.model_tester = TFLayoutLMv3ModelTester(self) + self.config_tester = ConfigTester(self, config_class=LayoutLMv3Config, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_loss_computation(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config) + if getattr(model, "hf_compute_loss", None): + # The number of elements in the loss should be the same as the number of elements in the label + prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) + added_label = prepared_for_class[ + sorted(list(prepared_for_class.keys() - inputs_dict.keys()), reverse=True)[0] + ] + expected_loss_size = added_label.shape.as_list()[:1] + + # Test that model correctly compute the loss with kwargs + prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) + input_ids = prepared_for_class.pop("input_ids") + + loss = model(input_ids, **prepared_for_class)[0] + self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1]) + + # Test that model correctly compute the loss when we mask some positions + prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) + input_ids = prepared_for_class.pop("input_ids") + if "labels" in prepared_for_class: + labels = prepared_for_class["labels"].numpy() + if len(labels.shape) > 1 and labels.shape[1] != 1: + labels[0] = -100 + prepared_for_class["labels"] = tf.convert_to_tensor(labels) + loss = model(input_ids, **prepared_for_class)[0] + self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1]) + self.assertTrue(not np.any(np.isnan(loss.numpy()))) + + # Test that model correctly compute the loss with a dict + prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) + loss = model(prepared_for_class)[0] + self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1]) + + # Test that model correctly compute the loss with a tuple + prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) + + # Get keys that were added with the _prepare_for_class function + label_keys = prepared_for_class.keys() - inputs_dict.keys() + signature = inspect.signature(model.call).parameters + signature_names = list(signature.keys()) + + # Create a dictionary holding the location of the tensors in the tuple + tuple_index_mapping = {0: "input_ids"} + for label_key in label_keys: + label_key_index = signature_names.index(label_key) + tuple_index_mapping[label_key_index] = label_key + sorted_tuple_index_mapping = sorted(tuple_index_mapping.items()) + # Initialize a list with their default values, update the values and convert to a tuple + list_input = [] + + for name in signature_names: + if name != "kwargs": + list_input.append(signature[name].default) + + for index, value in sorted_tuple_index_mapping: + list_input[index] = prepared_for_class[value] + + tuple_input = tuple(list_input) + + # Send to model + loss = model(tuple_input[:-1])[0] + + self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1]) + + def test_model(self): + ( + config, + input_ids, + bbox, + pixel_values, + token_type_ids, + input_mask, + _, + _, + ) = self.model_tester.prepare_config_and_inputs() + 
self.model_tester.create_and_check_model(config, input_ids, bbox, pixel_values, token_type_ids, input_mask) + + def test_model_various_embeddings(self): + ( + config, + input_ids, + bbox, + pixel_values, + token_type_ids, + input_mask, + _, + _, + ) = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config.position_embedding_type = type + self.model_tester.create_and_check_model(config, input_ids, bbox, pixel_values, token_type_ids, input_mask) + + def test_for_sequence_classification(self): + ( + config, + input_ids, + bbox, + pixel_values, + token_type_ids, + input_mask, + sequence_labels, + _, + ) = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification( + config, input_ids, bbox, pixel_values, token_type_ids, input_mask, sequence_labels + ) + + def test_for_token_classification(self): + ( + config, + input_ids, + bbox, + pixel_values, + token_type_ids, + input_mask, + _, + token_labels, + ) = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification( + config, input_ids, bbox, pixel_values, token_type_ids, input_mask, token_labels + ) + + def test_for_question_answering(self): + ( + config, + input_ids, + bbox, + pixel_values, + token_type_ids, + input_mask, + sequence_labels, + _, + ) = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering( + config, input_ids, bbox, pixel_values, token_type_ids, input_mask, sequence_labels + ) + + @slow + def test_model_from_pretrained(self): + for model_name in TF_LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TFLayoutLMv3Model.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_tf +class TFLayoutLMv3ModelIntegrationTest(unittest.TestCase): + @cached_property + def default_feature_extractor(self): + return LayoutLMv3FeatureExtractor(apply_ocr=False) if is_vision_available() else None + + @slow + def test_inference_no_head(self): + model = TFLayoutLMv3Model.from_pretrained("microsoft/layoutlmv3-base") + + feature_extractor = self.default_feature_extractor + image = prepare_img() + pixel_values = feature_extractor(images=image, return_tensors="tf").pixel_values + + input_ids = tf.constant([[1, 2]]) + bbox = tf.expand_dims(tf.constant([[1, 2, 3, 4], [5, 6, 7, 8]]), axis=0) + + # forward pass + outputs = model(input_ids=input_ids, bbox=bbox, pixel_values=pixel_values, training=False) + + # verify the logits + expected_shape = (1, 199, 768) + self.assertEqual(outputs.last_hidden_state.shape, expected_shape) + + expected_slice = tf.constant( + [[-0.0529, 0.3618, 0.1632], [-0.1587, -0.1667, -0.0400], [-0.1557, -0.1671, -0.0505]] + ) + + self.assertTrue(np.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt index 0edda8ae5a4c..b03dcf511731 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -38,6 +38,7 @@ src/transformers/models/gptj/modeling_gptj.py src/transformers/models/hubert/modeling_hubert.py src/transformers/models/layoutlmv2/modeling_layoutlmv2.py src/transformers/models/layoutlmv3/modeling_layoutlmv3.py +src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py 
 src/transformers/models/longformer/modeling_longformer.py
 src/transformers/models/longformer/modeling_tf_longformer.py
 src/transformers/models/longt5/modeling_longt5.py
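The doctest snippets embedded in the new model docstrings above exercise each TensorFlow head in isolation. The short sketch below strings the processor and the token-classification head together end to end, mirroring those doctests; it is only an illustrative sketch, assuming the `microsoft/layoutlmv3-base` checkpoint and the `nielsr/funsd-layoutlmv3` dataset referenced in the doctests are available.

```python
# Minimal end-to-end sketch (assumptions: microsoft/layoutlmv3-base checkpoint and
# nielsr/funsd-layoutlmv3 dataset, both taken from the doctest examples above).
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoProcessor, TFAutoModelForTokenClassification

# The processor bundles the feature extractor (image) and tokenizer (text);
# apply_ocr=False because FUNSD already provides words and boxes.
processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
model = TFAutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7)

example = load_dataset("nielsr/funsd-layoutlmv3", split="train")[0]
encoding = processor(
    example["image"],
    example["tokens"],
    boxes=example["bboxes"],
    word_labels=example["ner_tags"],  # word_labels make the processor emit "labels", so a loss is returned
    return_tensors="tf",
)

outputs = model(**encoding)  # TFTokenClassifierOutput with loss and logits
predictions = tf.math.argmax(outputs.logits, axis=-1)  # (batch_size, text_seq_length) predicted class ids
print(outputs.loss, predictions.shape)
```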