From 022cea834ee4971a004d241fe50454537aa2a48b Mon Sep 17 00:00:00 2001
From: regisss
Date: Wed, 29 Jun 2022 20:57:33 +0200
Subject: [PATCH 1/4] Add ONNX support for LayoutLMv3

---
 docs/source/en/serialization.mdx            |   1 +
 .../models/layoutlmv3/__init__.py           |  12 +-
 .../layoutlmv3/configuration_layoutlmv3.py  | 114 ++++++++++++++++++
 src/transformers/onnx/features.py           |   7 ++
 tests/onnx/test_onnx_v2.py                  |   1 +
 5 files changed, 133 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx
index f5c113e31019..034d4c1bf652 100644
--- a/docs/source/en/serialization.mdx
+++ b/docs/source/en/serialization.mdx
@@ -70,6 +70,7 @@ Ready-made configurations include the following architectures:
 - GPT-J
 - I-BERT
 - LayoutLM
+- LayoutLMv3
 - LongT5
 - M2M100
 - Marian
diff --git a/src/transformers/models/layoutlmv3/__init__.py b/src/transformers/models/layoutlmv3/__init__.py
index a7d104040bd6..cfa26057e87b 100644
--- a/src/transformers/models/layoutlmv3/__init__.py
+++ b/src/transformers/models/layoutlmv3/__init__.py
@@ -28,7 +28,11 @@
 _import_structure = {
-    "configuration_layoutlmv3": ["LAYOUTLMV3_PRETRAINED_CONFIG_ARCHIVE_MAP", "LayoutLMv3Config"],
+    "configuration_layoutlmv3": [
+        "LAYOUTLMV3_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "LayoutLMv3Config",
+        "LayoutLMv3OnnxConfig",
+    ],
     "processing_layoutlmv3": ["LayoutLMv3Processor"],
     "tokenization_layoutlmv3": ["LayoutLMv3Tokenizer"],
 }
@@ -66,7 +70,11 @@
 if TYPE_CHECKING:
-    from .configuration_layoutlmv3 import LAYOUTLMV3_PRETRAINED_CONFIG_ARCHIVE_MAP, LayoutLMv3Config
+    from .configuration_layoutlmv3 import (
+        LAYOUTLMV3_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        LayoutLMv3Config,
+        LayoutLMv3OnnxConfig,
+    )
     from .processing_layoutlmv3 import LayoutLMv3Processor
     from .tokenization_layoutlmv3 import LayoutLMv3Tokenizer
diff --git a/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py b/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py
index ebde107947a1..220631e07652 100644
--- a/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py
+++ b/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py
@@ -14,7 +14,17 @@
 # limitations under the License.
""" LayoutLMv3 model configuration""" +from collections import OrderedDict +from typing import Any, Mapping, Optional + +from packaging import version + +from transformers import TensorType + from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...onnx.utils import compute_effective_axis_dimension +from ...processing_utils import ProcessorMixin from ...utils import logging @@ -176,3 +186,107 @@ def __init__( self.num_channels = num_channels self.patch_size = patch_size self.classifier_dropout = classifier_dropout + + +class LayoutLMv3OnnxConfig(OnnxConfig): + + torch_onnx_minimum_version = version.parse("1.12") + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + # The order of inputs is different for question answering and sequence classification + if self.task in ["question-answering", "sequence-classification"]: + return OrderedDict( + [ + ("input_ids", {0: "batch", 1: "sequence"}), + ("attention_mask", {0: "batch", 1: "sequence"}), + ("bbox", {0: "batch", 1: "sequence"}), + ("pixel_values", {0: "batch", 1: "sequence"}), + ] + ) + else: + return OrderedDict( + [ + ("input_ids", {0: "batch", 1: "sequence"}), + ("bbox", {0: "batch", 1: "sequence"}), + ("attention_mask", {0: "batch", 1: "sequence"}), + ("pixel_values", {0: "batch", 1: "sequence"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-5 + + @property + def default_onnx_opset(self) -> int: + return 12 + + def generate_dummy_inputs( + self, + processor: ProcessorMixin, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + num_channels: int = 3, + image_width: int = 40, + image_height: int = 40, + ) -> Mapping[str, Any]: + """ + Generate inputs to provide to the ONNX exporter for the specific framework + + Args: + processor ([`ProcessorMixin`]): + The processor associated with this model configuration. + batch_size (`int`, *optional*, defaults to -1): + The batch size to export the model for (-1 means dynamic axis). + seq_length (`int`, *optional*, defaults to -1): + The sequence length to export the model for (-1 means dynamic axis). + is_pair (`bool`, *optional*, defaults to `False`): + Indicate if the input is a pair (sentence 1, sentence 2). + framework (`TensorType`, *optional*, defaults to `None`): + The framework (PyTorch or TensorFlow) that the processor will generate tensors for. + num_channels (`int`, *optional*, defaults to 3): + The number of channels of the generated images. + image_width (`int`, *optional*, defaults to 40): + The width of the generated images. + image_height (`int`, *optional*, defaults to 40): + The height of the generated images. 
+
+        Returns:
+            Mapping[str, Any]: _description_
+        """
+
+        # A dummy image is used so OCR should not be applied
+        setattr(processor.feature_extractor, "apply_ocr", False)
+
+        # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
+        batch_size = compute_effective_axis_dimension(
+            batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0
+        )
+        # If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX
+        token_to_add = processor.tokenizer.num_special_tokens_to_add(is_pair)
+        seq_length = compute_effective_axis_dimension(
+            seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add
+        )
+        # Generate dummy inputs according to compute batch and sequence
+        dummy_text = [[" ".join([processor.tokenizer.unk_token]) * seq_length]] * batch_size
+
+        # Generate dummy bounding boxes
+        dummy_bboxes = [[[48, 84, 73, 128]]] * batch_size
+
+        # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
+        # batch_size = compute_effective_axis_dimension(batch_size, fixed_dimension=OnnxConfig.default_fixed_batch)
+        dummy_image = self._generate_dummy_images(batch_size, num_channels, image_height, image_width)
+
+        inputs = dict(
+            processor(
+                dummy_image,
+                text=dummy_text,
+                boxes=dummy_bboxes,
+                return_tensors=framework,
+            )
+        )
+
+        return inputs
diff --git a/src/transformers/onnx/features.py b/src/transformers/onnx/features.py
index c37c12ca2a89..86b241e68f71 100644
--- a/src/transformers/onnx/features.py
+++ b/src/transformers/onnx/features.py
@@ -313,6 +313,13 @@ class FeaturesManager:
             "token-classification",
             onnx_config_cls="models.layoutlm.LayoutLMOnnxConfig",
         ),
+        "layoutlmv3": supported_features_mapping(
+            "default",
+            "question-answering",
+            "sequence-classification",
+            "token-classification",
+            onnx_config_cls="models.layoutlmv3.LayoutLMv3OnnxConfig",
+        ),
         "longt5": supported_features_mapping(
             "default",
             "default-with-past",
diff --git a/tests/onnx/test_onnx_v2.py b/tests/onnx/test_onnx_v2.py
index 50601598f5aa..f3e5aef20f39 100644
--- a/tests/onnx/test_onnx_v2.py
+++ b/tests/onnx/test_onnx_v2.py
@@ -195,6 +195,7 @@ def test_values_override(self):
     ("xlm", "xlm-clm-ende-1024"),
     ("xlm-roberta", "xlm-roberta-base"),
     ("layoutlm", "microsoft/layoutlm-base-uncased"),
+    ("layoutlmv3", "microsoft/layoutlmv3-base"),
     ("vit", "google/vit-base-patch16-224"),
     ("deit", "facebook/deit-small-patch16-224"),
     ("beit", "microsoft/beit-base-patch16-224"),

From e32dc24440b0f8f4239ba342ec689fd1c1118e14 Mon Sep 17 00:00:00 2001
From: regisss
Date: Wed, 29 Jun 2022 22:45:09 +0200
Subject: [PATCH 2/4] Update docstrings

---
 src/transformers/onnx/convert.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/transformers/onnx/convert.py b/src/transformers/onnx/convert.py
index c8f75f2fd3df..d4deb796683d 100644
--- a/src/transformers/onnx/convert.py
+++ b/src/transformers/onnx/convert.py
@@ -40,6 +40,7 @@
 if TYPE_CHECKING:
     from ..feature_extraction_utils import FeatureExtractionMixin
+    from ..processing_utils import ProcessorMixin
     from ..tokenization_utils import PreTrainedTokenizer
@@ -80,7 +81,7 @@ def check_onnxruntime_requirements(minimum_version: Version):
 def export_pytorch(
-    preprocessor: Union["PreTrainedTokenizer", "FeatureExtractionMixin"],
+    preprocessor: Union["PreTrainedTokenizer", "FeatureExtractionMixin", "ProcessorMixin"],
     model: "PreTrainedModel",
     config: OnnxConfig,
     opset: int,
@@ -92,7 +93,7 @@ def export_pytorch(
     Export a PyTorch model to an ONNX Intermediate Representation (IR)

     Args:
-        preprocessor: ([`PreTrainedTokenizer`] or [`FeatureExtractionMixin`]):
+        preprocessor: ([`PreTrainedTokenizer`], [`FeatureExtractionMixin`] or [`ProcessorMixin`]):
             The preprocessor used for encoding the data.
         model ([`PreTrainedModel`]):
             The model to export.
@@ -269,7 +270,7 @@ def export_tensorflow(
 def export(
-    preprocessor: Union["PreTrainedTokenizer", "FeatureExtractionMixin"],
+    preprocessor: Union["PreTrainedTokenizer", "FeatureExtractionMixin", "ProcessorMixin"],
     model: Union["PreTrainedModel", "TFPreTrainedModel"],
     config: OnnxConfig,
     opset: int,
@@ -281,7 +282,7 @@ def export(
     Export a Pytorch or TensorFlow model to an ONNX Intermediate Representation (IR)

     Args:
-        preprocessor: ([`PreTrainedTokenizer`] or [`FeatureExtractionMixin`]):
+        preprocessor: ([`PreTrainedTokenizer`], [`FeatureExtractionMixin`] or [`ProcessorMixin`]):
             The preprocessor used for encoding the data.
         model ([`PreTrainedModel`] or [`TFPreTrainedModel`]):
             The model to export.
@@ -339,7 +340,7 @@ def export(
 def validate_model_outputs(
     config: OnnxConfig,
-    preprocessor: Union["PreTrainedTokenizer", "FeatureExtractionMixin"],
+    preprocessor: Union["PreTrainedTokenizer", "FeatureExtractionMixin", "ProcessorMixin"],
     reference_model: Union["PreTrainedModel", "TFPreTrainedModel"],
     onnx_model: Path,
     onnx_named_outputs: List[str],

From 771c85c8c5fc8dd092e6fa5e3744ed7ce0df992b Mon Sep 17 00:00:00 2001
From: regisss
Date: Wed, 29 Jun 2022 22:54:39 +0200
Subject: [PATCH 3/4] Update empty description in docstring

---
 src/transformers/models/layoutlmv3/configuration_layoutlmv3.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py b/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py
index 220631e07652..7c6119ad0996 100644
--- a/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py
+++ b/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py
@@ -255,7 +255,7 @@ def generate_dummy_inputs(
                 The height of the generated images.

         Returns:
-            Mapping[str, Any]: _description_
+            Mapping[str, Any]: holding the kwargs to provide to the model's forward function
         """

         # A dummy image is used so OCR should not be applied

From fdc9d9866466d875d9fdfa83b233bdf4d3fbfc51 Mon Sep 17 00:00:00 2001
From: regisss
Date: Thu, 30 Jun 2022 17:37:38 +0200
Subject: [PATCH 4/4] Fix imports and type hints

---
 .../models/layoutlmv3/configuration_layoutlmv3.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py b/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py
index 7c6119ad0996..d9ddde6289c9 100644
--- a/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py
+++ b/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py
@@ -15,19 +15,21 @@
 """ LayoutLMv3 model configuration"""
 from collections import OrderedDict
-from typing import Any, Mapping, Optional
+from typing import TYPE_CHECKING, Any, Mapping, Optional

 from packaging import version

-from transformers import TensorType
-
 from ...configuration_utils import PretrainedConfig
 from ...onnx import OnnxConfig
 from ...onnx.utils import compute_effective_axis_dimension
-from ...processing_utils import ProcessorMixin
 from ...utils import logging


+if TYPE_CHECKING:
+    from ...processing_utils import ProcessorMixin
+    from ...utils import TensorType
+
+
 logger = logging.get_logger(__name__)

 LAYOUTLMV3_PRETRAINED_CONFIG_ARCHIVE_MAP = {
@@ -224,11 +226,11 @@ def default_onnx_opset(self) -> int:
     def generate_dummy_inputs(
         self,
-        processor: ProcessorMixin,
+        processor: "ProcessorMixin",
         batch_size: int = -1,
         seq_length: int = -1,
         is_pair: bool = False,
-        framework: Optional[TensorType] = None,
+        framework: Optional["TensorType"] = None,
         num_channels: int = 3,
         image_width: int = 40,
         image_height: int = 40,
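
A usage note for reviewers, not part of the patch series itself: the sketch below exercises the export path these patches enable, end to end. It assumes a transformers build that includes the series, plus torch and onnxruntime installed; the checkpoint name is the one added to the test suite in patch 1, while the output file name and task choice are illustrative.

    from pathlib import Path

    from transformers import LayoutLMv3Model, LayoutLMv3Processor
    from transformers.models.layoutlmv3 import LayoutLMv3OnnxConfig
    from transformers.onnx import export, validate_model_outputs

    checkpoint = "microsoft/layoutlmv3-base"  # same checkpoint as the new test entry
    processor = LayoutLMv3Processor.from_pretrained(checkpoint)
    model = LayoutLMv3Model.from_pretrained(checkpoint)

    # "default" is one of the four features registered in FeaturesManager above
    onnx_config = LayoutLMv3OnnxConfig(model.config, task="default")

    # export() accepts a ProcessorMixin as the preprocessor thanks to patch 2
    onnx_path = Path("layoutlmv3.onnx")
    onnx_inputs, onnx_outputs = export(
        preprocessor=processor,
        model=model,
        config=onnx_config,
        opset=onnx_config.default_onnx_opset,  # 12, per the config above
        output=onnx_path,
    )

    # Compares PyTorch outputs to ONNX Runtime outputs within atol (1e-5 here)
    validate_model_outputs(
        onnx_config, processor, model, onnx_path, onnx_outputs, onnx_config.atol_for_validation
    )

The documented CLI route, python -m transformers.onnx --model=microsoft/layoutlmv3-base onnx/, resolves the same LayoutLMv3OnnxConfig through the FeaturesManager entry added above.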
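
In the same spirit, a hypothetical smoke test for the dummy-input machinery alone, with no export involved: it checks the task-dependent input ordering and that every declared ONNX input comes back from the processor. Recall that batch_size and seq_length of -1 fall back to the fixed dimensions 2 and 8 via compute_effective_axis_dimension.

    from transformers import LayoutLMv3Config, LayoutLMv3Processor
    from transformers.models.layoutlmv3 import LayoutLMv3OnnxConfig

    processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
    onnx_config = LayoutLMv3OnnxConfig(LayoutLMv3Config(), task="question-answering")

    # Question answering puts attention_mask before bbox, per the inputs property
    assert list(onnx_config.inputs) == ["input_ids", "attention_mask", "bbox", "pixel_values"]

    # framework=None returns plain Python lists instead of framework tensors
    dummy_inputs = onnx_config.generate_dummy_inputs(processor, framework=None)
    for name in onnx_config.inputs:
        assert name in dummy_inputs, f"missing ONNX input: {name}"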
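
Finally, for context on patch 4: it applies the standard pattern for avoiding runtime imports that exist only for type hints, shown generically below (the function name is illustrative, not repo code). Imports move under typing.TYPE_CHECKING and the annotations become strings, so nothing extra is imported when the module loads — presumably the motivation for dropping the top-level `from transformers import TensorType`, since importing from the package root inside a submodule can trip circular imports during package initialization.

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Evaluated by static type checkers only, never at runtime
        from transformers.processing_utils import ProcessorMixin

    def tokenizer_of(processor: "ProcessorMixin"):
        # The string annotation is never resolved when this runs, so the
        # import above is not needed at call time
        return processor.tokenizer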