From e6270695b7e8b3406eea8d08394085aa109eb4b8 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 10 Mar 2022 14:00:11 +0100 Subject: [PATCH 1/8] Fix duplicate arguments passed to dummy inputs in ONNX export --- src/transformers/onnx/convert.py | 45 +++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/src/transformers/onnx/convert.py b/src/transformers/onnx/convert.py index 42b57d2c5402..a0b4da533498 100644 --- a/src/transformers/onnx/convert.py +++ b/src/transformers/onnx/convert.py @@ -100,11 +100,18 @@ def export_pytorch( `Tuple[List[str], List[str]]`: A tuple with an ordered list of the model's inputs, and the named inputs from the ONNX configuration. """ + from ..tokenization_utils_base import PreTrainedTokenizerBase + + if isinstance(preprocessor, PreTrainedTokenizerBase) and tokenizer is not None: + raise ValueError("You cannot provide both a tokenizer and a preprocessor to generate dummy inputs.") if tokenizer is not None: warnings.warn( "The `tokenizer` argument is deprecated and will be removed in version 5 of Transformers. 
Use `preprocessor` instead.", FutureWarning, ) + logger.warning("Overwriting the `preprocessor` argument with `tokenizer` to generate dummmy inputs.") + preprocessor = tokenizer + if issubclass(type(model), PreTrainedModel): import torch from torch.onnx import export as onnx_export @@ -123,9 +130,7 @@ def export_pytorch( # Ensure inputs match # TODO: Check when exporting QA we provide "is_pair=True" - model_inputs = config.generate_dummy_inputs( - preprocessor, tokenizer=tokenizer, framework=TensorType.PYTORCH - ) + model_inputs = config.generate_dummy_inputs(preprocessor, framework=TensorType.PYTORCH) inputs_match, matched_inputs = ensure_model_and_config_inputs_match(model, model_inputs.keys()) onnx_outputs = list(config.outputs.keys()) @@ -213,11 +218,17 @@ def export_tensorflow( import onnx import tf2onnx + from ..tokenization_utils_base import PreTrainedTokenizerBase + + if isinstance(preprocessor, PreTrainedTokenizerBase) and tokenizer is not None: + raise ValueError("You cannot provide both a tokenizer and a preprocessor to generate dummy inputs.") if tokenizer is not None: warnings.warn( "The `tokenizer` argument is deprecated and will be removed in version 5 of Transformers. Use `preprocessor` instead.", FutureWarning, ) + logger.warning("Overwriting the `preprocessor` argument with `tokenizer` to generate dummmy inputs.") + preprocessor = tokenizer model.config.return_dict = True @@ -229,7 +240,7 @@ def export_tensorflow( setattr(model.config, override_config_key, override_config_value) # Ensure inputs match - model_inputs = config.generate_dummy_inputs(preprocessor, tokenizer=tokenizer, framework=TensorType.TENSORFLOW) + model_inputs = config.generate_dummy_inputs(preprocessor, framework=TensorType.TENSORFLOW) inputs_match, matched_inputs = ensure_model_and_config_inputs_match(model, model_inputs.keys()) onnx_outputs = list(config.outputs.keys()) @@ -273,11 +284,17 @@ def export( "Cannot convert because neither PyTorch nor TensorFlow are not installed. 
" "Please install torch or tensorflow first." ) + from ..tokenization_utils_base import PreTrainedTokenizerBase + + if isinstance(preprocessor, PreTrainedTokenizerBase) and tokenizer is not None: + raise ValueError("You cannot provide both a tokenizer and a preprocessor to generate dummy inputs.") if tokenizer is not None: warnings.warn( "The `tokenizer` argument is deprecated and will be removed in version 5 of Transformers. Use `preprocessor` instead.", FutureWarning, ) + logger.warning("Overwriting the `preprocessor` argument with `tokenizer` to generate dummmy inputs.") + preprocessor = tokenizer if is_torch_available(): from ..file_utils import torch_version @@ -307,18 +324,26 @@ def validate_model_outputs( ): from onnxruntime import InferenceSession, SessionOptions + from ..tokenization_utils_base import PreTrainedTokenizerBase + logger.info("Validating ONNX model...") + if isinstance(preprocessor, PreTrainedTokenizerBase) and tokenizer is not None: + raise ValueError("You cannot provide both a tokenizer and a preprocessor to validate model inputs.") + if tokenizer is not None: + warnings.warn( + "The `tokenizer` argument is deprecated and will be removed in version 5 of Transformers. Use `preprocessor` instead.", + FutureWarning, + ) + logger.warning("Overwriting the `preprocessor` argument with `tokenizer` to generate dummmy inputs.") + preprocessor = tokenizer + # TODO: generate inputs with a different batch_size and seq_len that was used for conversion to properly test # dynamic input shapes. 
if issubclass(type(reference_model), PreTrainedModel): - reference_model_inputs = config.generate_dummy_inputs( - preprocessor, tokenizer=tokenizer, framework=TensorType.PYTORCH - ) + reference_model_inputs = config.generate_dummy_inputs(preprocessor, framework=TensorType.PYTORCH) else: - reference_model_inputs = config.generate_dummy_inputs( - preprocessor, tokenizer=tokenizer, framework=TensorType.TENSORFLOW - ) + reference_model_inputs = config.generate_dummy_inputs(preprocessor, framework=TensorType.TENSORFLOW) # Create ONNX Runtime session options = SessionOptions() From 0af46e6be235d64b88242faaffcd7446e75810c9 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 10 Mar 2022 14:32:04 +0100 Subject: [PATCH 2/8] Fix logging messages --- src/transformers/onnx/convert.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/onnx/convert.py b/src/transformers/onnx/convert.py index a0b4da533498..28baa1c7771e 100644 --- a/src/transformers/onnx/convert.py +++ b/src/transformers/onnx/convert.py @@ -103,7 +103,7 @@ def export_pytorch( from ..tokenization_utils_base import PreTrainedTokenizerBase if isinstance(preprocessor, PreTrainedTokenizerBase) and tokenizer is not None: - raise ValueError("You cannot provide both a tokenizer and a preprocessor to generate dummy inputs.") + raise ValueError("You cannot provide both a tokenizer and a preprocessor to export the model.") if tokenizer is not None: warnings.warn( "The `tokenizer` argument is deprecated and will be removed in version 5 of Transformers. 
Use `preprocessor` instead.", @@ -221,7 +221,7 @@ def export_tensorflow( from ..tokenization_utils_base import PreTrainedTokenizerBase if isinstance(preprocessor, PreTrainedTokenizerBase) and tokenizer is not None: - raise ValueError("You cannot provide both a tokenizer and a preprocessor to generate dummy inputs.") + raise ValueError("You cannot provide both a tokenizer and preprocessor to export the model.") if tokenizer is not None: warnings.warn( "The `tokenizer` argument is deprecated and will be removed in version 5 of Transformers. Use `preprocessor` instead.", @@ -287,7 +287,7 @@ def export( from ..tokenization_utils_base import PreTrainedTokenizerBase if isinstance(preprocessor, PreTrainedTokenizerBase) and tokenizer is not None: - raise ValueError("You cannot provide both a tokenizer and a preprocessor to generate dummy inputs.") + raise ValueError("You cannot provide both a tokenizer and a preprocessor to export the model.") if tokenizer is not None: warnings.warn( "The `tokenizer` argument is deprecated and will be removed in version 5 of Transformers. Use `preprocessor` instead.", @@ -329,7 +329,7 @@ def validate_model_outputs( logger.info("Validating ONNX model...") if isinstance(preprocessor, PreTrainedTokenizerBase) and tokenizer is not None: - raise ValueError("You cannot provide both a tokenizer and a preprocessor to validate model inputs.") + raise ValueError("You cannot provide both a tokenizer and a preprocessor to validate the model outputs.") if tokenizer is not None: warnings.warn( "The `tokenizer` argument is deprecated and will be removed in version 5 of Transformers. 
Use `preprocessor` instead.", From d6e0361f9f17813d21164ffab93522ed48c33543 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 10 Mar 2022 14:32:18 +0100 Subject: [PATCH 3/8] Fix M2M100 ONNX config --- src/transformers/models/m2m_100/configuration_m2m_100.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/m2m_100/configuration_m2m_100.py b/src/transformers/models/m2m_100/configuration_m2m_100.py index 62a63d248b90..180950f8c7b9 100644 --- a/src/transformers/models/m2m_100/configuration_m2m_100.py +++ b/src/transformers/models/m2m_100/configuration_m2m_100.py @@ -198,13 +198,13 @@ def _generate_dummy_inputs_for_sequence_classification_and_question_answering( # Did not use super(OnnxConfigWithPast, self).generate_dummy_inputs for code clarity. # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX batch_size = compute_effective_axis_dimension( - batch_size, fixed_dimension=OnnxConfig.DEFAULT_FIXED_BATCH, num_token_to_add=0 + batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0 ) # If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX token_to_add = tokenizer.num_special_tokens_to_add(is_pair) seq_length = compute_effective_axis_dimension( - seq_length, fixed_dimension=OnnxConfig.DEFAULT_FIXED_SEQUENCE, num_token_to_add=token_to_add + seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add ) # Generate dummy inputs according to compute batch and sequence From abe606da79dd710a43bb9fbe72c137c84d3e5f99 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 10 Mar 2022 16:12:37 +0100 Subject: [PATCH 4/8] Integrate reviewer comments --- src/transformers/onnx/convert.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/transformers/onnx/convert.py b/src/transformers/onnx/convert.py index 28baa1c7771e..e47a73aa18c4 100644 --- 
a/src/transformers/onnx/convert.py +++ b/src/transformers/onnx/convert.py @@ -22,6 +22,7 @@ from packaging.version import Version, parse from ..file_utils import TensorType, is_tf_available, is_torch_available, is_torch_onnx_dict_inputs_support_available +from ..tokenization_utils_base import PreTrainedTokenizerBase from ..utils import logging from .config import OnnxConfig @@ -100,7 +101,6 @@ def export_pytorch( `Tuple[List[str], List[str]]`: A tuple with an ordered list of the model's inputs, and the named inputs from the ONNX configuration. """ - from ..tokenization_utils_base import PreTrainedTokenizerBase if isinstance(preprocessor, PreTrainedTokenizerBase) and tokenizer is not None: raise ValueError("You cannot provide both a tokenizer and a preprocessor to export the model.") @@ -109,7 +109,7 @@ def export_pytorch( "The `tokenizer` argument is deprecated and will be removed in version 5 of Transformers. Use `preprocessor` instead.", FutureWarning, ) - logger.warning("Overwriting the `preprocessor` argument with `tokenizer` to generate dummmy inputs.") + logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummmy inputs.") preprocessor = tokenizer if issubclass(type(model), PreTrainedModel): @@ -218,8 +218,6 @@ def export_tensorflow( import onnx import tf2onnx - from ..tokenization_utils_base import PreTrainedTokenizerBase - if isinstance(preprocessor, PreTrainedTokenizerBase) and tokenizer is not None: raise ValueError("You cannot provide both a tokenizer and preprocessor to export the model.") if tokenizer is not None: @@ -227,7 +225,7 @@ def export_tensorflow( "The `tokenizer` argument is deprecated and will be removed in version 5 of Transformers. 
Use `preprocessor` instead.", FutureWarning, ) - logger.warning("Overwriting the `preprocessor` argument with `tokenizer` to generate dummmy inputs.") + logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummmy inputs.") preprocessor = tokenizer model.config.return_dict = True @@ -284,7 +282,6 @@ def export( "Cannot convert because neither PyTorch nor TensorFlow are not installed. " "Please install torch or tensorflow first." ) - from ..tokenization_utils_base import PreTrainedTokenizerBase if isinstance(preprocessor, PreTrainedTokenizerBase) and tokenizer is not None: raise ValueError("You cannot provide both a tokenizer and a preprocessor to export the model.") @@ -293,7 +290,7 @@ def export( "The `tokenizer` argument is deprecated and will be removed in version 5 of Transformers. Use `preprocessor` instead.", FutureWarning, ) - logger.warning("Overwriting the `preprocessor` argument with `tokenizer` to generate dummmy inputs.") + logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummmy inputs.") preprocessor = tokenizer if is_torch_available(): @@ -324,8 +321,6 @@ def validate_model_outputs( ): from onnxruntime import InferenceSession, SessionOptions - from ..tokenization_utils_base import PreTrainedTokenizerBase - logger.info("Validating ONNX model...") if isinstance(preprocessor, PreTrainedTokenizerBase) and tokenizer is not None: From ee6a1e0195b76620aab7521d8eeb3623a21f9684 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 10 Mar 2022 16:13:15 +0100 Subject: [PATCH 5/8] Ensure we check PreTrained model only if torch is available --- src/transformers/onnx/convert.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/onnx/convert.py b/src/transformers/onnx/convert.py index e47a73aa18c4..cb646948a821 100644 --- a/src/transformers/onnx/convert.py +++ b/src/transformers/onnx/convert.py @@ -330,12 +330,12 @@ def validate_model_outputs( "The `tokenizer` 
argument is deprecated and will be removed in version 5 of Transformers. Use `preprocessor` instead.", FutureWarning, ) - logger.warning("Overwriting the `preprocessor` argument with `tokenizer` to generate dummmy inputs.") + logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummmy inputs.") preprocessor = tokenizer # TODO: generate inputs with a different batch_size and seq_len that was used for conversion to properly test # dynamic input shapes. - if issubclass(type(reference_model), PreTrainedModel): + if is_torch_available() and issubclass(type(reference_model), PreTrainedModel): reference_model_inputs = config.generate_dummy_inputs(preprocessor, framework=TensorType.PYTORCH) else: reference_model_inputs = config.generate_dummy_inputs(preprocessor, framework=TensorType.TENSORFLOW) @@ -388,7 +388,7 @@ def validate_model_outputs( # Check the shape and values match for name, ort_value in zip(onnx_named_outputs, onnx_outputs): - if issubclass(type(reference_model), PreTrainedModel): + if is_torch_available() and issubclass(type(reference_model), PreTrainedModel): ref_value = ref_outputs_dict[name].detach().numpy() else: ref_value = ref_outputs_dict[name].numpy() @@ -422,7 +422,7 @@ def ensure_model_and_config_inputs_match( :param model_inputs: :param config_inputs: :return: """ - if issubclass(type(model), PreTrainedModel): + if is_torch_available() and issubclass(type(model), PreTrainedModel): forward_parameters = signature(model.forward).parameters else: forward_parameters = signature(model.call).parameters From 8f9558612c6f34debaa2e95c731f8559756ae81b Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 10 Mar 2022 17:42:51 +0100 Subject: [PATCH 6/8] Remove TensorFlow tests for models without PyTorch parity --- tests/onnx/test_onnx_v2.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/tests/onnx/test_onnx_v2.py b/tests/onnx/test_onnx_v2.py index a0a5e0f943a5..103bec4881d6 100644 --- 
a/tests/onnx/test_onnx_v2.py +++ b/tests/onnx/test_onnx_v2.py @@ -196,15 +196,12 @@ def test_values_override(self): ("m2m-100", "facebook/m2m100_418M"), } +# TODO(lewtun): Include the same model types in `PYTORCH_EXPORT_MODELS` once TensorFlow has parity with the PyTorch model implementations. TENSORFLOW_EXPORT_DEFAULT_MODELS = { ("albert", "hf-internal-testing/tiny-albert"), ("bert", "bert-base-cased"), - ("ibert", "kssteven/ibert-roberta-base"), - ("camembert", "camembert-base"), ("distilbert", "distilbert-base-cased"), ("roberta", "roberta-base"), - ("xlm-roberta", "xlm-roberta-base"), - ("layoutlm", "microsoft/layoutlm-base-uncased"), } TENSORFLOW_EXPORT_WITH_PAST_MODELS = { @@ -212,12 +209,8 @@ def test_values_override(self): ("gpt-neo", "EleutherAI/gpt-neo-125M"), } -TENSORFLOW_EXPORT_SEQ2SEQ_WITH_PAST_MODELS = { - ("bart", "facebook/bart-base"), - ("mbart", "sshleifer/tiny-mbart"), - ("t5", "t5-small"), - ("marian", "Helsinki-NLP/opus-mt-en-de"), -} +# TODO(lewtun): Include the same model types in `PYTORCH_EXPORT_SEQ2SEQ_WITH_PAST_MODELS` once TensorFlow has parity with the PyTorch model implementations. 
+TENSORFLOW_EXPORT_SEQ2SEQ_WITH_PAST_MODELS = {} def _get_models_to_test(export_models_list): @@ -318,7 +311,7 @@ def test_tensorflow_export(self, test_name, name, model_name, feature, onnx_conf def test_tensorflow_export_with_past(self, test_name, name, model_name, feature, onnx_config_class_constructor): self._onnx_export(test_name, name, model_name, feature, onnx_config_class_constructor) - @parameterized.expand(_get_models_to_test(TENSORFLOW_EXPORT_SEQ2SEQ_WITH_PAST_MODELS)) + @parameterized.expand(_get_models_to_test(TENSORFLOW_EXPORT_SEQ2SEQ_WITH_PAST_MODELS), skip_on_empty=True) @slow @require_tf def test_tensorflow_export_seq2seq_with_past( From c748f0040a4648acbae6883c406531cf22bf861c Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 10 Mar 2022 18:12:23 +0100 Subject: [PATCH 7/8] Remove GPT-Neo from TF tests --- tests/onnx/test_onnx_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/onnx/test_onnx_v2.py b/tests/onnx/test_onnx_v2.py index 103bec4881d6..bf5b0f0a7954 100644 --- a/tests/onnx/test_onnx_v2.py +++ b/tests/onnx/test_onnx_v2.py @@ -204,9 +204,9 @@ def test_values_override(self): ("roberta", "roberta-base"), } +# TODO(lewtun): Include the same model types in `PYTORCH_EXPORT_WITH_PAST_MODELS` once TensorFlow has parity with the PyTorch model implementations. TENSORFLOW_EXPORT_WITH_PAST_MODELS = { ("gpt2", "gpt2"), - ("gpt-neo", "EleutherAI/gpt-neo-125M"), } # TODO(lewtun): Include the same model types in `PYTORCH_EXPORT_SEQ2SEQ_WITH_PAST_MODELS` once TensorFlow has parity with the PyTorch model implementations. 
From 6c28992c2141faddcbdbb1ee9ff723b51710e339 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 10 Mar 2022 18:26:49 +0100 Subject: [PATCH 8/8] Remove GPT-2 from TF ONNX tests --- tests/onnx/test_onnx_v2.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/onnx/test_onnx_v2.py b/tests/onnx/test_onnx_v2.py index bf5b0f0a7954..26ef4370e272 100644 --- a/tests/onnx/test_onnx_v2.py +++ b/tests/onnx/test_onnx_v2.py @@ -205,9 +205,7 @@ def test_values_override(self): } # TODO(lewtun): Include the same model types in `PYTORCH_EXPORT_WITH_PAST_MODELS` once TensorFlow has parity with the PyTorch model implementations. -TENSORFLOW_EXPORT_WITH_PAST_MODELS = { - ("gpt2", "gpt2"), -} +TENSORFLOW_EXPORT_WITH_PAST_MODELS = {} # TODO(lewtun): Include the same model types in `PYTORCH_EXPORT_SEQ2SEQ_WITH_PAST_MODELS` once TensorFlow has parity with the PyTorch model implementations. TENSORFLOW_EXPORT_SEQ2SEQ_WITH_PAST_MODELS = {} @@ -305,7 +303,7 @@ def test_pytorch_export_seq2seq_with_past( def test_tensorflow_export(self, test_name, name, model_name, feature, onnx_config_class_constructor): self._onnx_export(test_name, name, model_name, feature, onnx_config_class_constructor) - @parameterized.expand(_get_models_to_test(TENSORFLOW_EXPORT_WITH_PAST_MODELS)) + @parameterized.expand(_get_models_to_test(TENSORFLOW_EXPORT_WITH_PAST_MODELS), skip_on_empty=True) @slow @require_tf def test_tensorflow_export_with_past(self, test_name, name, model_name, feature, onnx_config_class_constructor):