From e77d866a87f428fb1f4816c60ebffb5cf14064e9 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 8 Dec 2020 16:26:11 -0500
Subject: [PATCH] Fix remaining tests

---
 .../models/tapas/modeling_tapas.py            |  58 +++---
 .../models/tapas/tokenization_tapas.py        |  33 ++--
 tests/test_modeling_tapas.py                  |  79 ++++++--
 tests/test_tokenization_common.py             |   4 +-
 tests/test_tokenization_tapas.py              | 175 +++++++++++++++++-
 5 files changed, 276 insertions(+), 73 deletions(-)

diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py
index 6a4167b8a59d..147eea6d4c6d 100644
--- a/src/transformers/models/tapas/modeling_tapas.py
+++ b/src/transformers/models/tapas/modeling_tapas.py
@@ -70,7 +70,7 @@ class TableQuestionAnsweringOutput(ModelOutput):
     Output type of :class:`~transformers.TapasForQuestionAnswering`.
 
     Args:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label_ids` (and possibly :obj:`answer`, :obj:`aggregation_labels`, :obj:`numeric_values` and :obj:`numeric_values_scale` are provided)):
+        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` (and possibly :obj:`float_answer`, :obj:`aggregation_labels`, :obj:`numeric_values` and :obj:`numeric_values_scale`) are provided):
             Total loss as the sum of the hierarchical cell selection log-likelihood loss and (optionally) the
             semi-supervised regression loss and (optionally) supervised loss for aggregations.
         logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
@@ -1018,7 +1018,7 @@ def forward(self, features, **kwargs):
     TAPAS_START_DOCSTRING,
 )
 class TapasForQuestionAnswering(TapasPreTrainedModel):
-    def __init__(self, config):
+    def __init__(self, config: TapasConfig):
         super().__init__(config)
 
         # base model
@@ -1036,11 +1036,11 @@ def __init__(self, config):
         else:
             self.output_weights = nn.Parameter(torch.empty(config.hidden_size))
             nn.init.normal_(
-                self.output_weights, std=0.02
+                self.output_weights, std=config.initializer_range
             )  # here, a truncated normal is used in the original implementation
             self.column_output_weights = nn.Parameter(torch.empty(config.hidden_size))
             nn.init.normal_(
-                self.column_output_weights, std=0.02
+                self.column_output_weights, std=config.initializer_range
             )  # here, a truncated normal is used in the original implementation
         self.output_bias = nn.Parameter(torch.zeros([]))
         self.column_output_bias = nn.Parameter(torch.zeros([]))
@@ -1062,7 +1062,7 @@ def forward(
         head_mask=None,
         inputs_embeds=None,
         table_mask=None,
-        label_ids=None,
+        labels=None,
         aggregation_labels=None,
         float_answer=None,
         numeric_values=None,
@@ -1075,7 +1075,7 @@ def forward(
         table_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, seq_length)`, `optional`):
             Mask for the table. Indicates which tokens belong to the table (1). Question tokens, table headers and
             padding are 0.
-        label_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, seq_length)`, `optional`):
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, seq_length)`, `optional`):
             Labels per token for computing the hierarchical cell selection loss. This encodes the positions of the
             answer appearing in the table. Can be obtained using :class:`~transformers.TapasTokenizer`. 
 
@@ -1156,7 +1156,7 @@ def forward(
             "segment_ids",
             "column_ids",
             "row_ids",
-            "prev_label_ids",
+            "prev_labels",
             "column_ranks",
             "inv_column_ranks",
             "numeric_relations",
@@ -1214,7 +1214,7 @@ def forward(
         # Total loss calculation
         total_loss = 0.0
         calculate_loss = False
-        if label_ids is not None:
+        if labels is not None:
             calculate_loss = True
             is_supervised = not self.config.num_aggregation_labels > 0 or not self.config.use_answer_as_supervision
 
@@ -1226,18 +1226,18 @@ def forward(
             # some ambiguous cases, see utils._calculate_aggregate_mask for more info.
             # `aggregate_mask` is 1 for examples where we chose to aggregate and 0
             #  for examples where we chose to select the answer directly.
-            # `label_ids` encodes the positions of the answer appearing in the table.
+            # `labels` encodes the positions of the answer appearing in the table.
             if is_supervised:
                 aggregate_mask = None
             else:
                 if float_answer is not None:
-                    assert label_ids.shape[0] == float_answer.shape[0], "Make sure the answers are a FloatTensor of shape (batch_size,)"
+                    assert labels.shape[0] == float_answer.shape[0], "Make sure the answers are a FloatTensor of shape (batch_size,)"
                     # <float32>[batch_size]
                     aggregate_mask = _calculate_aggregate_mask(
                         float_answer,
                         pooled_output,
                         self.config.cell_selection_preference,
-                        label_ids,
+                        labels,
                         self.aggregation_classifier,
                     )
                 else:
@@ -1255,17 +1255,17 @@ def forward(
             selection_loss_per_example = None
             if not self.config.select_one_column:
                 weight = torch.where(
-                    label_ids == 0,
-                    torch.ones_like(label_ids, dtype=torch.float32),
-                    self.config.positive_label_weight * torch.ones_like(label_ids, dtype=torch.float32),
+                    labels == 0,
+                    torch.ones_like(labels, dtype=torch.float32),
+                    self.config.positive_label_weight * torch.ones_like(labels, dtype=torch.float32),
                 )
-                selection_loss_per_token = -dist_per_token.log_prob(label_ids) * weight
+                selection_loss_per_token = -dist_per_token.log_prob(labels) * weight
                 selection_loss_per_example = torch.sum(selection_loss_per_token * input_mask_float, dim=1) / (
                     torch.sum(input_mask_float, dim=1) + EPSILON_ZERO_DIVISION
                 )
             else:
                 selection_loss_per_example, logits = _single_column_cell_selection_loss(
-                    logits, column_logits, label_ids, cell_index, col_index, cell_mask
+                    logits, column_logits, labels, cell_index, col_index, cell_mask
                 )
                 dist_per_token = torch.distributions.Bernoulli(logits=logits)
 
@@ -1285,7 +1285,7 @@ def forward(
                 if is_supervised:
                     # Note that `aggregate_mask` is None if the setting is supervised.
                     if aggregation_labels is not None:
-                        assert label_ids.shape[0] == aggregation_labels.shape[0], "Make sure the aggregation labels are a LongTensor of shape (batch_size,)"
+                        assert labels.shape[0] == aggregation_labels.shape[0], "Make sure the aggregation labels are a LongTensor of shape (batch_size,)"
                         per_example_additional_loss = _calculate_aggregation_loss(
                             logits_aggregation, aggregate_mask, aggregation_labels, 
                             self.config.use_answer_as_supervision, self.config.num_aggregation_labels,
@@ -1297,7 +1297,7 @@ def forward(
                         )
                 else:
                     # Set aggregation labels to zeros
-                    aggregation_labels = torch.zeros(label_ids.shape[0], dtype=torch.long, device=label_ids.device)
+                    aggregation_labels = torch.zeros(labels.shape[0], dtype=torch.long, device=labels.device)
                     per_example_additional_loss = _calculate_aggregation_loss(
                         logits_aggregation, aggregate_mask, aggregation_labels, 
                         self.config.use_answer_as_supervision, self.config.num_aggregation_labels,
@@ -1330,16 +1330,16 @@ def forward(
 
         else:
             # if no label ids are provided, set them to zeros in order to properly compute logits
-            label_ids = torch.zeros_like(logits)
+            labels = torch.zeros_like(logits)
             _, logits = _single_column_cell_selection_loss(
-                logits, column_logits, label_ids, cell_index, col_index, cell_mask
+                logits, column_logits, labels, cell_index, col_index, cell_mask
             )
         if not return_dict:
             output = (logits, logits_aggregation) + outputs[2:]
             return ((total_loss,) + output) if calculate_loss else output
 
         return TableQuestionAnsweringOutput(
-            loss=total_loss,
+            loss=total_loss if calculate_loss else None,
             logits=logits,
             logits_aggregation=logits_aggregation,
             hidden_states=outputs.hidden_states,
@@ -1854,7 +1854,7 @@ def compute_column_logits(
     return column_logits
 
 
-def _single_column_cell_selection_loss(token_logits, column_logits, label_ids, cell_index, col_index, cell_mask):
+def _single_column_cell_selection_loss(token_logits, column_logits, labels, cell_index, col_index, cell_mask):
     """
     Computes the loss for cell selection constrained to a single column. The loss is a hierarchical log-likelihood. The
     model first predicts a column and then selects cells within that column (conditioned on the column). Cells outside
@@ -1865,7 +1865,7 @@ def _single_column_cell_selection_loss(token_logits, column_logits, label_ids, c
             Tensor containing the logits per token.
         column_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, max_num_cols)`):
             Tensor containing the logits per column.
-        label_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
             Labels per token.
         cell_index (:obj:`ProductIndexMap`):
             Index that groups tokens into cells.
@@ -1885,7 +1885,7 @@ def _single_column_cell_selection_loss(token_logits, column_logits, label_ids, c
     # First find the column we should select. We use the column with maximum
     # number of selected cells.
     labels_per_column, _ = reduce_sum(
-        torch.as_tensor(label_ids, dtype=torch.float32, device=label_ids.device), col_index
+        torch.as_tensor(labels, dtype=torch.float32, device=labels.device), col_index
     )
     # shape of labels_per_column is (batch_size, max_num_cols). It contains the number of label ids for every column, for every example
     column_label = torch.argmax(labels_per_column, dim=-1)  # shape (batch_size,)
@@ -1894,7 +1894,7 @@ def _single_column_cell_selection_loss(token_logits, column_logits, label_ids, c
     no_cell_selected = torch.eq(
         torch.max(labels_per_column, dim=-1)[0], 0
     )  # no_cell_selected is of shape (batch_size,) and equals True
-    # if an example of the batch has no cells selected (i.e. if there are no label_ids set to 1 for that example)
+    # if an example of the batch has no cells selected (i.e. if there are no labels set to 1 for that example)
     column_label = torch.where(
         no_cell_selected.view(column_label.size()), torch.zeros_like(column_label), column_label
     )
@@ -1909,7 +1909,7 @@ def _single_column_cell_selection_loss(token_logits, column_logits, label_ids, c
     logits_per_cell, _ = reduce_mean(token_logits, cell_index)
     # labels_per_cell: shape (batch_size, 64*32), indicating whether each cell should be selected (1) or not (0)
     labels_per_cell, labels_index = reduce_max(
-        torch.as_tensor(label_ids, dtype=torch.long, device=label_ids.device), cell_index
+        torch.as_tensor(labels, dtype=torch.long, device=labels.device), cell_index
     )
 
     # Mask for the selected column.
@@ -1986,7 +1986,7 @@ def compute_token_logits(sequence_output, temperature, output_weights, output_bi
     return logits
 
 
-def _calculate_aggregate_mask(answer, pooled_output, cell_selection_preference, label_ids, aggregation_classifier):
+def _calculate_aggregate_mask(answer, pooled_output, cell_selection_preference, labels, aggregation_classifier):
     """
     Finds examples where the model should select cells with no aggregation.
 
@@ -2004,7 +2004,7 @@ def _calculate_aggregate_mask(answer, pooled_output, cell_selection_preference,
             Output of the pooler (BertPooler) on top of the encoder layer.
         cell_selection_preference (:obj:`float`):
             Preference for cell selection in ambiguous cases.
-        label_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
             Labels per token. aggregation_classifier (:obj:`torch.nn.Linear`): Aggregation head
 
     Returns:
@@ -2022,7 +2022,7 @@ def _calculate_aggregate_mask(answer, pooled_output, cell_selection_preference,
     is_pred_cell_selection = aggregation_ops_total_mass <= cell_selection_preference
 
     # Examples with non-empty cell selection supervision.
-    is_cell_supervision_available = torch.sum(label_ids, dim=1) > 0
+    is_cell_supervision_available = torch.sum(labels, dim=1) > 0
 
     # torch.where is not equivalent to tf.where (in tensorflow 1)
     # hence the added .view on the condition to match the shape of the first tensor
diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py
index f4b8cc8e858e..a445892a3587 100644
--- a/src/transformers/models/tapas/tokenization_tapas.py
+++ b/src/transformers/models/tapas/tokenization_tapas.py
@@ -64,12 +64,6 @@
 }
 
 
-PRETRAINED_INIT_CONFIGURATION = {
-    "nielsr/tapas-base-finetuned-sqa": {"do_lower_case": True},
-    "nielsr/tapas-base-finetuned-wtq": {"do_lower_case": True},
-    "nielsr/tapas-base-finetuned-wikisql-supervised": {"do_lower_case": True},
-}
-
 
 class TapasTruncationStrategy(ExplicitEnum):
     """
@@ -178,7 +172,7 @@ class TapasTokenizer(PreTrainedTokenizer):
     Users should refer to this superclass for more information regarding those methods.
     :class:`~transformers.TapasTokenizer` creates several token type ids to encode tabular structure. To be more
     precise, it adds 7 token type ids, in the following order: :obj:`segment_ids`, :obj:`column_ids`, :obj:`row_ids`,
-    :obj:`prev_label_ids`, :obj:`column_ranks`, :obj:`inv_column_ranks` and :obj:`numeric_relations`:
+    :obj:`prev_labels`, :obj:`column_ranks`, :obj:`inv_column_ranks` and :obj:`numeric_relations`:
 
     - segment_ids: indicate whether a token belongs to the question (0) or the table (1). 0 for special tokens and
       padding.
@@ -186,7 +180,7 @@ class TapasTokenizer(PreTrainedTokenizer):
       tokens, special tokens and padding.
     - row_ids: indicate to which row of the table a token belongs (starting from 1). Is 0 for all question tokens,
       special tokens and padding. Tokens of column headers are also 0.
-    - prev_label_ids: indicate whether a token was (part of) an answer to the previous question (1) or not (0). Useful
+    - prev_labels: indicate whether a token was (part of) an answer to the previous question (1) or not (0). Useful
       in a conversational setup (such as SQA).
     - column_ranks: indicate the rank of a table token relative to a column, if applicable. For example, if you have a
       column "number of movies" with values 87, 53 and 69, then the column ranks of these tokens are 3, 1 and 2 respectively. 
@@ -252,7 +246,6 @@ class TapasTokenizer(PreTrainedTokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
 
     def __init__(
         self,
@@ -1153,10 +1146,10 @@ def prepare_for_model(
         column_ids = self.create_column_token_type_ids_from_sequences(query_ids, table_data)
         row_ids = self.create_row_token_type_ids_from_sequences(query_ids, table_data)
         if not is_part_of_batch or (prev_answer_coordinates is None and prev_answer_text is None):
-            # simply set the prev_label_ids to zeros
-            prev_label_ids = [0] * len(row_ids)
+            # simply set the prev_labels to zeros
+            prev_labels = [0] * len(row_ids)
         else:
-            prev_label_ids = self.get_answer_ids(
+            prev_labels = self.get_answer_ids(
                 column_ids, row_ids, table_data, prev_answer_text, prev_answer_coordinates
             )
 
@@ -1185,13 +1178,13 @@ def prepare_for_model(
             encoded_inputs["attention_mask"] = attention_mask
 
         if answer_coordinates is not None and answer_text is not None:
-            label_ids = self.get_answer_ids(
+            labels = self.get_answer_ids(
                 column_ids, row_ids, table_data, answer_text, answer_coordinates
             )
             numeric_values = self._get_numeric_values(raw_table, column_ids, row_ids)
             numeric_values_scale = self._get_numeric_values_scale(raw_table, column_ids, row_ids)
 
-            encoded_inputs["label_ids"] = label_ids
+            encoded_inputs["labels"] = labels
             encoded_inputs["numeric_values"] = numeric_values
             encoded_inputs["numeric_values_scale"] = numeric_values_scale
 
@@ -1200,7 +1193,7 @@ def prepare_for_model(
                 segment_ids,
                 column_ids,
                 row_ids,
-                prev_label_ids,
+                prev_labels,
                 column_ranks,
                 inv_column_ranks,
                 numeric_relations,
@@ -1829,8 +1822,8 @@ def _pad(
                     encoded_inputs["token_type_ids"] = (
                         encoded_inputs["token_type_ids"] + [[self.pad_token_type_id] * 7] * difference
                     )
-                if "label_ids" in encoded_inputs:
-                    encoded_inputs["label_ids"] = encoded_inputs["label_ids"] + [0] * difference
+                if "labels" in encoded_inputs:
+                    encoded_inputs["labels"] = encoded_inputs["labels"] + [0] * difference
                 if "numeric_values" in encoded_inputs:
                     encoded_inputs["numeric_values"] = encoded_inputs["numeric_values"] + [float("nan")] * difference
                 if "numeric_values_scale" in encoded_inputs:
@@ -1845,8 +1838,8 @@ def _pad(
                     encoded_inputs["token_type_ids"] = [[self.pad_token_type_id] * 7] * difference + encoded_inputs[
                         "token_type_ids"
                     ]
-                if "label_ids" in encoded_inputs:
-                    encoded_inputs["label_ids"] = [0] * difference + encoded_inputs["label_ids"] 
+                if "labels" in encoded_inputs:
+                    encoded_inputs["labels"] = [0] * difference + encoded_inputs["labels"]
                 if "numeric_values" in encoded_inputs:
                     encoded_inputs["numeric_values"] = [float("nan")] * difference + encoded_inputs["numeric_values"] 
                 if "numeric_values_scale" in encoded_inputs:
@@ -1918,7 +1911,7 @@ def convert_logits_to_predictions(
             "segment_ids",
             "column_ids",
             "row_ids",
-            "prev_label_ids",
+            "prev_labels",
             "column_ranks",
             "inv_column_ranks",
             "numeric_relations",
diff --git a/tests/test_modeling_tapas.py b/tests/test_modeling_tapas.py
index 5e35c28e274b..160b33f22516 100644
--- a/tests/test_modeling_tapas.py
+++ b/tests/test_modeling_tapas.py
@@ -21,7 +21,10 @@
 import numpy as np
 import pandas as pd
 
-from transformers import is_torch_available
+from transformers import is_torch_available, MODEL_FOR_MULTIPLE_CHOICE_MAPPING, MODEL_FOR_QUESTION_ANSWERING_MAPPING, \
+    MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, \
+    MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, MODEL_FOR_CAUSAL_LM_MAPPING, MODEL_FOR_MASKED_LM_MAPPING, \
+    MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
 from transformers.file_utils import cached_property
 from transformers.testing_utils import require_torch, require_scatter, slow, torch_device
 
@@ -29,6 +32,7 @@
 from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
 
 
+
 if is_torch_available():
     import torch
 
@@ -162,7 +166,7 @@ def prepare_config_and_inputs(self):
 
         sequence_labels = None
         token_labels = None
-        label_ids = None
+        labels = None
         answer = None
         numeric_values = None
         numeric_values_scale = None
@@ -171,7 +175,7 @@ def prepare_config_and_inputs(self):
         if self.use_labels:
             sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size).to(torch_device)
             token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels).to(torch_device)
-            label_ids = ids_tensor([self.batch_size, self.seq_length], vocab_size=2).to(torch_device)
+            labels = ids_tensor([self.batch_size, self.seq_length], vocab_size=2).to(torch_device)
             numeric_values = floats_tensor([self.batch_size, self.seq_length]).to(torch_device)
             numeric_values_scale = floats_tensor([self.batch_size, self.seq_length]).to(torch_device)
             float_answer = floats_tensor([self.batch_size]).to(torch_device)
@@ -221,7 +225,7 @@ def prepare_config_and_inputs(self):
             token_type_ids,
             sequence_labels,
             token_labels,
-            label_ids,
+            labels,
             numeric_values,
             numeric_values_scale,
             float_answer,
@@ -236,7 +240,7 @@ def create_and_check_model(
         token_type_ids,
         sequence_labels,
         token_labels,
-        label_ids,
+        labels,
         numeric_values,
         numeric_values_scale,
         float_answer,
@@ -259,7 +263,7 @@ def create_and_check_for_masked_lm(
         token_type_ids,
         sequence_labels,
         token_labels,
-        label_ids,
+        labels,
         numeric_values,
         numeric_values_scale,
         float_answer,
@@ -279,7 +283,7 @@ def create_and_check_for_question_answering(
         token_type_ids,
         sequence_labels,
         token_labels,
-        label_ids,
+        labels,
         numeric_values,
         numeric_values_scale,
         float_answer,
@@ -320,7 +324,7 @@ def create_and_check_for_question_answering(
             input_ids,
             attention_mask=input_mask,
             token_type_ids=token_type_ids,
-            label_ids=label_ids,
+            labels=labels,
         )
         self.parent.assertEqual(result.loss.shape, ())
         self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length))
@@ -333,7 +337,7 @@ def create_and_check_for_question_answering(
             input_ids=input_ids,
             attention_mask=input_mask,
             token_type_ids=token_type_ids,
-            label_ids=label_ids,
+            labels=labels,
             numeric_values=numeric_values,
             numeric_values_scale=numeric_values_scale,
             float_answer=float_answer,
@@ -352,7 +356,7 @@ def create_and_check_for_question_answering(
             input_ids,
             attention_mask=input_mask,
             token_type_ids=token_type_ids,
-            label_ids=label_ids,
+            labels=labels,
             aggregation_labels=aggregation_labels,
         )
         self.parent.assertEqual(result.loss.shape, ())
@@ -367,7 +371,7 @@ def create_and_check_for_sequence_classification(
         token_type_ids,
         sequence_labels,
         token_labels,
-        label_ids,
+        labels,
         numeric_values,
         numeric_values_scale,
         float_answer,
@@ -389,7 +393,7 @@ def prepare_config_and_inputs_for_common(self):
             token_type_ids,
             sequence_labels,
             token_labels,
-            label_ids,
+            labels,
             numeric_values,
             numeric_values_scale,
             float_answer,
@@ -418,6 +422,53 @@ class TapasModelTest(ModelTesterMixin, unittest.TestCase):
     test_resize_embeddings = True
     test_head_masking = False
 
+    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
+        inputs_dict = copy.deepcopy(inputs_dict)
+        if model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values():
+            inputs_dict = {
+                k: v.unsqueeze(1).expand(-1, self.model_tester.num_choices, -1).contiguous()
+                if isinstance(v, torch.Tensor) and v.ndim > 1
+                else v
+                for k, v in inputs_dict.items()
+            }
+
+        if return_labels:
+            if model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values():
+                inputs_dict["labels"] = torch.ones(self.model_tester.batch_size, dtype=torch.long, device=torch_device)
+            elif model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values():
+                inputs_dict["labels"] = torch.zeros(
+                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
+                )
+                inputs_dict["aggregation_labels"] = torch.zeros(
+                    self.model_tester.batch_size, dtype=torch.long, device=torch_device
+                )
+                inputs_dict["numeric_values"] = torch.zeros(
+                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.float, device=torch_device
+                )
+                inputs_dict["numeric_values_scale"] = torch.zeros(
+                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.float, device=torch_device
+                )
+                inputs_dict["float_answer"] = torch.zeros(
+                    self.model_tester.batch_size, dtype=torch.float, device=torch_device
+                )
+            elif model_class in [
+                *MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.values(),
+                *MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING.values(),
+            ]:
+                inputs_dict["labels"] = torch.zeros(
+                    self.model_tester.batch_size, dtype=torch.long, device=torch_device
+                )
+            elif model_class in [
+                *MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.values(),
+                *MODEL_FOR_CAUSAL_LM_MAPPING.values(),
+                *MODEL_FOR_MASKED_LM_MAPPING.values(),
+                *MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.values(),
+            ]:
+                inputs_dict["labels"] = torch.zeros(
+                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
+                )
+        return inputs_dict
+
     def setUp(self):
         self.model_tester = TapasModelTester(self)
         self.config_tester = ConfigTester(self, config_class=TapasConfig, dim=37)
@@ -612,7 +663,7 @@ def test_training_question_answering_head_weak_supervision(self):
         input_ids = inputs["input_ids"].to(torch_device)
         attention_mask = inputs["attention_mask"].to(torch_device)
         token_type_ids = inputs["token_type_ids"].to(torch_device)
-        label_ids = inputs["label_ids"].to(torch_device)
+        labels = inputs["labels"].to(torch_device)
         numeric_values = inputs["numeric_values"].to(torch_device)
         numeric_values_scale = inputs["numeric_values_scale"].to(torch_device)
 
@@ -620,7 +671,7 @@ def test_training_question_answering_head_weak_supervision(self):
         float_answer = torch.FloatTensor(float_answer).to(torch_device)
 
         # forward pass to get loss + logits:
-        outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label_ids=label_ids,
+        outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels,
                         numeric_values=numeric_values, numeric_values_scale=numeric_values_scale, 
                         float_answer=float_answer)
 
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index 0095b9e2436c..dd4ae1a72981 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -584,7 +584,7 @@ def test_token_type_ids(self):
 
                 # We want to have sequence 0 and sequence 1 are tagged
                 # respectively with 0 and 1 token_ids
-                # (regardeless of weither the model use token type ids)
+                # (regardless of whether the model uses token type ids)
                 # We use this assumption in the QA pipeline among other place
                 output = tokenizer(seq_0, return_token_type_ids=True)
                 self.assertIn(0, output["token_type_ids"])
@@ -600,7 +600,7 @@ def test_sequence_ids(self):
 
                 # We want to have sequence 0 and sequence 1 are tagged
                 # respectively with 0 and 1 token_ids
-                # (regardeless of weither the model use token type ids)
+                # (regardless of whether the model uses token type ids)
                 # We use this assumption in the QA pipeline among other place
                 output = tokenizer(seq_0)
                 self.assertIn(0, output.sequence_ids())
diff --git a/tests/test_tokenization_tapas.py b/tests/test_tokenization_tapas.py
index 882ee04b49d5..6f30c78f85d2 100644
--- a/tests/test_tokenization_tapas.py
+++ b/tests/test_tokenization_tapas.py
@@ -23,7 +23,7 @@
 import pandas as pd
 
 from transformers import AddedToken
-from transformers.testing_utils import require_tokenizers, slow
+from transformers.testing_utils import require_tokenizers, slow, require_torch, is_pt_tf_cross_test
 from transformers.models.tapas.tokenization_tapas import (
     VOCAB_FILES_NAMES,
     BasicTokenizer,
@@ -34,7 +34,7 @@
     _is_whitespace,
 )
 
-from .test_tokenization_common import TokenizerTesterMixin, filter_non_english
+from .test_tokenization_common import TokenizerTesterMixin, filter_non_english, merge_model_tokenizer_mappings
 
 
 @require_tokenizers
@@ -294,16 +294,17 @@ def test_clean_text(self):
 
     @slow
     def test_sequence_builders(self):
-        tokenizer = self.tokenizer_class.from_pretrained("tapas-base-uncased")
+        tokenizer = self.tokenizer_class.from_pretrained("nielsr/tapas-base-finetuned-wtq")
 
-        text = tokenizer.encode("sequence builders", add_special_tokens=False)
-        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
+        empty_table = self.get_table(tokenizer, length=0)
+        table = self.get_table(tokenizer, length=10)
+
+        text = tokenizer.encode(table, add_special_tokens=False)
+        text_2 = tokenizer.encode(empty_table, "multi-sequence build", add_special_tokens=False)
 
-        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
 
-        assert encoded_sentence == [101] + text + [102]
-        assert encoded_pair == [101] + text + [102] + text_2 + [102]
+        assert encoded_pair == [101] + text + [102] + text_2
 
     def test_offsets_with_special_characters(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
@@ -998,6 +999,118 @@ def test_right_and_left_padding(self):
                 assert sequence_length == padded_sequence_left_length
                 assert encoded_sequence == padded_sequence_left
 
+    def test_token_type_ids(self):
+        # Check the token type IDs returned when encoding an empty table together with one query.
+        tokenizers = self.get_tokenizers()
+        for tokenizer in tokenizers:
+            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                empty_table = self.get_table(tokenizer, length=0)
+                seq_0 = "Test this method."
+
+                # We want sequence 0 and sequence 1 to be tagged with token type IDs 0 and 1 respectively
+                # (regardless of whether the model uses token type IDs).
+                # We use this assumption in the QA pipeline, among other places.
+                output = tokenizer(empty_table, seq_0, return_token_type_ids=True)
+
+                # Assert that the token type IDs have the same length as the input IDs
+                self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"]))
+
+                # Assert that every token type ID entry holds exactly 7 values
+                self.assertTrue(all(len(token_type_ids) == 7 for token_type_ids in output["token_type_ids"]))
+
+                # Same check as the common tests: the first entry must contain a 0.
+                self.assertIn(0, output["token_type_ids"][0])
+
+    # TODO: Check if require_torch is the best guard for a NumPy test ... Maybe move to require_flax when available
+    @require_torch
+    @slow
+    def test_np_encode_plus_sent_to_model(self):
+        # Encode an empty table plus a query with return_tensors="np" (single and batched); no model is run yet.
+        from transformers import MODEL_MAPPING, TOKENIZER_MAPPING
+
+        MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING)
+
+        tokenizer = self.get_tokenizer()
+        if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
+            return
+
+        config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
+        config = config_class()
+
+        if config.is_encoder_decoder or config.pad_token_id is None:
+            return
+
+        # Build a query from the first ten vocabulary tokens and encode it with an empty table
+        first_ten_tokens = list(tokenizer.get_vocab().keys())[:10]
+        table = self.get_table(tokenizer, length=0)
+        sequence = " ".join(first_ten_tokens)
+        encoded_sequence = tokenizer.encode_plus(table, sequence, return_tensors="np")
+        batch_encoded_sequence = tokenizer.batch_encode_plus(table, [sequence, sequence], return_tensors="np")
+
+        # TODO: add forward through JAX/Flax when PR is merged; until then this is only here to make flake8 happy
+        if encoded_sequence is None:
+            raise ValueError("Cannot convert list to numpy tensor on  encode_plus()")
+
+        if batch_encoded_sequence is None:
+            raise ValueError("Cannot convert list to numpy tensor on  batch_encode_plus()")
+
+        # Run the same encodings through the fast tokenizer when it is under test
+        if self.test_rust_tokenizer:
+            fast_tokenizer = self.get_rust_tokenizer()
+            encoded_sequence_fast = fast_tokenizer.encode_plus(table, sequence, return_tensors="np")
+            batch_encoded_sequence_fast = fast_tokenizer.batch_encode_plus(table, [sequence, sequence], return_tensors="np")
+
+            # TODO: add forward through JAX/Flax when PR is merged; until then this is only here to make flake8 happy
+            if encoded_sequence_fast is None:
+                raise ValueError("Cannot convert list to numpy tensor on  encode_plus() (fast)")
+
+            if batch_encoded_sequence_fast is None:
+                raise ValueError("Cannot convert list to numpy tensor on  batch_encode_plus() (fast)")
+
+    @require_torch
+    @slow
+    def test_torch_encode_plus_sent_to_model(self):
+        # Encode an empty table plus a query and feed the result to the PyTorch model mapped to the
+        # tokenizer: the forward pass must run for both the single and the batched encoding.
+        import torch
+
+        from transformers import MODEL_MAPPING, TOKENIZER_MAPPING
+
+        MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING)
+
+        tokenizers = self.get_tokenizers(do_lower_case=False)
+        for tokenizer in tokenizers:
+            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                # Skip only this tokenizer (not the whole test) when no model is mapped to it
+                if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
+                    continue
+
+                config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
+                config = config_class()
+
+                # Encoder-decoder models and models without a pad token are out of scope here
+                if config.is_encoder_decoder or config.pad_token_id is None:
+                    continue
+
+                model = model_class(config)
+
+                # Make sure the model contains at least the full vocabulary size in its embedding matrix
+                is_using_common_embeddings = hasattr(model.get_input_embeddings(), "weight")
+                if is_using_common_embeddings:
+                    self.assertGreaterEqual(model.get_input_embeddings().weight.shape[0], len(tokenizer))
+
+                # Build a query from the first ten vocabulary tokens and pair it with an empty table
+                first_ten_tokens = list(tokenizer.get_vocab().keys())[:10]
+                sequence = " ".join(first_ten_tokens)
+                table = self.get_table(tokenizer, length=0)
+                encoded_sequence = tokenizer.encode_plus(table, sequence, return_tensors="pt")
+                batch_encoded_sequence = tokenizer.batch_encode_plus(table, [sequence, sequence], return_tensors="pt")
+
+                # This should not fail
+                with torch.no_grad():  # saves some time
+                    model(**encoded_sequence)
+                    model(**batch_encoded_sequence)
+
     @unittest.skip("TAPAS doesn't handle pre-tokenized inputs.")
     def test_pretokenized_inputs(self):
         pass
@@ -1031,6 +1144,52 @@ def test_tapas_truncation_integration_test(self):
             # Ensure that the input IDs are less than the max length defined.
             self.assertLessEqual(len(new_encoded_inputs), i)
 
+    @is_pt_tf_cross_test
+    def test_batch_encode_plus_tensors(self):
+        # Ragged batches must fail to convert to tensors; padded ones must agree across pt, tf and plain lists.
+        tokenizers = self.get_tokenizers(do_lower_case=False)
+        for tokenizer in tokenizers:
+            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                sequences = [
+                    "Testing batch encode plus",
+                    "Testing batch encode plus with different sequence lengths",
+                    "Testing batch encode plus with different sequence lengths correctly pads",
+                ]
+                table = self.get_table(tokenizer, length=0)
+
+                # A tensor cannot be built from sequences that are not the same size
+                self.assertRaises(ValueError, tokenizer.batch_encode_plus, table, sequences, return_tensors="pt")
+                self.assertRaises(ValueError, tokenizer.batch_encode_plus, table, sequences, return_tensors="tf")
+
+                if tokenizer.pad_token_id is None:
+                    self.assertRaises(
+                        ValueError,
+                        tokenizer.batch_encode_plus,
+                        table,
+                        sequences,
+                        padding=True,
+                        return_tensors="pt",
+                    )
+                    self.assertRaises(
+                        ValueError,
+                        tokenizer.batch_encode_plus,
+                        table,
+                        sequences,
+                        padding="longest",
+                        return_tensors="tf",
+                    )
+                else:
+                    pytorch_tensor = tokenizer.batch_encode_plus(table, sequences, padding=True, return_tensors="pt")
+                    tensorflow_tensor = tokenizer.batch_encode_plus(table, sequences, padding="longest", return_tensors="tf")
+                    encoded_sequences = tokenizer.batch_encode_plus(table, sequences, padding=True)
+
+                    for key in encoded_sequences.keys():
+                        pytorch_value = pytorch_tensor[key].tolist()
+                        tensorflow_value = tensorflow_tensor[key].numpy().tolist()
+                        # assertEqual's third positional argument is `msg`, so each pair is compared separately
+                        self.assertEqual(pytorch_value, tensorflow_value)
+                        self.assertEqual(pytorch_value, encoded_sequences[key])
+
     # TODO SET TO SLOW
     def test_tapas_integration_test(self):
         data = {