From e77d866a87f428fb1f4816c60ebffb5cf14064e9 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Tue, 8 Dec 2020 16:26:11 -0500 Subject: [PATCH] Fix remaining tests --- .../models/tapas/modeling_tapas.py | 58 +++--- .../models/tapas/tokenization_tapas.py | 33 ++-- tests/test_modeling_tapas.py | 79 ++++++-- tests/test_tokenization_common.py | 4 +- tests/test_tokenization_tapas.py | 175 +++++++++++++++++- 5 files changed, 276 insertions(+), 73 deletions(-) diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py index 6a4167b8a59d..147eea6d4c6d 100644 --- a/src/transformers/models/tapas/modeling_tapas.py +++ b/src/transformers/models/tapas/modeling_tapas.py @@ -70,7 +70,7 @@ class TableQuestionAnsweringOutput(ModelOutput): Output type of :class:`~transformers.TapasForQuestionAnswering`. Args: - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label_ids` (and possibly :obj:`answer`, :obj:`aggregation_labels`, :obj:`numeric_values` and :obj:`numeric_values_scale` are provided)): + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` (and possibly :obj:`answer`, :obj:`aggregation_labels`, :obj:`numeric_values` and :obj:`numeric_values_scale` are provided)): Total loss as the sum of the hierarchical cell selection log-likelihood loss and (optionally) the semi-supervised regression loss and (optionally) supervised loss for aggregations. logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`): @@ -1018,7 +1018,7 @@ def forward(self, features, **kwargs): TAPAS_START_DOCSTRING, ) class TapasForQuestionAnswering(TapasPreTrainedModel): - def __init__(self, config): + def __init__(self, config: TapasConfig): super().__init__(config) # base model @@ -1036,11 +1036,11 @@ def __init__(self, config): else: self.output_weights = nn.Parameter(torch.empty(config.hidden_size)) nn.init.normal_( - self.output_weights, std=0.02 + self.output_weights, std=config.initializer_range ) # here, a truncated normal is used in the original implementation self.column_output_weights = nn.Parameter(torch.empty(config.hidden_size)) nn.init.normal_( - self.column_output_weights, std=0.02 + self.column_output_weights, std=config.initializer_range ) # here, a truncated normal is used in the original implementation self.output_bias = nn.Parameter(torch.zeros([])) self.column_output_bias = nn.Parameter(torch.zeros([])) @@ -1062,7 +1062,7 @@ def forward( head_mask=None, inputs_embeds=None, table_mask=None, - label_ids=None, + labels=None, aggregation_labels=None, float_answer=None, numeric_values=None, @@ -1075,7 +1075,7 @@ def forward( table_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, seq_length)`, `optional`): Mask for the table. Indicates which tokens belong to the table (1). Question tokens, table headers and padding are 0. - label_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, seq_length)`, `optional`): + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, seq_length)`, `optional`): Labels per token for computing the hierarchical cell selection loss. This encodes the positions of the answer appearing in the table. Can be obtained using :class:`~transformers.TapasTokenizer`. @@ -1156,7 +1156,7 @@ def forward( "segment_ids", "column_ids", "row_ids", - "prev_label_ids", + "prev_labels", "column_ranks", "inv_column_ranks", "numeric_relations", @@ -1214,7 +1214,7 @@ def forward( # Total loss calculation total_loss = 0.0 calculate_loss = False - if label_ids is not None: + if labels is not None: calculate_loss = True is_supervised = not self.config.num_aggregation_labels > 0 or not self.config.use_answer_as_supervision @@ -1226,18 +1226,18 @@ def forward( # some ambiguous cases, see utils._calculate_aggregate_mask for more info. # `aggregate_mask` is 1 for examples where we chose to aggregate and 0 # for examples where we chose to select the answer directly. - # `label_ids` encodes the positions of the answer appearing in the table. + # `labels` encodes the positions of the answer appearing in the table. if is_supervised: aggregate_mask = None else: if float_answer is not None: - assert label_ids.shape[0] == float_answer.shape[0], "Make sure the answers are a FloatTensor of shape (batch_size,)" + assert labels.shape[0] == float_answer.shape[0], "Make sure the answers are a FloatTensor of shape (batch_size,)" # [batch_size] aggregate_mask = _calculate_aggregate_mask( float_answer, pooled_output, self.config.cell_selection_preference, - label_ids, + labels, self.aggregation_classifier, ) else: @@ -1255,17 +1255,17 @@ def forward( selection_loss_per_example = None if not self.config.select_one_column: weight = torch.where( - label_ids == 0, - torch.ones_like(label_ids, dtype=torch.float32), - self.config.positive_label_weight * torch.ones_like(label_ids, dtype=torch.float32), + labels == 0, + torch.ones_like(labels, dtype=torch.float32), + self.config.positive_label_weight * torch.ones_like(labels, dtype=torch.float32), ) - selection_loss_per_token = -dist_per_token.log_prob(label_ids) * weight + selection_loss_per_token = -dist_per_token.log_prob(labels) * weight selection_loss_per_example = torch.sum(selection_loss_per_token * input_mask_float, dim=1) / ( torch.sum(input_mask_float, dim=1) + EPSILON_ZERO_DIVISION ) else: selection_loss_per_example, logits = _single_column_cell_selection_loss( - logits, column_logits, label_ids, cell_index, col_index, cell_mask + logits, column_logits, labels, cell_index, col_index, cell_mask ) dist_per_token = torch.distributions.Bernoulli(logits=logits) @@ -1285,7 +1285,7 @@ def forward( if is_supervised: # Note that `aggregate_mask` is None if the setting is supervised. if aggregation_labels is not None: - assert label_ids.shape[0] == aggregation_labels.shape[0], "Make sure the aggregation labels are a LongTensor of shape (batch_size,)" + assert labels.shape[0] == aggregation_labels.shape[0], "Make sure the aggregation labels are a LongTensor of shape (batch_size,)" per_example_additional_loss = _calculate_aggregation_loss( logits_aggregation, aggregate_mask, aggregation_labels, self.config.use_answer_as_supervision, self.config.num_aggregation_labels, @@ -1297,7 +1297,7 @@ def forward( ) else: # Set aggregation labels to zeros - aggregation_labels = torch.zeros(label_ids.shape[0], dtype=torch.long, device=label_ids.device) + aggregation_labels = torch.zeros(labels.shape[0], dtype=torch.long, device=labels.device) per_example_additional_loss = _calculate_aggregation_loss( logits_aggregation, aggregate_mask, aggregation_labels, self.config.use_answer_as_supervision, self.config.num_aggregation_labels, @@ -1330,16 +1330,16 @@ def forward( else: # if no label ids are provided, set them to zeros in order to properly compute logits - label_ids = torch.zeros_like(logits) + labels = torch.zeros_like(logits) _, logits = _single_column_cell_selection_loss( - logits, column_logits, label_ids, cell_index, col_index, cell_mask + logits, column_logits, labels, cell_index, col_index, cell_mask ) if not return_dict: output = (logits, logits_aggregation) + outputs[2:] return ((total_loss,) + output) if calculate_loss else output return TableQuestionAnsweringOutput( - loss=total_loss, + loss=total_loss if calculate_loss else None, logits=logits, logits_aggregation=logits_aggregation, hidden_states=outputs.hidden_states, @@ -1854,7 +1854,7 @@ def compute_column_logits( return column_logits -def _single_column_cell_selection_loss(token_logits, column_logits, label_ids, cell_index, col_index, cell_mask): +def _single_column_cell_selection_loss(token_logits, column_logits, labels, cell_index, col_index, cell_mask): """ Computes the loss for cell selection constrained to a single column. The loss is a hierarchical log-likelihood. The model first predicts a column and then selects cells within that column (conditioned on the column). Cells outside @@ -1865,7 +1865,7 @@ def _single_column_cell_selection_loss(token_logits, column_logits, label_ids, c Tensor containing the logits per token. column_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, max_num_cols)`): Tensor containing the logits per column. - label_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Labels per token. cell_index (:obj:`ProductIndexMap`): Index that groups tokens into cells. @@ -1885,7 +1885,7 @@ def _single_column_cell_selection_loss(token_logits, column_logits, label_ids, c # First find the column we should select. We use the column with maximum # number of selected cells. labels_per_column, _ = reduce_sum( - torch.as_tensor(label_ids, dtype=torch.float32, device=label_ids.device), col_index + torch.as_tensor(labels, dtype=torch.float32, device=labels.device), col_index ) # shape of labels_per_column is (batch_size, max_num_cols). It contains the number of label ids for every column, for every example column_label = torch.argmax(labels_per_column, dim=-1) # shape (batch_size,) @@ -1894,7 +1894,7 @@ def _single_column_cell_selection_loss(token_logits, column_logits, label_ids, c no_cell_selected = torch.eq( torch.max(labels_per_column, dim=-1)[0], 0 ) # no_cell_selected is of shape (batch_size,) and equals True - # if an example of the batch has no cells selected (i.e. if there are no label_ids set to 1 for that example) + # if an example of the batch has no cells selected (i.e. if there are no labels set to 1 for that example) column_label = torch.where( no_cell_selected.view(column_label.size()), torch.zeros_like(column_label), column_label ) @@ -1909,7 +1909,7 @@ def _single_column_cell_selection_loss(token_logits, column_logits, label_ids, c logits_per_cell, _ = reduce_mean(token_logits, cell_index) # labels_per_cell: shape (batch_size, 64*32), indicating whether each cell should be selected (1) or not (0) labels_per_cell, labels_index = reduce_max( - torch.as_tensor(label_ids, dtype=torch.long, device=label_ids.device), cell_index + torch.as_tensor(labels, dtype=torch.long, device=labels.device), cell_index ) # Mask for the selected column. @@ -1986,7 +1986,7 @@ def compute_token_logits(sequence_output, temperature, output_weights, output_bi return logits -def _calculate_aggregate_mask(answer, pooled_output, cell_selection_preference, label_ids, aggregation_classifier): +def _calculate_aggregate_mask(answer, pooled_output, cell_selection_preference, labels, aggregation_classifier): """ Finds examples where the model should select cells with no aggregation. @@ -2004,7 +2004,7 @@ def _calculate_aggregate_mask(answer, pooled_output, cell_selection_preference, Output of the pooler (BertPooler) on top of the encoder layer. cell_selection_preference (:obj:`float`): Preference for cell selection in ambiguous cases. - label_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Labels per token. aggregation_classifier (:obj:`torch.nn.Linear`): Aggregation head Returns: @@ -2022,7 +2022,7 @@ def _calculate_aggregate_mask(answer, pooled_output, cell_selection_preference, is_pred_cell_selection = aggregation_ops_total_mass <= cell_selection_preference # Examples with non-empty cell selection supervision. - is_cell_supervision_available = torch.sum(label_ids, dim=1) > 0 + is_cell_supervision_available = torch.sum(labels, dim=1) > 0 # torch.where is not equivalent to tf.where (in tensorflow 1) # hence the added .view on the condition to match the shape of the first tensor diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py index f4b8cc8e858e..a445892a3587 100644 --- a/src/transformers/models/tapas/tokenization_tapas.py +++ b/src/transformers/models/tapas/tokenization_tapas.py @@ -64,12 +64,6 @@ } -PRETRAINED_INIT_CONFIGURATION = { - "nielsr/tapas-base-finetuned-sqa": {"do_lower_case": True}, - "nielsr/tapas-base-finetuned-wtq": {"do_lower_case": True}, - "nielsr/tapas-base-finetuned-wikisql-supervised": {"do_lower_case": True}, -} - class TapasTruncationStrategy(ExplicitEnum): """ @@ -178,7 +172,7 @@ class TapasTokenizer(PreTrainedTokenizer): Users should refer to this superclass for more information regarding those methods. :class:`~transformers.TapasTokenizer` creates several token type ids to encode tabular structure. To be more precise, it adds 7 token type ids, in the following order: :obj:`segment_ids`, :obj:`column_ids`, :obj:`row_ids`, - :obj:`prev_label_ids`, :obj:`column_ranks`, :obj:`inv_column_ranks` and :obj:`numeric_relations`: + :obj:`prev_labels`, :obj:`column_ranks`, :obj:`inv_column_ranks` and :obj:`numeric_relations`: - segment_ids: indicate whether a token belongs to the question (0) or the table (1). 0 for special tokens and padding. @@ -186,7 +180,7 @@ class TapasTokenizer(PreTrainedTokenizer): tokens, special tokens and padding. - row_ids: indicate to which row of the table a token belongs (starting from 1). Is 0 for all question tokens, special tokens and padding. Tokens of column headers are also 0. - - prev_label_ids: indicate whether a token was (part of) an answer to the previous question (1) or not (0). Useful + - prev_labels: indicate whether a token was (part of) an answer to the previous question (1) or not (0). Useful in a conversational setup (such as SQA). - column_ranks: indicate the rank of a table token relative to a column, if applicable. For example, if you have a column "number of movies" with values 87, 53 and 69, then the column ranks of these tokens are 3, 1 and 2 respectively. @@ -252,7 +246,6 @@ class TapasTokenizer(PreTrainedTokenizer): vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION def __init__( self, @@ -1153,10 +1146,10 @@ def prepare_for_model( column_ids = self.create_column_token_type_ids_from_sequences(query_ids, table_data) row_ids = self.create_row_token_type_ids_from_sequences(query_ids, table_data) if not is_part_of_batch or (prev_answer_coordinates is None and prev_answer_text is None): - # simply set the prev_label_ids to zeros - prev_label_ids = [0] * len(row_ids) + # simply set the prev_labels to zeros + prev_labels = [0] * len(row_ids) else: - prev_label_ids = self.get_answer_ids( + prev_labels = self.get_answer_ids( column_ids, row_ids, table_data, prev_answer_text, prev_answer_coordinates ) @@ -1185,13 +1178,13 @@ def prepare_for_model( encoded_inputs["attention_mask"] = attention_mask if answer_coordinates is not None and answer_text is not None: - label_ids = self.get_answer_ids( + labels = self.get_answer_ids( column_ids, row_ids, table_data, answer_text, answer_coordinates ) numeric_values = self._get_numeric_values(raw_table, column_ids, row_ids) numeric_values_scale = self._get_numeric_values_scale(raw_table, column_ids, row_ids) - encoded_inputs["label_ids"] = label_ids + encoded_inputs["labels"] = labels encoded_inputs["numeric_values"] = numeric_values encoded_inputs["numeric_values_scale"] = numeric_values_scale @@ -1200,7 +1193,7 @@ def prepare_for_model( segment_ids, column_ids, row_ids, - prev_label_ids, + prev_labels, column_ranks, inv_column_ranks, numeric_relations, @@ -1829,8 +1822,8 @@ def _pad( encoded_inputs["token_type_ids"] = ( encoded_inputs["token_type_ids"] + [[self.pad_token_type_id] * 7] * difference ) - if "label_ids" in encoded_inputs: - encoded_inputs["label_ids"] = encoded_inputs["label_ids"] + [0] * difference + if "labels" in encoded_inputs: + encoded_inputs["labels"] = encoded_inputs["labels"] + [0] * difference if "numeric_values" in encoded_inputs: encoded_inputs["numeric_values"] = encoded_inputs["numeric_values"] + [float("nan")] * difference if "numeric_values_scale" in encoded_inputs: @@ -1845,8 +1838,8 @@ def _pad( encoded_inputs["token_type_ids"] = [[self.pad_token_type_id] * 7] * difference + encoded_inputs[ "token_type_ids" ] - if "label_ids" in encoded_inputs: - encoded_inputs["label_ids"] = [0] * difference + encoded_inputs["label_ids"] + if "labels" in encoded_inputs: + encoded_inputs["labels"] = [0] * difference + encoded_inputs["labels"] if "numeric_values" in encoded_inputs: encoded_inputs["numeric_values"] = [float("nan")] * difference + encoded_inputs["numeric_values"] if "numeric_values_scale" in encoded_inputs: @@ -1918,7 +1911,7 @@ def convert_logits_to_predictions( "segment_ids", "column_ids", "row_ids", - "prev_label_ids", + "prev_labels", "column_ranks", "inv_column_ranks", "numeric_relations", diff --git a/tests/test_modeling_tapas.py b/tests/test_modeling_tapas.py index 5e35c28e274b..160b33f22516 100644 --- a/tests/test_modeling_tapas.py +++ b/tests/test_modeling_tapas.py @@ -21,7 +21,10 @@ import numpy as np import pandas as pd -from transformers import is_torch_available +from transformers import is_torch_available, MODEL_FOR_MULTIPLE_CHOICE_MAPPING, MODEL_FOR_QUESTION_ANSWERING_MAPPING, \ + MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, \ + MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, MODEL_FOR_CAUSAL_LM_MAPPING, MODEL_FOR_MASKED_LM_MAPPING, \ + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING from transformers.file_utils import cached_property from transformers.testing_utils import require_torch, require_scatter, slow, torch_device @@ -29,6 +32,7 @@ from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + if is_torch_available(): import torch @@ -162,7 +166,7 @@ def prepare_config_and_inputs(self): sequence_labels = None token_labels = None - label_ids = None + labels = None answer = None numeric_values = None numeric_values_scale = None @@ -171,7 +175,7 @@ def prepare_config_and_inputs(self): if self.use_labels: sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size).to(torch_device) token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels).to(torch_device) - label_ids = ids_tensor([self.batch_size, self.seq_length], vocab_size=2).to(torch_device) + labels = ids_tensor([self.batch_size, self.seq_length], vocab_size=2).to(torch_device) numeric_values = floats_tensor([self.batch_size, self.seq_length]).to(torch_device) numeric_values_scale = floats_tensor([self.batch_size, self.seq_length]).to(torch_device) float_answer = floats_tensor([self.batch_size]).to(torch_device) @@ -221,7 +225,7 @@ def prepare_config_and_inputs(self): token_type_ids, sequence_labels, token_labels, - label_ids, + labels, numeric_values, numeric_values_scale, float_answer, @@ -236,7 +240,7 @@ def create_and_check_model( token_type_ids, sequence_labels, token_labels, - label_ids, + labels, numeric_values, numeric_values_scale, float_answer, @@ -259,7 +263,7 @@ def create_and_check_for_masked_lm( token_type_ids, sequence_labels, token_labels, - label_ids, + labels, numeric_values, numeric_values_scale, float_answer, @@ -279,7 +283,7 @@ def create_and_check_for_question_answering( token_type_ids, sequence_labels, token_labels, - label_ids, + labels, numeric_values, numeric_values_scale, float_answer, @@ -320,7 +324,7 @@ def create_and_check_for_question_answering( input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, - label_ids=label_ids, + labels=labels, ) self.parent.assertEqual(result.loss.shape, ()) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) @@ -333,7 +337,7 @@ def create_and_check_for_question_answering( input_ids=input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, - label_ids=label_ids, + labels=labels, numeric_values=numeric_values, numeric_values_scale=numeric_values_scale, float_answer=float_answer, @@ -352,7 +356,7 @@ def create_and_check_for_question_answering( input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, - label_ids=label_ids, + labels=labels, aggregation_labels=aggregation_labels, ) self.parent.assertEqual(result.loss.shape, ()) @@ -367,7 +371,7 @@ def create_and_check_for_sequence_classification( token_type_ids, sequence_labels, token_labels, - label_ids, + labels, numeric_values, numeric_values_scale, float_answer, @@ -389,7 +393,7 @@ def prepare_config_and_inputs_for_common(self): token_type_ids, sequence_labels, token_labels, - label_ids, + labels, numeric_values, numeric_values_scale, float_answer, @@ -418,6 +422,53 @@ class TapasModelTest(ModelTesterMixin, unittest.TestCase): test_resize_embeddings = True test_head_masking = False + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = copy.deepcopy(inputs_dict) + if model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values(): + inputs_dict = { + k: v.unsqueeze(1).expand(-1, self.model_tester.num_choices, -1).contiguous() + if isinstance(v, torch.Tensor) and v.ndim > 1 + else v + for k, v in inputs_dict.items() + } + + if return_labels: + if model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values(): + inputs_dict["labels"] = torch.ones(self.model_tester.batch_size, dtype=torch.long, device=torch_device) + elif model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values(): + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + inputs_dict["aggregation_labels"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + inputs_dict["numeric_values"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.float, device=torch_device + ) + inputs_dict["numeric_values_scale"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.float, device=torch_device + ) + inputs_dict["float_answer"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.float, device=torch_device + ) + elif model_class in [ + *MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.values(), + *MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING.values(), + ]: + inputs_dict["labels"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + elif model_class in [ + *MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.values(), + *MODEL_FOR_CAUSAL_LM_MAPPING.values(), + *MODEL_FOR_MASKED_LM_MAPPING.values(), + *MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.values(), + ]: + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + return inputs_dict + def setUp(self): self.model_tester = TapasModelTester(self) self.config_tester = ConfigTester(self, config_class=TapasConfig, dim=37) @@ -612,7 +663,7 @@ def test_training_question_answering_head_weak_supervision(self): input_ids = inputs["input_ids"].to(torch_device) attention_mask = inputs["attention_mask"].to(torch_device) token_type_ids = inputs["token_type_ids"].to(torch_device) - label_ids = inputs["label_ids"].to(torch_device) + labels = inputs["labels"].to(torch_device) numeric_values = inputs["numeric_values"].to(torch_device) numeric_values_scale = inputs["numeric_values_scale"].to(torch_device) @@ -620,7 +671,7 @@ def test_training_question_answering_head_weak_supervision(self): float_answer = torch.FloatTensor(float_answer).to(torch_device) # forward pass to get loss + logits: - outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label_ids=label_ids, + outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels, numeric_values=numeric_values, numeric_values_scale=numeric_values_scale, float_answer=float_answer) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 0095b9e2436c..dd4ae1a72981 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -584,7 +584,7 @@ def test_token_type_ids(self): # We want to have sequence 0 and sequence 1 are tagged # respectively with 0 and 1 token_ids - # (regardeless of weither the model use token type ids) + # (regardless of whether the model use token type ids) # We use this assumption in the QA pipeline among other place output = tokenizer(seq_0, return_token_type_ids=True) self.assertIn(0, output["token_type_ids"]) @@ -600,7 +600,7 @@ def test_sequence_ids(self): # We want to have sequence 0 and sequence 1 are tagged # respectively with 0 and 1 token_ids - # (regardeless of weither the model use token type ids) + # (regardless of whether the model use token type ids) # We use this assumption in the QA pipeline among other place output = tokenizer(seq_0) self.assertIn(0, output.sequence_ids()) diff --git a/tests/test_tokenization_tapas.py b/tests/test_tokenization_tapas.py index 882ee04b49d5..6f30c78f85d2 100644 --- a/tests/test_tokenization_tapas.py +++ b/tests/test_tokenization_tapas.py @@ -23,7 +23,7 @@ import pandas as pd from transformers import AddedToken -from transformers.testing_utils import require_tokenizers, slow +from transformers.testing_utils import require_tokenizers, slow, require_torch, is_pt_tf_cross_test from transformers.models.tapas.tokenization_tapas import ( VOCAB_FILES_NAMES, BasicTokenizer, @@ -34,7 +34,7 @@ _is_whitespace, ) -from .test_tokenization_common import TokenizerTesterMixin, filter_non_english +from .test_tokenization_common import TokenizerTesterMixin, filter_non_english, merge_model_tokenizer_mappings @require_tokenizers @@ -294,16 +294,17 @@ def test_clean_text(self): @slow def test_sequence_builders(self): - tokenizer = self.tokenizer_class.from_pretrained("tapas-base-uncased") + tokenizer = self.tokenizer_class.from_pretrained("nielsr/tapas-base-finetuned-wtq") - text = tokenizer.encode("sequence builders", add_special_tokens=False) - text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) + empty_table = self.get_table(tokenizer, length=0) + table = self.get_table(tokenizer, length=10) + + text = tokenizer.encode(table, add_special_tokens=False) + text_2 = tokenizer.encode(empty_table, "multi-sequence build", add_special_tokens=False) - encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) - assert encoded_sentence == [101] + text + [102] - assert encoded_pair == [101] + text + [102] + text_2 + [102] + assert encoded_pair == [101] + text + [102] + text_2 def test_offsets_with_special_characters(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: @@ -998,6 +999,118 @@ def test_right_and_left_padding(self): assert sequence_length == padded_sequence_left_length assert encoded_sequence == padded_sequence_left + def test_token_type_ids(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + empty_table = self.get_table(tokenizer, length=0) + seq_0 = "Test this method." + + # We want to have sequence 0 and sequence 1 are tagged + # respectively with 0 and 1 token_ids + # (regardless of whether the model use token type ids) + # We use this assumption in the QA pipeline among other place + output = tokenizer(empty_table, seq_0, return_token_type_ids=True) + + # Assert that the token type IDs have the same length as the input IDs + self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"])) + + # Assert that each token type ID has 7 values + self.assertTrue(all(len(token_type_ids) == 7 for token_type_ids in output["token_type_ids"])) + + # Do the same test as modeling common. + self.assertIn(0, output["token_type_ids"][0]) + + # TODO: Check if require_torch is the best to test for numpy here ... Maybe move to require_flax when available + @require_torch + @slow + def test_np_encode_plus_sent_to_model(self): + from transformers import MODEL_MAPPING, TOKENIZER_MAPPING + + MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING) + + tokenizer = self.get_tokenizer() + if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: + return + + config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] + config = config_class() + + if config.is_encoder_decoder or config.pad_token_id is None: + return + + # Build sequence + first_ten_tokens = list(tokenizer.get_vocab().keys())[:10] + table = self.get_table(tokenizer, length=0) + sequence = " ".join(first_ten_tokens) + encoded_sequence = tokenizer.encode_plus(table, sequence, return_tensors="np") + batch_encoded_sequence = tokenizer.batch_encode_plus(table, [sequence, sequence], return_tensors="np") + + # TODO: add forward through JAX/Flax when PR is merged + # This is currently here to make flake8 happy ! + if encoded_sequence is None: + raise ValueError("Cannot convert list to numpy tensor on encode_plus()") + + if batch_encoded_sequence is None: + raise ValueError("Cannot convert list to numpy tensor on batch_encode_plus()") + + if self.test_rust_tokenizer: + fast_tokenizer = self.get_rust_tokenizer() + encoded_sequence_fast = fast_tokenizer.encode_plus(table, sequence, return_tensors="np") + batch_encoded_sequence_fast = fast_tokenizer.batch_encode_plus(table, [sequence, sequence], return_tensors="np") + + # TODO: add forward through JAX/Flax when PR is merged + # This is currently here to make flake8 happy ! + if encoded_sequence_fast is None: + raise ValueError("Cannot convert list to numpy tensor on encode_plus() (fast)") + + if batch_encoded_sequence_fast is None: + raise ValueError("Cannot convert list to numpy tensor on batch_encode_plus() (fast)") + + @require_torch + @slow + def test_torch_encode_plus_sent_to_model(self): + import torch + + from transformers import MODEL_MAPPING, TOKENIZER_MAPPING + + MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING) + + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + + if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: + return + + config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] + config = config_class() + + if config.is_encoder_decoder or config.pad_token_id is None: + return + + model = model_class(config) + + # Make sure the model contains at least the full vocabulary size in its embedding matrix + is_using_common_embeddings = hasattr(model.get_input_embeddings(), "weight") + assert ( + (model.get_input_embeddings().weight.shape[0] >= len(tokenizer)) + if is_using_common_embeddings + else True + ) + + # Build sequence + first_ten_tokens = list(tokenizer.get_vocab().keys())[:10] + sequence = " ".join(first_ten_tokens) + table = self.get_table(tokenizer, length=0) + encoded_sequence = tokenizer.encode_plus(table, sequence, return_tensors="pt") + batch_encoded_sequence = tokenizer.batch_encode_plus(table, [sequence, sequence], return_tensors="pt") + # This should not fail + + with torch.no_grad(): # saves some time + model(**encoded_sequence) + model(**batch_encoded_sequence) + @unittest.skip("TAPAS doesn't handle pre-tokenized inputs.") def test_pretokenized_inputs(self): pass @@ -1031,6 +1144,52 @@ def test_tapas_truncation_integration_test(self): # Ensure that the input IDs are less than the max length defined. self.assertLessEqual(len(new_encoded_inputs), i) + @is_pt_tf_cross_test + def test_batch_encode_plus_tensors(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + sequences = [ + "Testing batch encode plus", + "Testing batch encode plus with different sequence lengths", + "Testing batch encode plus with different sequence lengths correctly pads", + ] + + table = self.get_table(tokenizer, length=0) + + # A Tensor cannot be build by sequences which are not the same size + self.assertRaises(ValueError, tokenizer.batch_encode_plus, table, sequences, return_tensors="pt") + self.assertRaises(ValueError, tokenizer.batch_encode_plus, table, sequences, return_tensors="tf") + + if tokenizer.pad_token_id is None: + self.assertRaises( + ValueError, + tokenizer.batch_encode_plus, + table, + sequences, + padding=True, + return_tensors="pt", + ) + self.assertRaises( + ValueError, + tokenizer.batch_encode_plus, + table, + sequences, + padding="longest", + return_tensors="tf", + ) + else: + pytorch_tensor = tokenizer.batch_encode_plus(table, sequences, padding=True, return_tensors="pt") + tensorflow_tensor = tokenizer.batch_encode_plus(table, sequences, padding="longest", return_tensors="tf") + encoded_sequences = tokenizer.batch_encode_plus(table, sequences, padding=True) + + for key in encoded_sequences.keys(): + pytorch_value = pytorch_tensor[key].tolist() + tensorflow_value = tensorflow_tensor[key].numpy().tolist() + encoded_value = encoded_sequences[key] + + self.assertEqual(pytorch_value, tensorflow_value, encoded_value) + # TODO SET TO SLOW def test_tapas_integration_test(self): data = {