Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ jobs:
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece]
- run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
- save_cache:
key: v0.4-{{ checksum "setup.py" }}
paths:
Expand Down Expand Up @@ -129,6 +130,7 @@ jobs:
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,torch,testing,sentencepiece]
- run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
- save_cache:
key: v0.4-torch-{{ checksum "setup.py" }}
paths:
Expand Down Expand Up @@ -210,6 +212,7 @@ jobs:
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,torch,testing,sentencepiece]
- run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
- save_cache:
key: v0.4-torch-{{ checksum "setup.py" }}
paths:
Expand Down
8 changes: 5 additions & 3 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -176,9 +176,9 @@ and conversion utilities for the following models:
30. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel and Noam Shazeer and Adam
Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
31. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
Pre-training <https://arxiv.org/abs/2004.02349>`__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller,
Francesco Piccinno and Julian Martin Eisenschlos.
31. `TAPAS <https://huggingface.co/transformers/master/model_doc/tapas.html>`__ released with the paper `TAPAS: Weakly
Supervised Table Parsing via Pre-training <https://arxiv.org/abs/2004.02349>`__ by Jonathan Herzig, Paweł Krzysztof
Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
32. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__ by Zihang Dai*,
Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
Expand Down Expand Up @@ -272,6 +272,8 @@ TensorFlow and/or Flax.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| T5 | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| TAPAS | ✅ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| XLM | ✅ | ❌ | ✅ | ✅ | ❌ |
Expand Down
13 changes: 13 additions & 0 deletions src/transformers/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,15 @@
_tokenizers_available = False


# Optional-dependency probe: pandas is only needed by table-based models
# (e.g. the TAPAS tokenizer), so record whether it can be imported instead
# of failing at module import time.
try:
    import pandas  # noqa: F401

    _pandas_available = True

except ImportError:
    # pandas is not installed; features that need it must check
    # is_pandas_available() before use.
    _pandas_available = False


try:
import torch_scatter

Expand Down Expand Up @@ -343,6 +352,10 @@ def is_scatter_available():
return _scatter_available


def is_pandas_available():
    """Return :obj:`True` if the optional :obj:`pandas` package could be imported."""
    return _pandas_available


def torch_only_method(fn):
def wrapper(*args, **kwargs):
if not _torch_available:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,9 @@

import torch

from transformers import (
TapasConfig,
TapasForQuestionAnswering,
TapasForSequenceClassification,
TapasModel,
load_tf_weights_in_tapas,
)
from transformers import TapasForQuestionAnswering # noqa F401
from transformers import TapasForSequenceClassification # noqa F401
from transformers import TapasConfig, TapasModel, load_tf_weights_in_tapas
from transformers.utils import logging


Expand Down
19 changes: 10 additions & 9 deletions src/transformers/models/tapas/modeling_tapas.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,6 @@ def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs
return embeddings


# Copied from transformers.modeling_bert.BertSelfAttention with Bert->Tapas
class TapasSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
Expand Down Expand Up @@ -437,7 +436,7 @@ def forward(
return outputs


# Copied from transformers.modeling_bert.BertSelfOutput
# Copied from transformers.models.bert.modeling_bert.BertSelfOutput
class TapasSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
Expand All @@ -452,7 +451,7 @@ def forward(self, hidden_states, input_tensor):
return hidden_states


# Copied from transformers.modeling_bert.BertAttention with Bert->Tapas
# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Tapas
class TapasAttention(nn.Module):
def __init__(self, config):
super().__init__()
Expand Down Expand Up @@ -500,7 +499,7 @@ def forward(
return outputs


# Copied from transformers.modeling_bert.BertIntermediate
# Copied from transformers.models.bert.modeling_bert.BertIntermediate
class TapasIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
Expand All @@ -516,7 +515,7 @@ def forward(self, hidden_states):
return hidden_states


# Copied from transformers.modeling_bert.BertOutput
# Copied from transformers.models.bert.modeling_bert.BertOutput
class TapasOutput(nn.Module):
def __init__(self, config):
super().__init__()
Expand All @@ -531,7 +530,7 @@ def forward(self, hidden_states, input_tensor):
return hidden_states


# Copied from transformers.modeling_bert.BertLayer with Bert->Tapas
# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Tapas
class TapasLayer(nn.Module):
def __init__(self, config):
super().__init__()
Expand Down Expand Up @@ -591,7 +590,6 @@ def feed_forward_chunk(self, attention_output):
return layer_output


# Copied from transformers.modeling_bert.BertEncoder with Bert->Tapas
class TapasEncoder(nn.Module):
def __init__(self, config):
super().__init__()
Expand Down Expand Up @@ -656,7 +654,7 @@ def custom_forward(*inputs):
)


# Copied from transformers.modeling_bert.BertPooler
# Copied from transformers.models.bert.modeling_bert.BertPooler
class TapasPooler(nn.Module):
def __init__(self, config):
super().__init__()
Expand All @@ -681,7 +679,7 @@ class TapasPreTrainedModel(PreTrainedModel):
config_class = TapasConfig
base_model_prefix = "tapas"

# Copied from transformers.modeling_bert.BertPreTrainedModel._init_weights
# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
def _init_weights(self, module):
""" Initialize the weights """
if isinstance(module, (nn.Linear, nn.Embedding)):
Expand Down Expand Up @@ -927,6 +925,9 @@ def __init__(self, config):
def get_output_embeddings(self):
    """Return the module used as the output (LM head) embeddings."""
    return self.lm_head

def set_output_embeddings(self, word_embeddings):
    """Replace the output (LM head) embedding module with ``word_embeddings``."""
    # NOTE(review): presumably called by the base class for weight tying /
    # embedding resizing — confirm against PreTrainedModel.
    self.lm_head = word_embeddings

@add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
Expand Down
50 changes: 28 additions & 22 deletions src/transformers/models/tapas/tokenization_tapas.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@
from dataclasses import dataclass
from typing import Callable, Dict, Generator, List, Optional, Text, Tuple, Union

import pandas as pd
import torch
import numpy as np

from transformers import add_end_docstrings

from ...file_utils import is_pandas_available
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...tokenization_utils_base import (
ENCODE_KWARGS_DOCSTRING,
Expand All @@ -45,6 +45,9 @@
from ...utils import logging


if is_pandas_available():
import pandas as pd

logger = logging.get_logger(__name__)


Expand Down Expand Up @@ -307,6 +310,9 @@ def __init__(
additional_special_tokens: Optional[List[str]] = None,
**kwargs
):
if not is_pandas_available():
raise ImportError("Pandas is required for the TAPAS tokenizer.")

if additional_special_tokens is not None:
if empty_token not in additional_special_tokens:
additional_special_tokens.append(empty_token)
Expand Down Expand Up @@ -539,7 +545,7 @@ def get_special_tokens_mask(
@add_end_docstrings(TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def __call__(
self,
table: pd.DataFrame,
table: "pd.DataFrame",
queries: Optional[
Union[
TextInput,
Expand Down Expand Up @@ -663,7 +669,7 @@ def __call__(
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def batch_encode_plus(
self,
table: pd.DataFrame,
table: "pd.DataFrame",
queries: Optional[
Union[
List[TextInput],
Expand Down Expand Up @@ -812,7 +818,7 @@ def _batch_encode_plus(

def _batch_prepare_for_model(
self,
raw_table: pd.DataFrame,
raw_table: "pd.DataFrame",
raw_queries: Union[
List[TextInput],
List[PreTokenizedInput],
Expand Down Expand Up @@ -884,7 +890,7 @@ def _batch_prepare_for_model(
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING)
def encode(
self,
table: pd.DataFrame,
table: "pd.DataFrame",
query: Optional[
Union[
TextInput,
Expand Down Expand Up @@ -927,7 +933,7 @@ def encode(
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def encode_plus(
self,
table: pd.DataFrame,
table: "pd.DataFrame",
query: Optional[
Union[
TextInput,
Expand Down Expand Up @@ -1010,7 +1016,7 @@ def encode_plus(

def _encode_plus(
self,
table: pd.DataFrame,
table: "pd.DataFrame",
query: Union[
TextInput,
PreTokenizedInput,
Expand Down Expand Up @@ -1066,7 +1072,7 @@ def _encode_plus(
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def prepare_for_model(
self,
raw_table: pd.DataFrame,
raw_table: "pd.DataFrame",
raw_query: Union[
TextInput,
PreTokenizedInput,
Expand Down Expand Up @@ -1884,7 +1890,7 @@ def _get_mean_cell_probs(self, probabilities, segment_ids, row_ids, column_ids):
col = column_ids[i] - 1
row = row_ids[i] - 1
coords_to_probs[(col, row)].append(prob)
return {coords: torch.as_tensor(cell_probs).mean() for coords, cell_probs in coords_to_probs.items()}
return {coords: np.array(cell_probs).mean() for coords, cell_probs in coords_to_probs.items()}

def convert_logits_to_predictions(self, data, logits, logits_agg=None, cell_classification_threshold=0.5):
"""
Expand Down Expand Up @@ -1912,11 +1918,8 @@ def convert_logits_to_predictions(self, data, logits, logits_agg=None, cell_clas
of length ``batch_size``: Predicted aggregation operator indices of the aggregation head.
"""
# compute probabilities from token logits
dist_per_token = torch.distributions.Bernoulli(logits=logits)
probabilities = dist_per_token.probs * data["attention_mask"].type(torch.float32).to(
dist_per_token.probs.device
)

# DO sigmoid here
probabilities = 1 / (1 + np.exp(-logits)) * data["attention_mask"]
token_types = [
"segment_ids",
"column_ids",
Expand Down Expand Up @@ -1980,7 +1983,7 @@ def convert_logits_to_predictions(self, data, logits, logits_agg=None, cell_clas
# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
class BasicTokenizer(object):
"""
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.)
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

Args:
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
Expand All @@ -1989,8 +1992,10 @@ class BasicTokenizer(object):
Collection of tokens which will never be split during tokenization. Only has an effect when
:obj:`do_basic_tokenize=True`
tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see this
`issue <https://github.com/huggingface/transformers/issues/328>`__).
Whether or not to tokenize Chinese characters.

This should likely be deactivated for Japanese (see this `issue
<https://github.com/huggingface/transformers/issues/328>`__).
strip_accents: (:obj:`bool`, `optional`):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for :obj:`lowercase` (as in the original BERT).
Expand All @@ -2007,7 +2012,7 @@ def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer
WordPieceTokenizer.

Args:
**never_split**: (`optional`) list of str
Expand Down Expand Up @@ -2137,12 +2142,13 @@ def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
def tokenize(self, text):
"""
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary. For example, :obj:`input = "unaffable"` wil return as output
:obj:`["un", "##aff", "##able"]`
tokenization using the given vocabulary.

For example, :obj:`input = "unaffable"` will return as output :obj:`["un", "##aff", "##able"]`.

Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`
already been passed through `BasicTokenizer`.

Returns:
A list of wordpiece tokens.
Expand Down
14 changes: 14 additions & 0 deletions src/transformers/testing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
_datasets_available,
_faiss_available,
_flax_available,
_pandas_available,
_scatter_available,
_sentencepiece_available,
_tf_available,
Expand Down Expand Up @@ -222,6 +223,19 @@ def require_tokenizers(test_case):
return test_case


def require_pandas(test_case):
    """
    Decorator marking a test that requires pandas.

    The decorated test is skipped (via :func:`unittest.skip`) when pandas is
    not installed; otherwise the test case is returned unchanged.
    """
    if _pandas_available:
        return test_case
    return unittest.skip("test requires pandas")(test_case)


def require_scatter(test_case):
"""
Decorator marking a test that requires PyTorch Scatter. These tests are skipped when PyTorch Scatter isn't
Expand Down
Loading