diff --git a/.circleci/config.yml b/.circleci/config.yml
index 3963270e61fa..0af9f9a8be1c 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -102,6 +102,7 @@ jobs:
                       - v0.4-{{ checksum "setup.py" }}
             - run: pip install --upgrade pip
             - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece]
+            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
             - save_cache:
                 key: v0.4-{{ checksum "setup.py" }}
                 paths:
@@ -129,6 +130,7 @@ jobs:
                       - v0.4-{{ checksum "setup.py" }}
             - run: pip install --upgrade pip
             - run: pip install .[sklearn,torch,testing,sentencepiece]
+            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
             - save_cache:
                   key: v0.4-torch-{{ checksum "setup.py" }}
                   paths:
@@ -210,6 +212,7 @@ jobs:
                       - v0.4-{{ checksum "setup.py" }}
             - run: pip install --upgrade pip
             - run: pip install .[sklearn,torch,testing,sentencepiece]
+            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
             - save_cache:
                   key: v0.4-torch-{{ checksum "setup.py" }}
                   paths:
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 99a6d1dbc555..7f7f004449e0 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -176,9 +176,9 @@ and conversion utilities for the following models:
 30. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
     Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel and Noam Shazeer and Adam
     Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-31. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
-    Pre-training <https://arxiv.org/abs/2004.02349>`__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller,
-    Francesco Piccinno and Julian Martin Eisenschlos.
+31. `TAPAS <https://huggingface.co/transformers/master/model_doc/tapas.html>`__ released with the paper `TAPAS: Weakly
+    Supervised Table Parsing via Pre-training <https://arxiv.org/abs/2004.02349>`__ by Jonathan Herzig, Paweł Krzysztof
+    Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
 32. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
     Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__ by Zihang Dai*,
     Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
@@ -272,6 +272,8 @@ TensorFlow and/or Flax.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |             T5              |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            TAPAS            |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |       Transformer-XL        |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |             XLM             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index 8b11f14caff0..0e345bb28ab6 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -216,6 +216,15 @@
     _tokenizers_available = False
 
 
+try:
+    import pandas  # noqa: F401
+
+    _pandas_available = True
+
+except ImportError:
+    _pandas_available = False
+
+
 try:
     import torch_scatter
 
@@ -343,6 +352,10 @@ def is_scatter_available():
     return _scatter_available
 
 
+def is_pandas_available():
+    return _pandas_available
+
+
 def torch_only_method(fn):
     def wrapper(*args, **kwargs):
         if not _torch_available:
diff --git a/src/transformers/models/tapas/convert_tapas_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/tapas/convert_tapas_original_tf_checkpoint_to_pytorch.py
index b0c27cf3b943..8db2fe2145a2 100644
--- a/src/transformers/models/tapas/convert_tapas_original_tf_checkpoint_to_pytorch.py
+++ b/src/transformers/models/tapas/convert_tapas_original_tf_checkpoint_to_pytorch.py
@@ -19,13 +19,9 @@
 
 import torch
 
-from transformers import (
-    TapasConfig,
-    TapasForQuestionAnswering,
-    TapasForSequenceClassification,
-    TapasModel,
-    load_tf_weights_in_tapas,
-)
+from transformers import TapasForQuestionAnswering  # noqa F401
+from transformers import TapasForSequenceClassification  # noqa F401
+from transformers import TapasConfig, TapasModel, load_tf_weights_in_tapas
 from transformers.utils import logging
 
 
diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py
index 9a50ddb0a5a1..7c5e35ecf7c0 100644
--- a/src/transformers/models/tapas/modeling_tapas.py
+++ b/src/transformers/models/tapas/modeling_tapas.py
@@ -358,7 +358,6 @@ def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs
         return embeddings
 
 
-# Copied from transformers.modeling_bert.BertSelfAttention with Bert->Tapas
 class TapasSelfAttention(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -437,7 +436,7 @@ def forward(
         return outputs
 
 
-# Copied from transformers.modeling_bert.BertSelfOutput
+# Copied from transformers.models.bert.modeling_bert.BertSelfOutput
 class TapasSelfOutput(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -452,7 +451,7 @@ def forward(self, hidden_states, input_tensor):
         return hidden_states
 
 
-# Copied from transformers.modeling_bert.BertAttention with Bert->Tapas
+# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Tapas
 class TapasAttention(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -500,7 +499,7 @@ def forward(
         return outputs
 
 
-# Copied from transformers.modeling_bert.BertIntermediate
+# Copied from transformers.models.bert.modeling_bert.BertIntermediate
 class TapasIntermediate(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -516,7 +515,7 @@ def forward(self, hidden_states):
         return hidden_states
 
 
-# Copied from transformers.modeling_bert.BertOutput
+# Copied from transformers.models.bert.modeling_bert.BertOutput
 class TapasOutput(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -531,7 +530,7 @@ def forward(self, hidden_states, input_tensor):
         return hidden_states
 
 
-# Copied from transformers.modeling_bert.BertLayer with Bert->Tapas
+# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Tapas
 class TapasLayer(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -591,7 +590,6 @@ def feed_forward_chunk(self, attention_output):
         return layer_output
 
 
-# Copied from transformers.modeling_bert.BertEncoder with Bert->Tapas
 class TapasEncoder(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -656,7 +654,7 @@ def custom_forward(*inputs):
         )
 
 
-# Copied from transformers.modeling_bert.BertPooler
+# Copied from transformers.models.bert.modeling_bert.BertPooler
 class TapasPooler(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -681,7 +679,7 @@ class TapasPreTrainedModel(PreTrainedModel):
     config_class = TapasConfig
     base_model_prefix = "tapas"
 
-    # Copied from transformers.modeling_bert.BertPreTrainedModel._init_weights
+    # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
     def _init_weights(self, module):
         """ Initialize the weights """
         if isinstance(module, (nn.Linear, nn.Embedding)):
@@ -927,6 +925,9 @@ def __init__(self, config):
     def get_output_embeddings(self):
         return self.lm_head
 
+    def set_output_embeddings(self, word_embeddings):
+        self.lm_head = word_embeddings
+
     @add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py
index 68636e965470..f0c6edf9ddf0 100644
--- a/src/transformers/models/tapas/tokenization_tapas.py
+++ b/src/transformers/models/tapas/tokenization_tapas.py
@@ -26,11 +26,11 @@
 from dataclasses import dataclass
 from typing import Callable, Dict, Generator, List, Optional, Text, Tuple, Union
 
-import pandas as pd
-import torch
+import numpy as np
 
 from transformers import add_end_docstrings
 
+from ...file_utils import is_pandas_available
 from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
 from ...tokenization_utils_base import (
     ENCODE_KWARGS_DOCSTRING,
@@ -45,6 +45,9 @@
 from ...utils import logging
 
 
+if is_pandas_available():
+    import pandas as pd
+
 logger = logging.get_logger(__name__)
 
 
@@ -307,6 +310,9 @@ def __init__(
         additional_special_tokens: Optional[List[str]] = None,
         **kwargs
     ):
+        if not is_pandas_available():
+            raise ImportError("Pandas is required for the TAPAS tokenizer.")
+
         if additional_special_tokens is not None:
             if empty_token not in additional_special_tokens:
                 additional_special_tokens.append(empty_token)
@@ -539,7 +545,7 @@ def get_special_tokens_mask(
     @add_end_docstrings(TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def __call__(
         self,
-        table: pd.DataFrame,
+        table: "pd.DataFrame",
         queries: Optional[
             Union[
                 TextInput,
@@ -663,7 +669,7 @@ def __call__(
     @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def batch_encode_plus(
         self,
-        table: pd.DataFrame,
+        table: "pd.DataFrame",
         queries: Optional[
             Union[
                 List[TextInput],
@@ -812,7 +818,7 @@ def _batch_encode_plus(
 
     def _batch_prepare_for_model(
         self,
-        raw_table: pd.DataFrame,
+        raw_table: "pd.DataFrame",
         raw_queries: Union[
             List[TextInput],
             List[PreTokenizedInput],
@@ -884,7 +890,7 @@ def _batch_prepare_for_model(
     @add_end_docstrings(ENCODE_KWARGS_DOCSTRING)
     def encode(
         self,
-        table: pd.DataFrame,
+        table: "pd.DataFrame",
         query: Optional[
             Union[
                 TextInput,
@@ -927,7 +933,7 @@ def encode(
     @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def encode_plus(
         self,
-        table: pd.DataFrame,
+        table: "pd.DataFrame",
         query: Optional[
             Union[
                 TextInput,
@@ -1010,7 +1016,7 @@ def encode_plus(
 
     def _encode_plus(
         self,
-        table: pd.DataFrame,
+        table: "pd.DataFrame",
         query: Union[
             TextInput,
             PreTokenizedInput,
@@ -1066,7 +1072,7 @@ def _encode_plus(
     @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def prepare_for_model(
         self,
-        raw_table: pd.DataFrame,
+        raw_table: "pd.DataFrame",
         raw_query: Union[
             TextInput,
             PreTokenizedInput,
@@ -1884,7 +1890,7 @@ def _get_mean_cell_probs(self, probabilities, segment_ids, row_ids, column_ids):
             col = column_ids[i] - 1
             row = row_ids[i] - 1
             coords_to_probs[(col, row)].append(prob)
-        return {coords: torch.as_tensor(cell_probs).mean() for coords, cell_probs in coords_to_probs.items()}
+        return {coords: np.array(cell_probs).mean() for coords, cell_probs in coords_to_probs.items()}
 
     def convert_logits_to_predictions(self, data, logits, logits_agg=None, cell_classification_threshold=0.5):
         """
@@ -1912,11 +1918,8 @@ def convert_logits_to_predictions(self, data, logits, logits_agg=None, cell_clas
             of length ``batch_size``: Predicted aggregation operator indices of the aggregation head.
         """
         # compute probabilities from token logits
-        dist_per_token = torch.distributions.Bernoulli(logits=logits)
-        probabilities = dist_per_token.probs * data["attention_mask"].type(torch.float32).to(
-            dist_per_token.probs.device
-        )
-
+        # DO sigmoid here
+        probabilities = 1 / (1 + np.exp(-logits)) * data["attention_mask"]
         token_types = [
             "segment_ids",
             "column_ids",
@@ -1980,7 +1983,7 @@ def convert_logits_to_predictions(self, data, logits, logits_agg=None, cell_clas
 # Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
 class BasicTokenizer(object):
     """
-    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.)
+    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
 
     Args:
         do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
@@ -1989,8 +1992,10 @@ class BasicTokenizer(object):
             Collection of tokens which will never be split during tokenization. Only has an effect when
             :obj:`do_basic_tokenize=True`
         tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see this
-            `issue <https://github.com/huggingface/transformers/issues/328>`__).
+            Whether or not to tokenize Chinese characters.
+
+            This should likely be deactivated for Japanese (see this `issue
+            <https://github.com/huggingface/transformers/issues/328>`__).
         strip_accents: (:obj:`bool`, `optional`):
             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
             value for :obj:`lowercase` (as in the original BERT).
@@ -2007,7 +2012,7 @@ def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=
     def tokenize(self, text, never_split=None):
         """
         Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
-        WordPieceTokenizer
+        WordPieceTokenizer.
 
         Args:
             **never_split**: (`optional`) list of str
@@ -2137,12 +2142,13 @@ def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
     def tokenize(self, text):
         """
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
-        tokenization using the given vocabulary. For example, :obj:`input = "unaffable"` wil return as output
-        :obj:`["un", "##aff", "##able"]`
+        tokenization using the given vocabulary.
+
+        For example, :obj:`input = "unaffable"` wil return as output :obj:`["un", "##aff", "##able"]`.
 
         Args:
           text: A single token or whitespace separated tokens. This should have
-            already been passed through `BasicTokenizer`
+            already been passed through `BasicTokenizer`.
 
         Returns:
           A list of wordpiece tokens.
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 8049344403d7..c664664f60d8 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -28,6 +28,7 @@
     _datasets_available,
     _faiss_available,
     _flax_available,
+    _pandas_available,
     _scatter_available,
     _sentencepiece_available,
     _tf_available,
@@ -222,6 +223,19 @@ def require_tokenizers(test_case):
         return test_case
 
 
+def require_pandas(test_case):
+    """
+    Decorator marking a test that requires pandas.
+
+    These tests are skipped when pandas isn't installed.
+
+    """
+    if not _pandas_available:
+        return unittest.skip("test requires pandas")(test_case)
+    else:
+        return test_case
+
+
 def require_scatter(test_case):
     """
     Decorator marking a test that requires PyTorch Scatter. These tests are skipped when PyTorch Scatter isn't
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index b684d21ebe94..0669a4e09584 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -1867,6 +1867,45 @@ def load_tf_weights_in_t5(*args, **kwargs):
     requires_pytorch(load_tf_weights_in_t5)
 
 
+TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class TapasForMaskedLM:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
+class TapasForQuestionAnswering:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
+class TapasForSequenceClassification:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
+class TapasModel:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
 TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 
diff --git a/tests/test_modeling_tapas.py b/tests/test_modeling_tapas.py
index 827df8bc94d0..9ac18c7cfcb6 100644
--- a/tests/test_modeling_tapas.py
+++ b/tests/test_modeling_tapas.py
@@ -411,7 +411,7 @@ class TapasModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (
         (
             TapasModel,
-            # TapasForMaskedLM,
+            TapasForMaskedLM,
             TapasForQuestionAnswering,
             TapasForSequenceClassification,
         )
diff --git a/tests/test_tokenization_tapas.py b/tests/test_tokenization_tapas.py
index 0579bca68205..808dfdd20519 100644
--- a/tests/test_tokenization_tapas.py
+++ b/tests/test_tokenization_tapas.py
@@ -32,12 +32,13 @@
     _is_punctuation,
     _is_whitespace,
 )
-from transformers.testing_utils import is_pt_tf_cross_test, require_tokenizers, require_torch, slow
+from transformers.testing_utils import is_pt_tf_cross_test, require_pandas, require_tokenizers, require_torch, slow
 
 from .test_tokenization_common import TokenizerTesterMixin, filter_non_english, merge_model_tokenizer_mappings
 
 
 @require_tokenizers
+@require_pandas
 class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = TapasTokenizer
     test_rust_tokenizer = False
@@ -132,13 +133,6 @@ def get_input_output_texts(self, tokenizer):
         output_text = "unwanted, running"
         return input_text, output_text
 
-    def test_full_tokenizer(self):
-        tokenizer = self.tokenizer_class(self.vocab_file)
-
-        tokens = tokenizer.tokenize("UNwant\u00E9d,running")
-        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11])
-
     def test_rust_and_python_full_tokenizers(self):
         if not self.test_rust_tokenizer:
             return
@@ -647,40 +641,6 @@ def test_padding_to_max_length(self):
                 assert sequence_length == padded_sequence_right_length
                 assert encoded_sequence == padded_sequence_right
 
-    def test_padding_to_multiple_of(self):
-        tokenizers = self.get_tokenizers()
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                if tokenizer.pad_token is None:
-                    self.skipTest("No padding token.")
-                else:
-                    empty_tokens = tokenizer("", padding=True, pad_to_multiple_of=8)
-                    normal_tokens = tokenizer("This is a sample input", padding=True, pad_to_multiple_of=8)
-                    for key, value in empty_tokens.items():
-                        self.assertEqual(len(value) % 8, 0, "BatchEncoding.{} is not multiple of 8".format(key))
-                    for key, value in normal_tokens.items():
-                        self.assertEqual(len(value) % 8, 0, "BatchEncoding.{} is not multiple of 8".format(key))
-
-                    normal_tokens = tokenizer("This", pad_to_multiple_of=8)
-                    for key, value in normal_tokens.items():
-                        self.assertNotEqual(len(value) % 8, 0, "BatchEncoding.{} is not multiple of 8".format(key))
-
-                    # Should also work with truncation
-                    normal_tokens = tokenizer("This", padding=True, truncation=True, pad_to_multiple_of=8)
-                    for key, value in normal_tokens.items():
-                        self.assertEqual(len(value) % 8, 0, "BatchEncoding.{} is not multiple of 8".format(key))
-
-                    # truncation to something which is not a multiple of pad_to_multiple_of raises an error
-                    self.assertRaises(
-                        ValueError,
-                        tokenizer.__call__,
-                        "This",
-                        padding=True,
-                        truncation=True,
-                        max_length=12,
-                        pad_to_multiple_of=8,
-                    )
-
     def test_call(self):
         # Tests that all call wrap to encode_plus and batch_encode_plus
         tokenizers = self.get_tokenizers(do_lower_case=False)
@@ -3459,3 +3419,7 @@ def test_full_tokenizer(self):
         self.assertListEqual(segment_ids.tolist(), expected_results["segment_ids"])
         self.assertListEqual(column_ids.tolist(), expected_results["column_ids"])
         self.assertListEqual(row_ids.tolist(), expected_results["row_ids"])
+
+    @unittest.skip("Skip this test while all models are still to be uploaded.")
+    def test_pretrained_model_lists(self):
+        pass