Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ jobs:
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece]
- run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
- save_cache:
key: v0.4-{{ checksum "setup.py" }}
paths:
Expand Down Expand Up @@ -129,6 +130,7 @@ jobs:
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,torch,testing,sentencepiece]
- run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
- save_cache:
key: v0.4-torch-{{ checksum "setup.py" }}
paths:
Expand Down Expand Up @@ -210,6 +212,7 @@ jobs:
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,torch,testing,sentencepiece]
- run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
- save_cache:
key: v0.4-torch-{{ checksum "setup.py" }}
paths:
Expand Down
8 changes: 5 additions & 3 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -176,9 +176,9 @@ and conversion utilities for the following models:
30. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel and Noam Shazeer and Adam
Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
31. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
Pre-training <https://arxiv.org/abs/2004.02349>`__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller,
Francesco Piccinno and Julian Martin Eisenschlos.
31. `TAPAS <https://huggingface.co/transformers/master/model_doc/tapas.html>`__ released with the paper `TAPAS: Weakly
Supervised Table Parsing via Pre-training <https://arxiv.org/abs/2004.02349>`__ by Jonathan Herzig, Paweł Krzysztof
Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
32. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__ by Zihang Dai*,
Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
Expand Down Expand Up @@ -272,6 +272,8 @@ TensorFlow and/or Flax.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| T5 | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| TAPAS | ✅ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| XLM | ✅ | ❌ | ✅ | ✅ | ❌ |
Expand Down
13 changes: 13 additions & 0 deletions src/transformers/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,15 @@
_tokenizers_available = False


# Optional-dependency probe: pandas is only needed by table-based models
# (e.g. the TAPAS tokenizer), so record whether it can be imported instead
# of failing at module import time.
try:
    import pandas  # noqa: F401

    _pandas_available = True

except ImportError:
    # pandas is not installed; features that need it must check
    # is_pandas_available() before use.
    _pandas_available = False


try:
import torch_scatter

Expand Down Expand Up @@ -343,6 +352,10 @@ def is_scatter_available():
return _scatter_available


def is_pandas_available():
    """Return :obj:`True` if the optional :obj:`pandas` package could be imported."""
    return _pandas_available


def torch_only_method(fn):
def wrapper(*args, **kwargs):
if not _torch_available:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,9 @@

import torch

from transformers import (
TapasConfig,
TapasForQuestionAnswering,
TapasForSequenceClassification,
TapasModel,
load_tf_weights_in_tapas,
)
from transformers import TapasForQuestionAnswering # noqa F401
from transformers import TapasForSequenceClassification # noqa F401
from transformers import TapasConfig, TapasModel, load_tf_weights_in_tapas
from transformers.utils import logging


Expand Down
19 changes: 10 additions & 9 deletions src/transformers/models/tapas/modeling_tapas.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,6 @@ def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs
return embeddings


# Copied from transformers.modeling_bert.BertSelfAttention with Bert->Tapas
class TapasSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
Expand Down Expand Up @@ -437,7 +436,7 @@ def forward(
return outputs


# Copied from transformers.modeling_bert.BertSelfOutput
# Copied from transformers.models.bert.modeling_bert.BertSelfOutput
class TapasSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
Expand All @@ -452,7 +451,7 @@ def forward(self, hidden_states, input_tensor):
return hidden_states


# Copied from transformers.modeling_bert.BertAttention with Bert->Tapas
# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Tapas
class TapasAttention(nn.Module):
def __init__(self, config):
super().__init__()
Expand Down Expand Up @@ -500,7 +499,7 @@ def forward(
return outputs


# Copied from transformers.modeling_bert.BertIntermediate
# Copied from transformers.models.bert.modeling_bert.BertIntermediate
class TapasIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
Expand All @@ -516,7 +515,7 @@ def forward(self, hidden_states):
return hidden_states


# Copied from transformers.modeling_bert.BertOutput
# Copied from transformers.models.bert.modeling_bert.BertOutput
class TapasOutput(nn.Module):
def __init__(self, config):
super().__init__()
Expand All @@ -531,7 +530,7 @@ def forward(self, hidden_states, input_tensor):
return hidden_states


# Copied from transformers.modeling_bert.BertLayer with Bert->Tapas
# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Tapas
class TapasLayer(nn.Module):
def __init__(self, config):
super().__init__()
Expand Down Expand Up @@ -591,7 +590,6 @@ def feed_forward_chunk(self, attention_output):
return layer_output


# Copied from transformers.modeling_bert.BertEncoder with Bert->Tapas
class TapasEncoder(nn.Module):
def __init__(self, config):
super().__init__()
Expand Down Expand Up @@ -656,7 +654,7 @@ def custom_forward(*inputs):
)


# Copied from transformers.modeling_bert.BertPooler
# Copied from transformers.models.bert.modeling_bert.BertPooler
class TapasPooler(nn.Module):
def __init__(self, config):
super().__init__()
Expand All @@ -681,7 +679,7 @@ class TapasPreTrainedModel(PreTrainedModel):
config_class = TapasConfig
base_model_prefix = "tapas"

# Copied from transformers.modeling_bert.BertPreTrainedModel._init_weights
# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
def _init_weights(self, module):
""" Initialize the weights """
if isinstance(module, (nn.Linear, nn.Embedding)):
Expand Down Expand Up @@ -927,6 +925,9 @@ def __init__(self, config):
def get_output_embeddings(self):
    """Return the module used as the output (LM head) embeddings."""
    return self.lm_head

def set_output_embeddings(self, word_embeddings):
    """Replace the output (LM head) embedding module with ``word_embeddings``."""
    # NOTE(review): presumably called by the base class for weight tying /
    # embedding resizing — confirm against PreTrainedModel.
    self.lm_head = word_embeddings

@add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
Expand Down
50 changes: 28 additions & 22 deletions src/transformers/models/tapas/tokenization_tapas.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@
from dataclasses import dataclass
from typing import Callable, Dict, Generator, List, Optional, Text, Tuple, Union

import pandas as pd
import torch
import numpy as np

from transformers import add_end_docstrings

from ...file_utils import is_pandas_available
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...tokenization_utils_base import (
ENCODE_KWARGS_DOCSTRING,
Expand All @@ -45,6 +45,9 @@
from ...utils import logging


if is_pandas_available():
import pandas as pd

logger = logging.get_logger(__name__)


Expand Down Expand Up @@ -307,6 +310,9 @@ def __init__(
additional_special_tokens: Optional[List[str]] = None,
**kwargs
):
if not is_pandas_available():
raise ImportError("Pandas is required for the TAPAS tokenizer.")

if additional_special_tokens is not None:
if empty_token not in additional_special_tokens:
additional_special_tokens.append(empty_token)
Expand Down Expand Up @@ -539,7 +545,7 @@ def get_special_tokens_mask(
@add_end_docstrings(TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def __call__(
self,
table: pd.DataFrame,
table: "pd.DataFrame",
queries: Optional[
Union[
TextInput,
Expand Down Expand Up @@ -663,7 +669,7 @@ def __call__(
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def batch_encode_plus(
self,
table: pd.DataFrame,
table: "pd.DataFrame",
queries: Optional[
Union[
List[TextInput],
Expand Down Expand Up @@ -812,7 +818,7 @@ def _batch_encode_plus(

def _batch_prepare_for_model(
self,
raw_table: pd.DataFrame,
raw_table: "pd.DataFrame",
raw_queries: Union[
List[TextInput],
List[PreTokenizedInput],
Expand Down Expand Up @@ -884,7 +890,7 @@ def _batch_prepare_for_model(
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING)
def encode(
self,
table: pd.DataFrame,
table: "pd.DataFrame",
query: Optional[
Union[
TextInput,
Expand Down Expand Up @@ -927,7 +933,7 @@ def encode(
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def encode_plus(
self,
table: pd.DataFrame,
table: "pd.DataFrame",
query: Optional[
Union[
TextInput,
Expand Down Expand Up @@ -1010,7 +1016,7 @@ def encode_plus(

def _encode_plus(
self,
table: pd.DataFrame,
table: "pd.DataFrame",
query: Union[
TextInput,
PreTokenizedInput,
Expand Down Expand Up @@ -1066,7 +1072,7 @@ def _encode_plus(
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def prepare_for_model(
self,
raw_table: pd.DataFrame,
raw_table: "pd.DataFrame",
raw_query: Union[
TextInput,
PreTokenizedInput,
Expand Down Expand Up @@ -1884,7 +1890,7 @@ def _get_mean_cell_probs(self, probabilities, segment_ids, row_ids, column_ids):
col = column_ids[i] - 1
row = row_ids[i] - 1
coords_to_probs[(col, row)].append(prob)
return {coords: torch.as_tensor(cell_probs).mean() for coords, cell_probs in coords_to_probs.items()}
return {coords: np.array(cell_probs).mean() for coords, cell_probs in coords_to_probs.items()}

def convert_logits_to_predictions(self, data, logits, logits_agg=None, cell_classification_threshold=0.5):
"""
Expand Down Expand Up @@ -1912,11 +1918,8 @@ def convert_logits_to_predictions(self, data, logits, logits_agg=None, cell_clas
of length ``batch_size``: Predicted aggregation operator indices of the aggregation head.
"""
# compute probabilities from token logits
dist_per_token = torch.distributions.Bernoulli(logits=logits)
probabilities = dist_per_token.probs * data["attention_mask"].type(torch.float32).to(
dist_per_token.probs.device
)

# DO sigmoid here
probabilities = 1 / (1 + np.exp(-logits)) * data["attention_mask"]
token_types = [
"segment_ids",
"column_ids",
Expand Down Expand Up @@ -1980,7 +1983,7 @@ def convert_logits_to_predictions(self, data, logits, logits_agg=None, cell_clas
# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
class BasicTokenizer(object):
"""
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.)
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

Args:
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
Expand All @@ -1989,8 +1992,10 @@ class BasicTokenizer(object):
Collection of tokens which will never be split during tokenization. Only has an effect when
:obj:`do_basic_tokenize=True`
tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see this
`issue <https://github.com/huggingface/transformers/issues/328>`__).
Whether or not to tokenize Chinese characters.

This should likely be deactivated for Japanese (see this `issue
<https://github.com/huggingface/transformers/issues/328>`__).
strip_accents: (:obj:`bool`, `optional`):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for :obj:`lowercase` (as in the original BERT).
Expand All @@ -2007,7 +2012,7 @@ def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer
WordPieceTokenizer.

Args:
**never_split**: (`optional`) list of str
Expand Down Expand Up @@ -2137,12 +2142,13 @@ def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
def tokenize(self, text):
"""
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary. For example, :obj:`input = "unaffable"` wil return as output
:obj:`["un", "##aff", "##able"]`
tokenization using the given vocabulary.

For example, :obj:`input = "unaffable"` will return as output :obj:`["un", "##aff", "##able"]`.

Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`
already been passed through `BasicTokenizer`.

Returns:
A list of wordpiece tokens.
Expand Down
14 changes: 14 additions & 0 deletions src/transformers/testing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
_datasets_available,
_faiss_available,
_flax_available,
_pandas_available,
_scatter_available,
_sentencepiece_available,
_tf_available,
Expand Down Expand Up @@ -222,6 +223,19 @@ def require_tokenizers(test_case):
return test_case


def require_pandas(test_case):
    """
    Decorator marking a test that requires pandas.

    The decorated test is skipped (via :func:`unittest.skip`) when pandas is
    not installed; otherwise the test case is returned unchanged.
    """
    if _pandas_available:
        return test_case
    return unittest.skip("test requires pandas")(test_case)


def require_scatter(test_case):
"""
Decorator marking a test that requires PyTorch Scatter. These tests are skipped when PyTorch Scatter isn't
Expand Down
Loading