Merged

26 commits
4bc3b3f
Fixing roberta for slow-fast tests
thomwolf Oct 26, 2020
1ce1c63
WIP getting equivalence on pipelines
thomwolf Oct 26, 2020
15350e8
slow-to-fast equivalence - working on question-answering pipeline
thomwolf Oct 27, 2020
449e346
optional FAISS tests
thomwolf Nov 2, 2020
eb375bc
Pipeline Q&A
thomwolf Nov 9, 2020
a4cb7f6
Merge branch 'master' into slow-fast-comparison-pipelines
thomwolf Nov 9, 2020
3367593
Move pipeline tests to their own test job again
thomwolf Nov 9, 2020
36e0900
update tokenizer to add sequence id methods
thomwolf Nov 9, 2020
ef4919b
update to tokenizers 0.9.4
thomwolf Nov 10, 2020
dab8168
set sentencepiece as optional
thomwolf Nov 10, 2020
9e72b29
Merge branch 'master' into slow-fast-comparison-pipelines
thomwolf Nov 10, 2020
84bc244
clean up squad
thomwolf Nov 10, 2020
751ee69
clean up pipelines to use sequence_ids
thomwolf Nov 10, 2020
0e8d7f7
style/quality
thomwolf Nov 10, 2020
eb72b1f
wording
thomwolf Nov 10, 2020
16da2c5
Switch to use_fast = True by default
thomwolf Nov 10, 2020
0f03fdb
update tests for use_fast at True by default
thomwolf Nov 10, 2020
87cb801
fix rag tokenizer test
thomwolf Nov 10, 2020
77ee69f
removing protobuf from required dependencies
thomwolf Nov 10, 2020
1483927
fix NER test for use_fast = True by default
thomwolf Nov 10, 2020
b115646
fixing example tests (Q&A examples use slow tokenizers for now)
thomwolf Nov 10, 2020
56f77e8
protobuf in main deps extras["sentencepiece"] and example deps
thomwolf Nov 10, 2020
6894fc0
fix protobuf install test
thomwolf Nov 10, 2020
2441d40
try to fix seq2seq by switching to slow tokenizers for now
thomwolf Nov 10, 2020
fc2daad
Update src/transformers/tokenization_utils_base.py
thomwolf Nov 10, 2020
002848b
Update src/transformers/tokenization_utils_base.py
thomwolf Nov 10, 2020
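The headline change in this commit series is switching use_fast to True by default, so AutoTokenizer.from_pretrained now prefers the Rust-backed "fast" tokenizers. A minimal sketch of the new behaviour, assuming a checkpoint that ships a fast tokenizer (bert-base-uncased is used purely as an illustration):

from transformers import AutoTokenizer

# After this PR, from_pretrained returns the fast tokenizer whenever one is available.
fast_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Passing use_fast=False keeps the pure-Python "slow" implementation, which is what the
# updated example scripts below do wherever fast tokenizers are not supported yet.
slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)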
6 changes: 5 additions & 1 deletion examples/question-answering/run_squad.py
@@ -730,6 +730,7 @@ def main():
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
do_lower_case=args.do_lower_case,
cache_dir=args.cache_dir if args.cache_dir else None,
use_fast=False, # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handling
)
model = AutoModelForQuestionAnswering.from_pretrained(
args.model_name_or_path,
@@ -778,7 +779,10 @@ def main():

# Load a trained model and vocabulary that you have fine-tuned
model = AutoModelForQuestionAnswering.from_pretrained(args.output_dir) # , force_download=True)
tokenizer = AutoTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)

# SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handling
# So we use use_fast=False here for now, until Fast-tokenizer-compatible examples are out
tokenizer = AutoTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case, use_fast=False)
model.to(args.device)

# Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
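For context on the comment above: fast tokenizers deal with long contexts by returning every overflowing window from a single call, which SquadDataset's feature-building code does not expect. A sketch of that fast-tokenizer overflow API, with illustrative parameter values rather than ones taken from the example script:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # fast by default after this PR
encoded = tokenizer(
    "Who wrote the paper?",           # question
    "The paper was written by ...",   # (potentially long) context
    truncation="only_second",         # truncate only the context, never the question
    max_length=384,
    stride=128,                       # overlap between consecutive context windows
    return_overflowing_tokens=True,   # one feature per window instead of dropping the overflow
    return_offsets_mapping=True,      # character offsets to map predictions back to the text
)
# encoded["overflow_to_sample_mapping"] ties each window back to its original example.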
1 change: 1 addition & 0 deletions examples/question-answering/run_squad_trainer.py
@@ -107,6 +107,7 @@ def main():
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=False, # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handling
)
model = AutoModelForQuestionAnswering.from_pretrained(
model_args.model_name_or_path,
1 change: 1 addition & 0 deletions examples/requirements.txt
@@ -18,3 +18,4 @@ fire
pytest
conllu
sentencepiece != 0.1.92
protobuf
2 changes: 1 addition & 1 deletion examples/seq2seq/test_datasets.py
@@ -197,7 +197,7 @@ def test_distributed_sortish_sampler_splits_indices_between_procs(self):
)
@require_torch_non_multigpu_but_fix_me
def test_dataset_kwargs(self, tok_name):
tokenizer = AutoTokenizer.from_pretrained(tok_name)
tokenizer = AutoTokenizer.from_pretrained(tok_name, use_fast=False)
if tok_name == MBART_TINY:
train_dataset = Seq2SeqDataset(
tokenizer,
9 changes: 3 additions & 6 deletions setup.py
@@ -96,12 +96,12 @@
extras["retrieval"] = ["faiss-cpu", "datasets"]
extras["flax"] = ["jaxlib==0.1.55", "jax>=0.2.0", "flax==0.2.2"]

extras["tokenizers"] = ["tokenizers==0.9.2"]
extras["tokenizers"] = ["tokenizers==0.9.4"]
extras["onnxruntime"] = ["onnxruntime>=1.4.0", "onnxruntime-tools>=1.4.2"]

extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"]

extras["sentencepiece"] = ["sentencepiece==0.1.91"]
extras["sentencepiece"] = ["sentencepiece==0.1.91", "protobuf"]
extras["retrieval"] = ["faiss-cpu", "datasets"]
extras["testing"] = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil"] + extras["retrieval"]
# sphinx-rtd-theme==0.5.0 introduced big changes in the style.
@@ -129,7 +129,7 @@
packages=find_packages("src"),
install_requires=[
"numpy",
"tokenizers == 0.9.3",
"tokenizers == 0.9.4",
# dataclasses for Python versions that don't have it
"dataclasses;python_version<'3.7'",
# utilities from PyPA to e.g. compare versions
@@ -142,9 +142,6 @@
"tqdm >= 4.27",
# for OpenAI GPT
"regex != 2019.12.17",
# for SentencePiece models
"sentencepiece == 0.1.91",
"protobuf",
# for XLM
"sacremoses",
],
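Because sentencepiece and protobuf move out of install_requires and into the sentencepiece extra (pip install transformers[sentencepiece]), code that needs them has to check for them at runtime. A sketch of the guard pattern, using the availability helpers this PR adds to file_utils:

from transformers.file_utils import is_protobuf_available, is_sentencepiece_available

if is_sentencepiece_available() and is_protobuf_available():
    # Slow-to-fast conversion needs both: sentencepiece for the slow tokenizer itself,
    # and protobuf to parse its .model file (see convert_slow_tokenizer.py below).
    from transformers.convert_slow_tokenizer import SpmConverter  # noqa: F401
else:
    print("Optional dependencies missing; install them with: pip install transformers[sentencepiece]")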
20 changes: 9 additions & 11 deletions src/transformers/convert_slow_tokenizer.py
@@ -24,10 +24,7 @@
from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
from tokenizers.models import BPE, Unigram, WordPiece

# from transformers.tokenization_openai import OpenAIGPTTokenizer
from transformers.utils import sentencepiece_model_pb2 as model

from .file_utils import requires_sentencepiece
from .file_utils import requires_protobuf, requires_sentencepiece


class SentencePieceExtractor:
@@ -64,12 +61,6 @@ def check_number_comma(piece: str) -> bool:
return len(piece) < 2 or piece[-1] != "," or not piece[-2].isdigit()


def get_proto(filename: str):
m = model.ModelProto()
m.ParseFromString(open(filename, "rb").read())
return m


class Converter:
def __init__(self, original_tokenizer):
self.original_tokenizer = original_tokenizer
@@ -292,8 +283,15 @@ def converted(self) -> Tokenizer:

class SpmConverter(Converter):
def __init__(self, *args):
requires_protobuf(self)

super().__init__(*args)
self.proto = get_proto(self.original_tokenizer.vocab_file)

from .utils import sentencepiece_model_pb2 as model_pb2

m = model_pb2.ModelProto()
m.ParseFromString(open(self.original_tokenizer.vocab_file, "rb").read())
self.proto = m

def vocab(self, proto):
return [(piece.piece, piece.score) for piece in proto.pieces]
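With get_proto removed, SpmConverter now imports the generated protobuf module lazily and parses the sentencepiece model itself. A standalone sketch of that parsing step (the path below is a placeholder):

from transformers.utils import sentencepiece_model_pb2 as model_pb2

vocab_file = "/path/to/spiece.model"  # placeholder: any sentencepiece .model file
proto = model_pb2.ModelProto()
with open(vocab_file, "rb") as f:     # the PR parses open(...).read() inline; a context manager works the same way
    proto.ParseFromString(f.read())
# proto.pieces carries the (piece, score) pairs the converters turn into a fast-tokenizer vocabulary.
print(len(proto.pieces), proto.trainer_spec.model_type)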
6 changes: 5 additions & 1 deletion src/transformers/data/processors/squad.py
@@ -8,7 +8,7 @@

from ...file_utils import is_tf_available, is_torch_available
from ...tokenization_bert import whitespace_tokenize
from ...tokenization_utils_base import PreTrainedTokenizerBase, TruncationStrategy
from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy
from ...utils import logging
from .utils import DataProcessor

@@ -765,6 +765,7 @@ class SquadFeatures:
token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer.
start_position: start of the answer token index
end_position: end of the answer token index
encoding: optionally store the BatchEncoding with the fast-tokenizer alignment methods.
"""

def __init__(
@@ -784,6 +785,7 @@ def __init__(
end_position,
is_impossible,
qas_id: str = None,
encoding: BatchEncoding = None,
):
self.input_ids = input_ids
self.attention_mask = attention_mask
@@ -803,6 +805,8 @@ def __init__(
self.is_impossible = is_impossible
self.qas_id = qas_id

self.encoding = encoding


class SquadResult:
"""
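Keeping the encoding on SquadFeatures exposes the fast-tokenizer alignment helpers to downstream code such as the question-answering pipeline. A sketch of the kind of lookups this enables, shown on a fresh BatchEncoding since that is the same object the feature now stores:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # fast tokenizer
enc = tokenizer("Who wrote it?", "It was written by someone.")

print(enc.sequence_ids())     # None for special tokens, 0 for question tokens, 1 for context tokens
print(enc.token_to_chars(2))  # CharSpan with the character offsets of token 2 in its original string
print(enc.char_to_token(5))   # index of the token covering character 5 (of the first sequence by default)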
27 changes: 27 additions & 0 deletions src/transformers/file_utils.py
@@ -185,6 +185,15 @@
_sentencepiece_available = False


try:
import google.protobuf # noqa: F401

_protobuf_available = True

except ImportError:
_protobuf_available = False


try:
import tokenizers # noqa: F401

@@ -270,6 +279,10 @@ def is_sentencepiece_available():
return _sentencepiece_available


def is_protobuf_available():
return _protobuf_available


def is_tokenizers_available():
return _tokenizers_available

@@ -330,6 +343,14 @@ def wrapper(*args, **kwargs):
"""


# docstyle-ignore
PROTOBUF_IMPORT_ERROR = """
{0} requires the protobuf library but it was not found in your environment. Check out the instructions on the
installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
that match your environment.
"""


# docstyle-ignore
FAISS_IMPORT_ERROR = """
{0} requires the faiss library but it was not found in your environment. Checkout the instructions on the
@@ -420,6 +441,12 @@ def requires_sentencepiece(obj):
raise ImportError(SENTENCEPIECE_IMPORT_ERROR.format(name))


def requires_protobuf(obj):
name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
if not is_protobuf_available():
raise ImportError(PROTOBUF_IMPORT_ERROR.format(name))


def add_start_docstrings(*docstr):
def docstring_decorator(fn):
fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
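requires_protobuf follows the same shape as the requires_sentencepiece guard shown above: call it in the constructor of anything that depends on the optional package, so users get the install hint instead of a bare ImportError. A sketch with a hypothetical consumer class (SentencePieceModelReader is not part of the library):

from transformers.file_utils import requires_protobuf

class SentencePieceModelReader:
    # Hypothetical example mirroring how SpmConverter calls the guard in its __init__.
    def __init__(self, model_file: str):
        requires_protobuf(self)  # raises ImportError with PROTOBUF_IMPORT_ERROR naming this class if protobuf is absent
        self.model_file = model_file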