8 changes: 4 additions & 4 deletions README.md
@@ -521,12 +521,12 @@ Here is a conversion example from `BertAdam` with a linear warmup and decay schedule
 # Parameters:
 lr = 1e-3
 max_grad_norm = 1.0
-num_total_steps = 1000
+num_training_steps = 1000
 num_warmup_steps = 100
-warmup_proportion = float(num_warmup_steps) / float(num_total_steps) # 0.1
+warmup_proportion = float(num_warmup_steps) / float(num_training_steps) # 0.1

 ### Previously BertAdam optimizer was instantiated like this:
-optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_total_steps)
+optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_training_steps)
 ### and used like this:
 for batch in train_data:
     loss = model(batch)
@@ -535,7 +535,7 @@ for batch in train_data:

 ### In Transformers, optimizer and schedules are split and instantiated like this:
 optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False
-scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps) # PyTorch scheduler
+scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps) # PyTorch scheduler
 ### and used like this:
 for batch in train_data:
     model.train()
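Read together, the two hunks above imply the following complete training loop after migration. This is a sketch assembled from the snippets in this README section; `model` and `train_data` are assumed to be defined as in the surrounding example.

```python
# Sketch of the migrated training loop implied by the two hunks above;
# `model` and `train_data` are assumed from the surrounding README example.
import torch
from transformers import AdamW, get_linear_schedule_with_warmup

lr = 1e-3
max_grad_norm = 1.0
num_training_steps = 1000
num_warmup_steps = 100

optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # reproduce BertAdam-specific behavior
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=num_warmup_steps,
                                            num_training_steps=num_training_steps)

for batch in train_data:
    model.train()
    loss = model(batch)
    loss.backward()
    # BertAdam clipped gradients internally; with AdamW the caller clips explicitly.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    optimizer.step()
    scheduler.step()  # update the learning rate after the optimizer step
    optimizer.zero_grad()
```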
14 changes: 5 additions & 9 deletions docs/source/main_classes/optimizer_schedules.rst
@@ -18,37 +18,33 @@ Schedules
 Learning Rate Schedules
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-.. autoclass:: transformers.ConstantLRSchedule
-    :members:
+.. autofunction:: transformers.get_constant_schedule


-.. autoclass:: transformers.WarmupConstantSchedule
-    :members:
+.. autofunction:: transformers.get_constant_schedule_with_warmup

 .. image:: /imgs/warmup_constant_schedule.png
     :target: /imgs/warmup_constant_schedule.png
     :alt:


-.. autoclass:: transformers.WarmupCosineSchedule
+.. autofunction:: transformers.get_cosine_schedule_with_warmup
     :members:

 .. image:: /imgs/warmup_cosine_schedule.png
     :target: /imgs/warmup_cosine_schedule.png
     :alt:


-.. autoclass:: transformers.WarmupCosineWithHardRestartsSchedule
-    :members:
+.. autofunction:: transformers.get_cosine_with_hard_restarts_schedule_with_warmup

 .. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
     :target: /imgs/warmup_cosine_hard_restarts_schedule.png
     :alt:



-.. autoclass:: transformers.WarmupLinearSchedule
-    :members:
+.. autofunction:: transformers.get_linear_schedule_with_warmup

 .. image:: /imgs/warmup_linear_schedule.png
     :target: /imgs/warmup_linear_schedule.png
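For orientation, each renamed helper is a plain function that returns a standard PyTorch `LambdaLR` scheduler. Below is a minimal sketch of the call shapes; the function names and required arguments follow this PR, while the `num_cycles` argument to the two cosine variants is an assumption:

```python
# Minimal sketch of the renamed schedule helpers; each returns a PyTorch LambdaLR.
# In practice you would create only one scheduler per optimizer.
import torch
from transformers import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup,
                          get_cosine_schedule_with_warmup,
                          get_cosine_with_hard_restarts_schedule_with_warmup,
                          get_linear_schedule_with_warmup)

model = torch.nn.Linear(10, 2)  # stand-in model for illustration
optimizer = AdamW(model.parameters(), lr=1e-3)

scheduler = get_constant_schedule(optimizer)
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=100)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=1000)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=1000)
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
    optimizer, num_warmup_steps=100, num_training_steps=1000, num_cycles=2)
```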
8 changes: 4 additions & 4 deletions docs/source/migration.md
@@ -84,12 +84,12 @@ Here is a conversion example from `BertAdam` with a linear warmup and decay schedule
 # Parameters:
 lr = 1e-3
 max_grad_norm = 1.0
-num_total_steps = 1000
+num_training_steps = 1000
 num_warmup_steps = 100
-warmup_proportion = float(num_warmup_steps) / float(num_total_steps) # 0.1
+warmup_proportion = float(num_warmup_steps) / float(num_training_steps) # 0.1

 ### Previously BertAdam optimizer was instantiated like this:
-optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_total_steps)
+optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_training_steps)
 ### and used like this:
 for batch in train_data:
     loss = model(batch)
@@ -98,7 +98,7 @@ for batch in train_data:

 ### In Transformers, optimizer and schedules are split and instantiated like this:
 optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False
-scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps) # PyTorch scheduler
+scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps) # PyTorch scheduler
 ### and used like this:
 for batch in train_data:
     loss = model(batch)
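One practical consequence of the optimizer/schedule split, sketched below: the returned schedule is an ordinary PyTorch scheduler, so it can be checkpointed alongside the optimizer via `state_dict()`. `optimizer` and `scheduler` here are the objects built in the example above, and the file name is illustrative.

```python
# Sketch: the new schedules checkpoint like any PyTorch scheduler.
# `optimizer` and `scheduler` come from the example above; the path is illustrative.
import torch

torch.save({"optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict()}, "checkpoint.pt")

state = torch.load("checkpoint.pt")
optimizer.load_state_dict(state["optimizer"])
scheduler.load_state_dict(state["scheduler"])
```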
4 changes: 2 additions & 2 deletions examples/contrib/run_openai_gpt.py
@@ -41,7 +41,7 @@

 from transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
                           AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME,
-                          WarmupLinearSchedule)
+                          get_linear_schedule_with_warmup)

 ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"
@@ -211,7 +211,7 @@ def tokenize_and_encode(obj):
         {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
     ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

     if args.do_train:
         nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
4 changes: 2 additions & 2 deletions examples/contrib/run_swag.py
@@ -42,7 +42,7 @@
 from transformers import (WEIGHTS_NAME, BertConfig,
                           BertForMultipleChoice, BertTokenizer)

-from transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, get_linear_schedule_with_warmup

 logger = logging.getLogger(__name__)
@@ -322,7 +322,7 @@ def train(args, train_dataset, model, tokenizer):
         {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
     ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
     if args.fp16:
         try:
             from apex import amp
8 changes: 4 additions & 4 deletions examples/distillation/distiller.py
@@ -35,7 +35,7 @@
 except:
     from tensorboardX import SummaryWriter

-from transformers import WarmupLinearSchedule
+from transformers import get_linear_schedule_with_warmup

 from utils import logger
 from lm_seqs_dataset import LmSeqsDataset
@@ -137,9 +137,9 @@ def __init__(self,
                                betas=(0.9, 0.98))

         warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
-        self.scheduler = WarmupLinearSchedule(self.optimizer,
-                                              warmup_steps=warmup_steps,
-                                              t_total=num_train_optimization_steps)
+        self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
+                                                         num_warmup_steps=warmup_steps,
+                                                         num_training_steps=num_train_optimization_steps)

         if self.fp16:
             try:
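The distiller keeps its warmup configured as a proportion and converts it to a step count before building the schedule. A small sketch of that conversion, with made-up numbers standing in for `num_train_optimization_steps` and `params.warmup_prop`:

```python
# Sketch of the proportion-to-steps conversion used above; values are illustrative.
import math

num_train_optimization_steps = 5000   # stands in for the distiller's computed total
warmup_prop = 0.05                    # stands in for params.warmup_prop

warmup_steps = math.ceil(num_train_optimization_steps * warmup_prop)
assert warmup_steps == 250            # ceil(5000 * 0.05)
```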
4 changes: 2 additions & 2 deletions examples/distillation/run_squad_w_distillation.py
@@ -46,7 +46,7 @@
                           XLNetTokenizer,
                           DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)

-from transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, get_linear_schedule_with_warmup

 from ..utils_squad import (read_squad_examples, convert_examples_to_features,
                            RawResult, write_predictions,
@@ -101,7 +101,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
         {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
     ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
     if args.fp16:
         try:
             from apex import amp
4 changes: 2 additions & 2 deletions examples/run_glue.py
@@ -49,7 +49,7 @@
                           DistilBertForSequenceClassification,
                           DistilBertTokenizer)

-from transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, get_linear_schedule_with_warmup

 from transformers import glue_compute_metrics as compute_metrics
 from transformers import glue_output_modes as output_modes
@@ -100,7 +100,7 @@ def train(args, train_dataset, model, tokenizer):
         {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
     ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
     if args.fp16:
         try:
             from apex import amp
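In each of these example scripts, `t_total` is the total number of optimizer updates, derived from the dataloader length, gradient accumulation, and epoch count. A sketch of that arithmetic with illustrative numbers; the real values come from `args` and the `DataLoader`:

```python
# Sketch of how t_total is derived in these scripts; the numbers are illustrative.
batches_per_epoch = 1000              # len(train_dataloader)
gradient_accumulation_steps = 4
num_train_epochs = 3

# One optimizer update every `gradient_accumulation_steps` batches.
t_total = batches_per_epoch // gradient_accumulation_steps * num_train_epochs
assert t_total == 750

warmup_steps = int(0.1 * t_total)     # e.g. warm up over the first 10% of updates
```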
4 changes: 2 additions & 2 deletions examples/run_lm_finetuning.py
@@ -42,7 +42,7 @@

 from tqdm import tqdm, trange

-from transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule,
+from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                           BertConfig, BertForMaskedLM, BertTokenizer,
                           GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
                           OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
@@ -185,7 +185,7 @@ def train(args, train_dataset, model, tokenizer):
         {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
     ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
     if args.fp16:
         try:
             from apex import amp
4 changes: 2 additions & 2 deletions examples/run_multiple_choice.py
@@ -43,7 +43,7 @@
                           XLNetTokenizer, RobertaConfig,
                           RobertaForMultipleChoice, RobertaTokenizer)

-from transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, get_linear_schedule_with_warmup

 from utils_multiple_choice import (convert_examples_to_features, processors)
@@ -101,7 +101,7 @@ def train(args, train_dataset, model, tokenizer):
         {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
     ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
     if args.fp16:
         try:
             from apex import amp
4 changes: 2 additions & 2 deletions examples/run_ner.py
@@ -33,7 +33,7 @@
 from tqdm import tqdm, trange
 from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file

-from transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, get_linear_schedule_with_warmup
 from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, BertTokenizer
 from transformers import RobertaConfig, RobertaForTokenClassification, RobertaTokenizer
@@ -80,7 +80,7 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
         {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
     ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
     if args.fp16:
         try:
             from apex import amp
4 changes: 2 additions & 2 deletions examples/run_squad.py
@@ -45,7 +45,7 @@
                           XLNetTokenizer,
                           DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)

-from transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, get_linear_schedule_with_warmup

 from utils_squad import (read_squad_examples, convert_examples_to_features,
                          RawResult, write_predictions,
@@ -100,7 +100,7 @@ def train(args, train_dataset, model, tokenizer):
         {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
     ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
     if args.fp16:
         try:
             from apex import amp
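The `args.fp16` branch that follows the scheduler in each of these scripts hands the model and optimizer to NVIDIA apex. A hedged sketch of that pattern as the scripts use it; `args`, `model`, `optimizer`, and `loss` come from the surrounding script:

```python
# Sketch of the fp16 path in these scripts; apex is an optional external dependency.
import torch

if args.fp16:
    try:
        from apex import amp
    except ImportError:
        raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
    model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

# Later, in the training loop, the loss is scaled through amp before backward:
if args.fp16:
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
else:
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
```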
4 changes: 2 additions & 2 deletions templates/adding_a_new_example_script/run_xxx.py
@@ -43,7 +43,7 @@
                           XLNetTokenizer,
                           DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)

-from transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, get_linear_schedule_with_warmup

 from utils_squad import (read_squad_examples, convert_examples_to_features,
                          RawResult, write_predictions,
@@ -98,7 +98,7 @@ def train(args, train_dataset, model, tokenizer):
         {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
     ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
     if args.fp16:
         try:
             from apex import amp
4 changes: 2 additions & 2 deletions transformers/__init__.py
@@ -97,8 +97,8 @@
 from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model

 # Optimization
-from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
-                           WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
+from .optimization import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup,
+                           get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup)


 # TensorFlow
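Because this commit drops the old class names from `transformers.__init__`, downstream code that must run on both sides of the rename can feature-detect at import time. A hypothetical compatibility shim, not part of this PR:

```python
# Hypothetical compatibility shim for code spanning both APIs; not part of this PR.
try:
    from transformers import get_linear_schedule_with_warmup
except ImportError:  # older transformers: fall back to the class-based schedule
    from transformers import WarmupLinearSchedule

    def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
        return WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps,
                                    t_total=num_training_steps)
```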