huggingface · lvwerra · Oct 15, 2020 · Oct 15, 2020 · Oct 15, 2020 · Oct 15, 2020
diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
@@ -1045,9 +1045,9 @@ def forward(
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Labels for computing the masked language modeling loss.
-            Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring).
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens
-            with labels in ``[0, ..., config.vocab_size]``.
+            Indices should be in ``[0, ..., config.vocab_size - 1]`` (see ``input_ids`` docstring). Tokens with
+            indices set to ``config.pad_token_id`` are ignored (masked); the loss is only computed for the tokens
+            with labels in ``[0, ..., config.vocab_size - 1]`` excluding ``config.pad_token_id``.
 
         Returns:
 
@@ -1090,6 +1090,7 @@ def forward(
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         if labels is not None:
+            assert labels.min() >= 0, f'negative labels are not supported, got {labels.min()}'
             use_cache = False
             if decoder_input_ids is None:
                 decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id)
@@ -1110,8 +1111,7 @@ def forward(
 
         masked_lm_loss = None
         if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            # TODO(SS): do we need to ignore pad tokens in labels?
+            loss_fct = CrossEntropyLoss(ignore_index=self.config.pad_token_id)
             masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
 
         if not return_dict: