Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/transformers/models/t5/modeling_t5.py
Original file line number Diff line number Diff line change
Expand Up @@ -1746,7 +1746,7 @@ def forward(
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss(ignore_index=-100)
loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)).to(labels.device), labels.view(-1))
Comment thread
younesbelkada marked this conversation as resolved.
Outdated
# TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666

if not return_dict:
Expand Down
13 changes: 13 additions & 0 deletions src/transformers/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,19 @@ def __init__(
else:
self.is_model_parallel = False

if (
getattr(model, "hf_device_map", None) is not None
and len(set(model.hf_device_map.values())) > 1

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe actually check the number of GPUs, because this could be one GPU and the CPU here.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense! Fixed in 5eb72b4

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Multi-device placement should only be on GPUs for naive pipelining to work, right? Offloading to CPU/disk won't work — isn't that the case?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think offloading to CPU/disk won't work, yes. I am also unsure whether CPU/disk offload training works out of the box with accelerate (without DeepSpeed).

and not self.is_model_parallel
):
self.is_model_parallel = True

# warn users
logger.warning(
Comment thread
younesbelkada marked this conversation as resolved.
Outdated
"You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set",
" to `True` to avoid any unexpected behavior such as device placement mismatching.",
)

# At this stage the model is already loaded
if getattr(model, "is_loaded_in_8bit", False):
if getattr(model, "_is_int8_training_enabled", False):
Expand Down