Fix DeepSpeedPlugin with IterableDataset #7362

Merged
merged 11 commits on May 7, 2021
12 changes: 12 additions & 0 deletions pytorch_lightning/plugins/training_type/deepspeed.py
@@ -88,6 +88,7 @@ def __init__(
allgather_bucket_size: int = 2e8,
reduce_bucket_size: int = 2e8,
zero_allow_untested_optimizer: bool = True,
logging_batch_size_per_gpu: Optional[int] = 1,
config: Optional[Union[Path, str, dict]] = None,
logging_level: int = logging.WARN,
num_nodes: int = 1,
@@ -148,6 +149,11 @@ def __init__(
zero_allow_untested_optimizer: Allow untested optimizers to be used with ZeRO. Currently, Adam is the only
DeepSpeed-supported optimizer when using ZeRO (default: True)

logging_batch_size_per_gpu: Config value used by DeepSpeed to compute verbose timing for logging
on a per-sample-per-second basis (only displayed if logging=logging.INFO).
To obtain accurate logs, set this to the actual per-GPU batch size (trainer.batch_size).
If set to None, logging_batch_size_per_gpu is inferred from the train DataLoader's BatchSampler.

config: Pass in a deepspeed formatted config dict,
or path to a deepspeed config: https://www.deepspeed.ai/docs/config-json.
All defaults will be ignored if a config is passed in. (Default: ``None``)
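
A minimal usage sketch (not part of this diff), assuming the plugin is passed to the Trainer via the plugins argument; the batch size and the model/dataloader names are placeholders:

import logging

from pytorch_lightning import Trainer
from pytorch_lightning.plugins import DeepSpeedPlugin

# Set logging_batch_size_per_gpu explicitly when the train DataLoader has no
# BatchSampler to infer it from, e.g. when it wraps an IterableDataset.
trainer = Trainer(
    gpus=1,
    plugins=DeepSpeedPlugin(
        logging_batch_size_per_gpu=32,  # match the DataLoader's per-GPU batch size
        logging_level=logging.INFO,     # timing logs are only displayed at INFO level
    ),
)
# trainer.fit(model, train_dataloader)  # with an actual LightningModule and DataLoader
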
@@ -182,6 +188,7 @@ def __init__(
when using ZeRO Stage 3. This allows a single weight file to contain the entire model,
rather than individual sharded weight files.
Disable to save sharded states individually. (Default: True)

"""
if not _DEEPSPEED_AVAILABLE:
raise MisconfigurationException(
@@ -197,6 +204,7 @@ def __init__(
self.config = self._create_default_config(
zero_optimization,
zero_allow_untested_optimizer,
logging_batch_size_per_gpu,
partition_activations=partition_activations,
cpu_checkpointing=cpu_checkpointing,
contiguous_memory_optimization=contiguous_memory_optimization,
@@ -446,6 +454,7 @@ def _create_default_config(
self,
zero_optimization: bool,
zero_allow_untested_optimizer: bool,
logging_batch_size_per_gpu: Optional[int],
partition_activations: bool,
cpu_checkpointing: bool,
contiguous_memory_optimization: bool,
@@ -466,6 +475,9 @@ def _create_default_config(
"zero_optimization": zero_kwargs,
**cfg
}
if logging_batch_size_per_gpu is not None:
    cfg = {"train_micro_batch_size_per_gpu": logging_batch_size_per_gpu, **cfg}
return cfg
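
A small sketch of the effect of this change (illustrative values, assuming a ZeRO-style default config has already been built earlier in the method): when logging_batch_size_per_gpu is not None, the key DeepSpeed reads for its samples-per-second timing is added to the generated config, while the existing keys are left untouched.

# Illustrative only: keys already present in cfg are preserved; the new key is merged in front.
cfg = {"zero_allow_untested_optimizer": True, "zero_optimization": {"stage": 2}}
logging_batch_size_per_gpu = 1

if logging_batch_size_per_gpu is not None:
    cfg = {"train_micro_batch_size_per_gpu": logging_batch_size_per_gpu, **cfg}

print(cfg)
# {'train_micro_batch_size_per_gpu': 1,
#  'zero_allow_untested_optimizer': True,
#  'zero_optimization': {'stage': 2}}
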

def _filepath_to_dir(self, filepath: str) -> str: