Fix DeepSpeedPlugin with IterableDataset #7362

Merged
merged 11 commits on May 7, 2021
12 changes: 12 additions & 0 deletions pytorch_lightning/plugins/training_type/deepspeed.py
@@ -88,6 +88,7 @@ def __init__(
allgather_bucket_size: int = 2e8,
reduce_bucket_size: int = 2e8,
zero_allow_untested_optimizer: bool = True,
logging_batch_size_per_gpu: Optional[int] = 1,
config: Optional[Union[Path, str, dict]] = None,
logging_level: int = logging.WARN,
num_nodes: int = 1,
@@ -148,6 +149,11 @@ def __init__(
zero_allow_untested_optimizer: Allow untested optimizers to be used with ZeRO. Currently, Adam is the only
DeepSpeed-supported optimizer when using ZeRO (default: True)

logging_batch_size_per_gpu: Config value used by DeepSpeed to compute verbose timing for logging
on a per-sample-per-second basis (only displayed if logging=logging.INFO).
To obtain accurate logs, set this to the actual per-GPU batch size (trainer.batch_size).
If set to None, logging_batch_size_per_gpu is inferred from the train DataLoader's BatchSampler.

config: Pass in a deepspeed formatted config dict,
or path to a deepspeed config: https://www.deepspeed.ai/docs/config-json.
All defaults will be ignored if a config is passed in. (Default: ``None``)
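
A minimal usage sketch (not part of this diff), assuming the plugin is passed to the Trainer via the plugins argument; the batch size and the model/dataloader names are placeholders:

import logging

from pytorch_lightning import Trainer
from pytorch_lightning.plugins import DeepSpeedPlugin

# Set logging_batch_size_per_gpu explicitly when the train DataLoader has no
# BatchSampler to infer it from, e.g. when it wraps an IterableDataset.
trainer = Trainer(
    gpus=1,
    plugins=DeepSpeedPlugin(
        logging_batch_size_per_gpu=32,  # match the DataLoader's per-GPU batch size
        logging_level=logging.INFO,     # timing logs are only displayed at INFO level
    ),
)
# trainer.fit(model, train_dataloader)  # with an actual LightningModule and DataLoader
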
@@ -182,6 +188,7 @@ def __init__(
when using ZeRO Stage 3. This allows a single weight file to contain the entire model,
rather than individual sharded weight files.
Disable to save sharded states individually. (Default: True)

"""
if not _DEEPSPEED_AVAILABLE:
raise MisconfigurationException(
@@ -197,6 +204,7 @@ def __init__(
self.config = self._create_default_config(
zero_optimization,
zero_allow_untested_optimizer,
logging_batch_size_per_gpu,
partition_activations=partition_activations,
cpu_checkpointing=cpu_checkpointing,
contiguous_memory_optimization=contiguous_memory_optimization,
@@ -446,6 +454,7 @@ def _create_default_config(
self,
zero_optimization: bool,
zero_allow_untested_optimizer: bool,
logging_batch_size_per_gpu: Optional[int],
partition_activations: bool,
cpu_checkpointing: bool,
contiguous_memory_optimization: bool,
@@ -466,6 +475,9 @@ def _create_default_config(
"zero_optimization": zero_kwargs,
**cfg
}
if logging_batch_size_per_gpu is not None:
    cfg = {"train_micro_batch_size_per_gpu": logging_batch_size_per_gpu, **cfg}
return cfg
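
A small sketch of the effect of this change (illustrative values, assuming a ZeRO-style default config has already been built earlier in the method): when logging_batch_size_per_gpu is not None, the key DeepSpeed reads for its samples-per-second timing is added to the generated config, while the existing keys are left untouched.

# Illustrative only: keys already present in cfg are preserved; the new key is merged in front.
cfg = {"zero_allow_untested_optimizer": True, "zero_optimization": {"stage": 2}}
logging_batch_size_per_gpu = 1

if logging_batch_size_per_gpu is not None:
    cfg = {"train_micro_batch_size_per_gpu": logging_batch_size_per_gpu, **cfg}

print(cfg)
# {'train_micro_batch_size_per_gpu': 1,
#  'zero_allow_untested_optimizer': True,
#  'zero_optimization': {'stage': 2}}
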

def _filepath_to_dir(self, filepath: str) -> str: