2 changes: 2 additions & 0 deletions docs/source/en/internal/trainer_utils.mdx
@@ -22,6 +22,8 @@ Most of those are only useful if you are studying the code of the Trainer in the

 [[autodoc]] IntervalStrategy
 
+[[autodoc]] enable_full_determinism
+
 [[autodoc]] set_seed
 
 [[autodoc]] torch_distributed_zero_first
4 changes: 2 additions & 2 deletions src/transformers/__init__.py
@@ -372,7 +372,7 @@
"TrainerControl",
"TrainerState",
],
"trainer_utils": ["EvalPrediction", "IntervalStrategy", "SchedulerType", "set_seed"],
"trainer_utils": ["EvalPrediction", "IntervalStrategy", "SchedulerType", "enable_full_determinism", "set_seed"],
"training_args": ["TrainingArguments"],
"training_args_seq2seq": ["Seq2SeqTrainingArguments"],
"training_args_tf": ["TFTrainingArguments"],
@@ -2809,7 +2809,7 @@
         TrainerControl,
         TrainerState,
     )
-    from .trainer_utils import EvalPrediction, IntervalStrategy, SchedulerType, set_seed
+    from .trainer_utils import EvalPrediction, IntervalStrategy, SchedulerType, enable_full_determinism, set_seed
     from .training_args import TrainingArguments
     from .training_args_seq2seq import Seq2SeqTrainingArguments
     from .training_args_tf import TFTrainingArguments
7 changes: 5 additions & 2 deletions src/transformers/trainer.py
@@ -115,10 +115,12 @@
     default_compute_objective,
     default_hp_space,
     denumpify_detensorize,
+    enable_full_determinism,
     find_executable_batch_size,
     get_last_checkpoint,
     has_length,
     number_of_arguments,
+    seed_worker,
     set_seed,
     speed_metrics,
 )
@@ -300,7 +302,7 @@ def __init__(
             args = TrainingArguments(output_dir=output_dir)
         self.args = args
         # Seed must be set before instantiating the model when using model
-        set_seed(self.args.seed)
+        enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed)
         self.hp_name = None
         self.deepspeed = None
         self.is_in_train = False
@@ -746,6 +748,7 @@ def get_train_dataloader(self) -> DataLoader:
             drop_last=self.args.dataloader_drop_last,
             num_workers=self.args.dataloader_num_workers,
             pin_memory=self.args.dataloader_pin_memory,
+            worker_init_fn=seed_worker,
         )
 
     def _get_eval_sampler(self, eval_dataset: Dataset) -> Optional[torch.utils.data.Sampler]:
@@ -1250,7 +1253,7 @@ def train(
         model_reloaded = False
         if self.model_init is not None:
             # Seed must be set before instantiating the model when using model_init.
-            set_seed(args.seed)
+            enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed)
             self.model = self.call_model_init(trial)
             model_reloaded = True
         # Reinitializes optimizer and scheduler
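
The `worker_init_fn=seed_worker` hook added above is what makes DataLoader worker processes reproducible: each worker re-seeds `random`/`numpy` from the base seed PyTorch already assigned to it. A minimal standalone sketch of the same idea, not taken from the diff (it seeds `numpy` and `random` directly instead of calling `transformers.set_seed`):

import random

import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset


def seed_worker(_):
    # Each worker derives its seed from the base seed torch already assigned to it
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)


if __name__ == "__main__":
    torch.manual_seed(42)
    dataset = TensorDataset(torch.arange(100, dtype=torch.float32))
    loader = DataLoader(dataset, batch_size=10, num_workers=2, worker_init_fn=seed_worker)
    for _batch in loader:
        pass  # random/numpy calls inside the workers are now deterministic across runs
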
11 changes: 9 additions & 2 deletions src/transformers/trainer_tf.py
@@ -34,7 +34,14 @@

 from .modeling_tf_utils import TFPreTrainedModel
 from .optimization_tf import GradientAccumulator, create_optimizer
-from .trainer_utils import PREFIX_CHECKPOINT_DIR, EvalPrediction, IntervalStrategy, PredictionOutput, set_seed
+from .trainer_utils import (
+    PREFIX_CHECKPOINT_DIR,
+    EvalPrediction,
+    IntervalStrategy,
+    PredictionOutput,
+    enable_full_determinism,
+    set_seed,
+)
 from .training_args_tf import TFTrainingArguments
 from .utils import logging
 
@@ -134,7 +141,7 @@ def __init__(
"see https://www.comet.ml/docs/python-sdk/huggingface/"
)

set_seed(self.args.seed)
enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed)

def get_train_tfdataset(self) -> tf.data.Dataset:
"""
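
For the TensorFlow Trainer the same flag routes through `enable_full_determinism`, which (as `trainer_utils.py` below shows) ends up calling `tf.config.experimental.enable_op_determinism()`. A hedged sketch of that TF-side mechanism, using standard TF APIs (`enable_op_determinism` requires TF >= 2.8):

import tensorflow as tf

tf.keras.utils.set_random_seed(42)              # seeds the python, numpy and tf RNGs
tf.config.experimental.enable_op_determinism()  # force deterministic op implementations

x = tf.random.normal((4, 4))
print(tf.reduce_sum(x @ tf.transpose(x)))       # identical value on repeated runs
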
33 changes: 33 additions & 0 deletions src/transformers/trainer_utils.py
@@ -47,6 +47,39 @@
     import tensorflow as tf


+def seed_worker(_):
+    """
+    Helper function to set worker seed during Dataloader initialization.
+    """
+    worker_seed = torch.initial_seed() % 2**32
+    set_seed(worker_seed)
+
+
+def enable_full_determinism(seed: int):
+    """
+    Helper function for reproducible behavior during distributed training. See
+    - https://pytorch.org/docs/stable/notes/randomness.html for pytorch
+    - https://www.tensorflow.org/api_docs/python/tf/config/experimental/enable_op_determinism for tensorflow
+    """
+    # set seed first
+    set_seed(seed)
+
+    if is_torch_available():
+        # Enable PyTorch deterministic mode. This potentially requires either the environment
+        # variable 'CUDA_LAUNCH_BLOCKING' or 'CUBLAS_WORKSPACE_CONFIG' to be set,
+        # depending on the CUDA version, so we set them both here
+        os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+        os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
+        torch.use_deterministic_algorithms(True)
+
+        # Enable CUDNN deterministic mode
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+
+    if is_tf_available():
+        tf.config.experimental.enable_op_determinism()
+
+
 def set_seed(seed: int):
     """
     Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch` and/or `tf` (if installed).
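
In short, `enable_full_determinism(seed)` is `set_seed(seed)` plus deterministic-kernel settings (the CUDA/cuBLAS environment variables, `torch.use_deterministic_algorithms`, the cuDNN flags, and TF op determinism). A small sketch of calling it directly, assuming a build that includes this PR:

import torch

from transformers.trainer_utils import enable_full_determinism, set_seed

set_seed(42)                 # reproducible RNG state only
enable_full_determinism(42)  # same seeding, plus deterministic algorithms and env vars

print(torch.are_deterministic_algorithms_enabled())  # True
x = torch.randn(8, 8)
print((x @ x.T).sum())  # identical across runs on the same hardware and library versions
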
9 changes: 9 additions & 0 deletions src/transformers/training_args.py
@@ -446,6 +446,9 @@ class TrainingArguments:
         auto_find_batch_size (`bool`, *optional*, defaults to `False`)
             Whether to find a batch size that will fit into memory automatically through exponential decay, avoiding
             CUDA Out-of-Memory errors. Requires accelerate to be installed (`pip install accelerate`)
+        full_determinism (`bool`, *optional*, defaults to `False`)
+            If `True`, [`enable_full_determinism`] is called instead of [`set_seed`] to ensure reproducible results in
+            distributed training
     """
 
     output_dir: str = field(
@@ -814,6 +817,12 @@ class TrainingArguments:
"help": "Whether to automatically decrease the batch size in half and rerun the training loop again each time a CUDA Out-of-Memory was reached"
},
)
full_determinism: bool = field(
default=False,
metadata={
"help": "Whether to call enable_full_determinism instead of set_seed for reproducibility in distributed training"
},
)

def __post_init__(self):
# Handle --use_env option in torch.distributed.launch (local_rank not passed as an arg then).
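
Putting it together, users opt in through the new `full_determinism` training argument; the Trainer then calls `enable_full_determinism(args.seed)` at init instead of `set_seed(args.seed)`, and DataLoader workers are seeded via `seed_worker`. A usage sketch, assuming a build with this PR (paths and values below are illustrative):

from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",             # illustrative path
    seed=42,
    full_determinism=True,        # new flag: use enable_full_determinism instead of set_seed
    dataloader_num_workers=2,     # workers are seeded through the new seed_worker hook
)
print(args.full_determinism)  # True
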