[feat] Add sync_context and sync to nn.Metric #302

Merged on Jun 21, 2021 (49 commits)
Changes from 40 commits

Commits
80575de
wip
tchaton Jun 17, 2021
904ecec
add _apply_sync to nn.Metric
tchaton Jun 17, 2021
71ca9be
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 17, 2021
e4d99d8
move to context manager
tchaton Jun 17, 2021
94d3450
Merge branch 'apply_sync_fn' of https://github.com/PyTorchLightning/m…
tchaton Jun 17, 2021
bfcbc74
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 17, 2021
41a60e7
resolve flake8
tchaton Jun 17, 2021
b0498b4
Merge branch 'apply_sync_fn' of https://github.com/PyTorchLightning/m…
tchaton Jun 17, 2021
94fab1b
add sync
tchaton Jun 17, 2021
31498ef
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 17, 2021
31563dc
update
tchaton Jun 17, 2021
7d24123
Merge branch 'apply_sync_fn' of https://github.com/PyTorchLightning/m…
tchaton Jun 17, 2021
15e6d9a
update on comments
tchaton Jun 18, 2021
b3d5ec5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 18, 2021
d4367db
Merge branch 'master' into apply_sync_fn
tchaton Jun 18, 2021
fc42bbe
update
tchaton Jun 18, 2021
0dcb041
update
tchaton Jun 18, 2021
980329d
Merge branch 'apply_sync_fn' of https://github.com/PyTorchLightning/m…
tchaton Jun 18, 2021
3b4e838
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 18, 2021
140aeeb
add restore_cache
tchaton Jun 18, 2021
1cf6a44
Merge branch 'apply_sync_fn' of https://github.com/PyTorchLightning/m…
tchaton Jun 18, 2021
3ab11cd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 18, 2021
ca04cfb
add a sync test
tchaton Jun 18, 2021
f2ae287
Merge branch 'apply_sync_fn' of https://github.com/PyTorchLightning/m…
tchaton Jun 18, 2021
45b1b1f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 18, 2021
8aa2a74
resolve flake8
tchaton Jun 18, 2021
0f2ed93
resolve loading
tchaton Jun 18, 2021
de32cbf
Merge branch 'apply_sync_fn' of https://github.com/PyTorchLightning/m…
tchaton Jun 18, 2021
25bfbf3
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 18, 2021
7222aa3
resolve flake8
tchaton Jun 18, 2021
6dd7705
Update torchmetrics/metric.py
tchaton Jun 18, 2021
da215ad
remove _update_signature
tchaton Jun 18, 2021
dc8e699
Merge branch 'apply_sync_fn' of https://github.com/PyTorchLightning/m…
tchaton Jun 18, 2021
fe456f2
Apply suggestions from code review
Borda Jun 18, 2021
303e829
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 18, 2021
586ae75
update on comments
tchaton Jun 18, 2021
a460801
Merge branch 'apply_sync_fn' of https://github.com/PyTorchLightning/m…
tchaton Jun 18, 2021
409e20a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 18, 2021
71fad52
add missing is_distributed_fn
tchaton Jun 18, 2021
b7e2030
Merge branch 'apply_sync_fn' of https://github.com/PyTorchLightning/m…
tchaton Jun 18, 2021
e72de7d
update on comments
tchaton Jun 18, 2021
11a3ab8
Update torchmetrics/metric.py
carmocca Jun 18, 2021
d9c0a53
resolve failing test
tchaton Jun 18, 2021
f37e77f
Merge branch 'apply_sync_fn' of https://github.com/PyTorchLightning/m…
tchaton Jun 18, 2021
6e7e3a8
Deepsource smells
carmocca Jun 18, 2021
7ee31d0
Apply suggestions from code review
Borda Jun 21, 2021
99d99e1
update
tchaton Jun 21, 2021
0df9659
Merge branch 'apply_sync_fn' of https://github.com/PyTorchLightning/m…
tchaton Jun 21, 2021
9872ddd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 21, 2021
56 changes: 56 additions & 0 deletions tests/bases/test_ddp.py
@@ -11,7 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
from copy import deepcopy
from unittest import mock

import pytest
import torch
@@ -116,3 +119,56 @@ def compute(self):
def test_non_contiguous_tensors():
""" Test that gather_all operation works for non contiguous tensors """
torch.multiprocessing.spawn(_test_non_contiguous_tensors, args=(2, ), nprocs=2)


def _test_state_dict_is_synced(rank, worldsize, tmpdir):
setup_ddp(rank, worldsize)

class DummyCatMetric(Metric):

def __init__(self):
super().__init__()
self.add_state("x", torch.tensor(0), dist_reduce_fx=torch.sum)
self.add_state("c", torch.tensor(0), dist_reduce_fx=torch.sum)

def update(self, x):
self.x += x
self.c += 1

def compute(self):
return self.x / self.c

metric = DummyCatMetric()
metric.persistent(True)

steps = 5
for i in range(steps):
metric(i)
state_dict = metric.state_dict()
print(state_dict)

sum = i * (i + 1) / 2
assert state_dict["x"] == sum * worldsize
assert metric.x == sum
assert metric.c == (i + 1)
assert state_dict["c"] == metric.c * worldsize

def reload_state_dict(state_dict, expected_x, expected_c):
metric = DummyCatMetric()
metric.load_state_dict(state_dict)
assert metric.x == expected_x
assert metric.c == expected_c

with mock.patch.dict(os.environ, {"GLOBAL_RANK": str(rank)}):
reload_state_dict(deepcopy(state_dict), 20 if not rank else 0, 10 if not rank else 0)

reload_state_dict(deepcopy(state_dict), 20, 10)


@pytest.mark.skipif(sys.platform == "win32", reason="DDP not available on windows")
def test_state_dict_is_synced(tmpdir):
"""
This test asserts taht metric are synced while creating the state
dict but restored after to continue accumulation.
Borda marked this conversation as resolved.
Show resolved Hide resolved
"""
torch.multiprocessing.spawn(_test_state_dict_is_synced, args=(2, tmpdir), nprocs=2)
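
For reference, the expected values asserted in _test_state_dict_is_synced follow from the closed-form sum of the first i integers; a quick standalone check of that arithmetic (illustration only, not part of the diff):

# With worldsize = 2 and updates 0, 1, ..., 4, each rank ends with x = 10 and c = 5,
# so the synced state_dict is expected to hold x = 20 and c = 10.
worldsize, steps = 2, 5
x_per_rank = sum(range(steps))                  # 0 + 1 + 2 + 3 + 4 = 10
assert x_per_rank == (steps - 1) * steps / 2    # i * (i + 1) / 2 with i = steps - 1
assert x_per_rank * worldsize == 20             # value asserted for "x" in the state_dict
assert steps * worldsize == 10                  # value asserted for "c" in the state_dict
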
156 changes: 120 additions & 36 deletions torchmetrics/metric.py
@@ -14,10 +14,12 @@
import functools
import inspect
import operator
import os
from abc import ABC, abstractmethod
from collections.abc import Sequence
from contextlib import contextmanager
from copy import deepcopy
from typing import Any, Callable, List, Optional, Union
from typing import Any, Callable, Dict, List, Optional, Union

import torch
from torch import Tensor, nn
@@ -28,6 +30,10 @@
from torchmetrics.utilities.imports import _LIGHTNING_AVAILABLE, _compare_version


def is_distributed_fn() -> bool:
return torch.distributed.is_available() and torch.distributed.is_initialized()


class Metric(nn.Module, ABC):
"""
Base class for all metrics present in the Metrics API.
@@ -83,6 +89,7 @@ def __init__(
self.process_group = process_group
self.dist_sync_fn = dist_sync_fn
self._to_sync = True
self._restore_cache = True

self._update_signature = inspect.signature(self.update)
self.update = self._wrap_update(self.update)
@@ -169,6 +176,9 @@ def forward(self, *args, **kwargs):

if self.compute_on_step:
self._to_sync = self.dist_sync_on_step
# skip restore cache operation from compute
# as cache is stored below.
self._restore_cache = False

# save context before switch
cache = {attr: getattr(self, attr) for attr in self._defaults}
@@ -181,27 +191,31 @@ def forward(self, *args, **kwargs):
# restore context
for attr, val in cache.items():
setattr(self, attr, val)

self._restore_cache = True
self._to_sync = True
self._computed = None

return self._forward_cache

def _sync_dist(self, dist_sync_fn=gather_all_tensors):
def _sync_dist(self, dist_sync_fn: Callable = gather_all_tensors, process_group: Optional[Any] = None):
input_dict = {attr: getattr(self, attr) for attr in self._reductions}

for attr, reduction_fn in self._reductions.items():
# pre-concatenate metric states that are lists to reduce number of all_gather operations
if reduction_fn == dim_zero_cat and isinstance(input_dict[attr], list) and len(input_dict[attr]) > 1:
input_dict[attr] = [dim_zero_cat(input_dict[attr])]

output_dict = apply_to_collection(
input_dict,
Tensor,
dist_sync_fn,
group=self.process_group,
group=process_group or self.process_group,
)

for attr, reduction_fn in self._reductions.items():
# pre-processing ops (stack or flatten for inputs)
if isinstance(output_dict[attr][0], Tensor):
if isinstance(output_dict[attr], Sequence) and isinstance(output_dict[attr][0], Tensor):

Contributor:
This check is not safe; we're seeing errors as a result:

if isinstance(output_dict[attr], Sequence) and isinstance(output_dict[attr][0], Tensor):

IndexError: list index out of range

Contributor:
Removed in #311

output_dict[attr] = torch.stack(output_dict[attr])
elif isinstance(output_dict[attr][0], list):
output_dict[attr] = _flatten(output_dict[attr])
@@ -221,6 +235,80 @@ def wrapped_func(*args, **kwargs):

return wrapped_func

def sync(
self,
dist_sync_fn: Optional[Callable] = None,
process_group: Optional[Any] = None,
should_sync: bool = True,

Contributor:
I'm confused to see should_sync=True|False.
If you set False, this method does nothing, so it's the same as not calling sync in the first place!
Then, if you set True but dist is not available, it will also do nothing, so it does not do what the user wants.

Contributor (author):
should_sync means "sync if possible" :) Modified the docstring to reflect this.

Contributor:
That means there are now two overlapping arguments: dist_sync_fn and should_sync.
You can do this: should_sync=False and dist_sync_fn=mean.
What will happen now? Will it sync or not?
@PyTorchLightning/core-metrics be aware of these cases.

Contributor:
Yeah, I agree. I wonder if the main usage of should_sync is just in sync_context and maybe we should just decide there if syncing is needed or not? Doing an if with a context manager is a bit harder and might justify a flag, but for a function it should be easy for people to just avoid calling it.

Contributor:
@maximsch2 your argument is to keep the flag for the context manager but remove it from this function, correct?
I think that would be fine.

is_distributed_fn: Optional[Callable] = is_distributed_fn,
) -> Dict[str, Tensor]:
"""
Sync function for manually controlling when metric states should be synced across processes.

Args:
dist_sync_fn: Function to be used to perform state synchronization
process_group:
Specify the process group on which synchronization is called.
default: None (which selects the entire world)
should_sync: Whether to apply state synchronization; a no-op when not running in a distributed setting.
is_distributed_fn: Function to determine if we are running inside a distributed setting

Returns:
cache: A dictionary containing the local metric states. The cache will be empty if sync didn't happen.
"""
is_distributed = is_distributed_fn()

if dist_sync_fn is None:
dist_sync_fn = gather_all_tensors

cache = {}

if is_distributed and should_sync:
# cache prior to syncing
cache = {attr: getattr(self, attr) for attr in self._defaults.keys()}

# sync
self._sync_dist(dist_sync_fn, process_group=process_group)

return cache
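
A minimal usage sketch of the new sync method (illustration only; metric stands for any Metric subclass instance and the update argument is a placeholder), assuming an initialized torch.distributed process group:

# Accumulate local state on every rank, then sync by hand.
metric.update(torch.tensor(1.0))
cache = metric.sync()                      # gathers/reduces states across processes; {} if no sync happened
# metric states now reflect all ranks and can be read directly here.
for attr, val in cache.items():            # restore the pre-sync local states
    setattr(metric, attr, val)             # so per-rank accumulation can continue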

@contextmanager
def sync_context(
self,
dist_sync_fn: Optional[Callable] = None,
process_group: Optional[Any] = None,
should_sync: bool = True,
restore_cache: bool = True,
is_distributed_fn: Optional[Callable] = is_distributed_fn,
) -> None:
"""
Context manager to synchronize the states between processes when running in a distributed setting
and restore the local cache states after yielding.

Args:
dist_sync_fn: Function to be used to perform state synchronization
process_group:
Specify the process group on which synchronization is called.
default: None (which selects the entire world)
should_sync: Whether to apply state synchronization; a no-op when not running in a distributed setting.
restore_cache: Whether to restore the cache state so that the metrics can
continue to be accumulated.
is_distributed_fn: Function to determine if we are running inside a distributed setting
"""
cache = self.sync(
dist_sync_fn=dist_sync_fn,
process_group=process_group,
should_sync=should_sync,
is_distributed_fn=is_distributed_fn
)

yield

if cache and restore_cache:
# if we synced, restore to cache so that we can continue to accumulate un-synced state
for attr, val in cache.items():
setattr(self, attr, val)

Comment on lines +303 to +307

Contributor:
What's the use case for this? If we sync, we should assume all metrics are operating off the synced state and not accumulate local changes, right?

Contributor:
This was here already, just moved.
Added in dd1e744
cc: @SkafteNicki
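
For reference, a hedged sketch of how sync_context is meant to be used from user code (illustration only; metric is assumed to be an instance of a Metric subclass with a state named x, as in the DummyCatMetric test above):

# Read globally synced state inside the context, then keep accumulating per-rank state after it exits.
with metric.sync_context(should_sync=True, restore_cache=True):
    synced_total = metric.x                # states here are reduced/gathered across ranks
metric.update(torch.tensor(2.0))           # local, un-synced accumulation resumes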

def _wrap_compute(self, compute):

@functools.wraps(compute)
@@ -236,26 +324,10 @@ def wrapped_func(*args, **kwargs):
if self._computed is not None:
return self._computed

dist_sync_fn = self.dist_sync_fn
if dist_sync_fn is None and torch.distributed.is_available() and torch.distributed.is_initialized():
# User provided a bool, so we assume DDP if available
dist_sync_fn = gather_all_tensors

synced = False
cache = []
if self._to_sync and dist_sync_fn is not None:
# cache prior to syncing
cache = {attr: getattr(self, attr) for attr in self._defaults}

# sync
self._sync_dist(dist_sync_fn)
synced = True

self._computed = compute(*args, **kwargs)
if synced:
# if we synced, restore to cache so that we can continue to accumulate un-synced state
for attr, val in cache.items():
setattr(self, attr, val)
with self.sync_context(
dist_sync_fn=self.dist_sync_fn, should_sync=self._to_sync, restore_cache=self._restore_cache
):
self._computed = compute(*args, **kwargs)

return self._computed

@@ -299,11 +371,12 @@ def clone(self):

def __getstate__(self):
# ignore update and compute functions for pickling
return {k: v for k, v in self.__dict__.items() if k not in ["update", "compute"]}
return {k: v for k, v in self.__dict__.items() if k not in ["update", "compute", "_update_signature"]}

def __setstate__(self, state):
# manually restore update and compute functions for pickling
self.__dict__.update(state)
self._update_signature = inspect.signature(self.update)
self.update = self._wrap_update(self.update)
self.compute = self._wrap_compute(self.compute)

@@ -341,16 +414,24 @@ def persistent(self, mode: bool = False):
def state_dict(self, destination=None, prefix="", keep_vars=False):
destination = super().state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars)
# Register metric states to be part of the state_dict
for key in self._defaults:
if self._persistent[key]:
current_val = getattr(self, key)
if not keep_vars:
if torch.is_tensor(current_val):
current_val = current_val.detach()
elif isinstance(current_val, list):
current_val = [cur_v.detach() if torch.is_tensor(cur_v) else cur_v for cur_v in current_val]
destination[prefix + key] = current_val
return destination
with self.sync_context(dist_sync_fn=self.dist_sync_fn):
for key in self._defaults:
if self._persistent[key]:
current_val = getattr(self, key)
if not keep_vars:
if isinstance(current_val, torch.Tensor):
current_val = current_val.detach()
elif isinstance(current_val, list):
current_val = [
cur_v.detach() if isinstance(cur_v, torch.Tensor) else cur_v for cur_v in current_val
]
destination[prefix + key] = deepcopy(current_val)
return destination

def _on_load_from_state_dict(self, state_dict, key, name) -> None:
value = state_dict.pop(name)
if os.getenv("GLOBAL_RANK", "0") == "0":
setattr(self, key, value)

def _load_from_state_dict(
self,
@@ -363,10 +444,13 @@ def _load_from_state_dict(
error_msgs: List[str],
) -> None:
""" Loads metric states from state_dict """

# only global rank 0 should be reloading the values present in the ``state_dict``
# as the state contains synced values across all process groups
for key in self._defaults:
name = prefix + key
if name in state_dict:
setattr(self, key, state_dict.pop(name))
self._on_load_from_state_dict(state_dict, key, name)
super()._load_from_state_dict(
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs
)
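
A hedged sketch of the resulting save/restore behaviour, reusing the DummyCatMetric from the test above (illustration only; assumes its states were marked persistent and worldsize = 2):

# state_dict() now syncs states before writing them, so it holds values reduced across all ranks.
state = metric.state_dict()                # e.g. {"x": tensor(20), "c": tensor(10)}
restored = DummyCatMetric()
restored.load_state_dict(state)            # values applied only where GLOBAL_RANK == "0"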