Lightning-AI · SkafteNicki · Mar 27, 2021 · Mar 17, 2021 · Mar 17, 2021 · Mar 17, 2021
@@ -26,6 +26,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Added multilabel support to `ROC` metric ([#114](https://github.com/PyTorchLightning/metrics/pull/114))
 
+
+- Added `BootStrapper` to easily calculate confidence intervals for metrics ([#101](https://github.com/PyTorchLightning/metrics/pull/101))
+
+
 ### Changed
 
 - Changed `ExplainedVariance` from storing all preds/targets to tracking 5 statistics ([#68](https://github.com/PyTorchLightning/metrics/pull/68))

@@ -269,4 +269,14 @@ R2Score
 ~~~~~~~
 
 .. autoclass:: torchmetrics.R2Score
-    :noindex:
+    :noindex:
+
+********
+Wrappers
+********
+
+Modular wrapper metrics are not metrics in themselves, but instead take a metric and alter the internal logic
+of the base metric.
+
+.. autoclass:: torchmetrics.BootStrapper
+    :noindex:
diff --git a/tests/classification/test_matthews_corrcoef.py b/tests/classification/test_matthews_corrcoef.py
@@ -98,6 +98,7 @@ def _sk_matthews_corrcoef_multidim_multiclass(preds, target):
      (_input_mdmc.preds, _input_mdmc.target, _sk_matthews_corrcoef_multidim_multiclass, NUM_CLASSES)]
 )
 class TestMatthewsCorrCoef(MetricTester):
+
     @pytest.mark.parametrize("ddp", [True, False])
     @pytest.mark.parametrize("dist_sync_on_step", [True, False])
     def test_matthews_corrcoef(self, preds, target, sk_metric, num_classes, ddp, dist_sync_on_step):

diff --git a/tests/classification/test_roc.py b/tests/classification/test_roc.py
@@ -73,39 +73,27 @@ def _sk_roc_multidim_multiclass_prob(preds, target, num_classes=1):
 def _sk_roc_multilabel_prob(preds, target, num_classes=1):
     sk_preds = preds.numpy()
     sk_target = target.numpy()
-    return _sk_roc_curve(
-        y_true=sk_target,
-        probas_pred=sk_preds,
-        num_classes=num_classes,
-        multilabel=True
-    )
+    return _sk_roc_curve(y_true=sk_target, probas_pred=sk_preds, num_classes=num_classes, multilabel=True)
 
 
 def _sk_roc_multilabel_multidim_prob(preds, target, num_classes=1):
     sk_preds = preds.transpose(0, 1).reshape(num_classes, -1).transpose(0, 1).numpy()
     sk_target = target.transpose(0, 1).reshape(num_classes, -1).transpose(0, 1).numpy()
-    return _sk_roc_curve(
-        y_true=sk_target,
-        probas_pred=sk_preds,
-        num_classes=num_classes,
-        multilabel=True
+    return _sk_roc_curve(y_true=sk_target, probas_pred=sk_preds, num_classes=num_classes, multilabel=True)
+
+
+@pytest.mark.parametrize("preds, target, sk_metric, num_classes", [
+    (_input_binary_prob.preds, _input_binary_prob.target, _sk_roc_binary_prob, 1),
+    (_input_mcls_prob.preds, _input_mcls_prob.target, _sk_roc_multiclass_prob, NUM_CLASSES),
+    (_input_mdmc_prob.preds, _input_mdmc_prob.target, _sk_roc_multidim_multiclass_prob, NUM_CLASSES),
+    (_input_multilabel_prob.preds, _input_multilabel_prob.target, _sk_roc_multilabel_prob, NUM_CLASSES),
+    (
+        _input_multilabel_multidim_prob.preds,
+        _input_multilabel_multidim_prob.target,
+        _sk_roc_multilabel_multidim_prob,
+        NUM_CLASSES
     )
-
-
-@pytest.mark.parametrize(
-    "preds, target, sk_metric, num_classes", [
-        (_input_binary_prob.preds, _input_binary_prob.target, _sk_roc_binary_prob, 1),
-        (_input_mcls_prob.preds, _input_mcls_prob.target, _sk_roc_multiclass_prob, NUM_CLASSES),
-        (_input_mdmc_prob.preds, _input_mdmc_prob.target, _sk_roc_multidim_multiclass_prob, NUM_CLASSES),
-        (_input_multilabel_prob.preds, _input_multilabel_prob.target, _sk_roc_multilabel_prob, NUM_CLASSES),
-        (
-            _input_multilabel_multidim_prob.preds,
-            _input_multilabel_multidim_prob.target,
-            _sk_roc_multilabel_multidim_prob,
-            NUM_CLASSES
-        )
-    ]
-)
+])
 class TestROC(MetricTester):
 
     @pytest.mark.parametrize("ddp", [True, False])

diff --git a/tests/helpers/testers.py b/tests/helpers/testers.py
@@ -107,7 +107,11 @@ def _class_test(
     if not metric_args:
         metric_args = {}
     # Instanciate lightning metric
-    metric = metric_class(compute_on_step=True, dist_sync_on_step=dist_sync_on_step, **metric_args)
+    metric = metric_class(
+        compute_on_step=check_dist_sync_on_step or check_batch,
+        dist_sync_on_step=dist_sync_on_step,
+        **metric_args
+    )
 
     # verify metrics work after being loaded from pickled state
     pickled_metric = pickle.dumps(metric)

@@ -0,0 +1,97 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import pytest
+import torch
+from sklearn.metrics import precision_score, recall_score
+
+from torchmetrics.classification import Precision, Recall
+from torchmetrics.utilities import apply_to_collection
+from torchmetrics.utilities.imports import _TORCH_GREATER_EQUAL_1_7
+from torchmetrics.wrappers.bootstrapping import BootStrapper, _bootstrap_sampler
+
+# Random integer class labels in [0, 10) with shape (10, 32); the bootstrap test below
+# iterates over the first dimension, i.e. 10 update steps of 32 predictions/targets each.
+_preds = torch.randint(10, (10, 32))
+_target = torch.randint(10, (10, 32))
+
+
+class TestBootStrapper(BootStrapper):
+    """Bootstrapper subclass that records the resampled inputs of every update.
+
+    For testing purposes we override ``update`` so that the exact resampled tensors handed
+    to each bootstrap copy of the metric are kept in ``self.out`` and can be replayed
+    against a reference implementation.
+    """
+
+    # The ``Test`` prefix matches pytest's default class collection pattern. This class is a
+    # helper, not a test case, so opt out of collection explicitly instead of relying on
+    # pytest skipping it with a PytestCollectionWarning.
+    __test__ = False
+
+    def update(self, *args, **kwargs):
+        """Resample the inputs once per bootstrap, update that bootstrap's metric and record the draw.
+
+        After the call, ``self.out[idx]`` holds the resampled positional arguments that were
+        passed to ``self.metrics[idx]``. Resampled keyword arguments are forwarded to the
+        metric but not recorded.
+        """
+        self.out = []
+        for idx in range(self.num_bootstraps):
+            new_args = apply_to_collection(args, torch.Tensor, _bootstrap_sampler, generator=self.generator)
+            new_kwargs = apply_to_collection(kwargs, torch.Tensor, _bootstrap_sampler, generator=self.generator)
+            self.metrics[idx].update(*new_args, **new_kwargs)
+            self.out.append(new_args)
+
+
+def test_bootstrap_sampler():
+    """Make sure that the bootstrap sampler only returns existing rows and really resamples."""
+    # Use enough rows that a draw without a single repeated row is practically impossible.
+    # With only 5 rows, uniform resampling with replacement (presumably what the sampler
+    # does - TODO confirm) yields 5 distinct rows in 5!/5**5 ~ 3.8% of runs, which would make
+    # the "sampled twice" assertion below fail spuriously.
+    num_samples = 50
+    old_samples = torch.randn(num_samples, 2)
+
+    new_samples = _bootstrap_sampler(old_samples)
+
+    # make sure that the new samples are only made up of old samples; compare complete rows,
+    # since ``row in tensor`` is already satisfied when a single coordinate matches
+    for new_row in new_samples:
+        assert (new_row == old_samples).all(dim=1).any(), "sampler returned a row that is not in the input"
+
+    # make sure some samples are also sampled twice (or more)
+    found_one = any((old_row == new_samples).all(dim=1).sum() >= 2 for old_row in old_samples)
+    assert found_one, "resampling did not work because no samples were sampled twice"
+
+
+@pytest.mark.parametrize(
+    "metric, sk_metric", [[Precision(average='micro'), precision_score], [Recall(average='micro'), recall_score]]
+)
+def test_bootstrap(metric, sk_metric):
+    """Test that the different bootstraps get updated as expected and that the compute method works.
+
+    The resampled inputs recorded by ``TestBootStrapper`` are accumulated per bootstrap and
+    scored with the sklearn reference; the wrapper's mean / std / quantile / raw outputs must
+    then match the statistics of those reference scores.
+    """
+    if _TORCH_GREATER_EQUAL_1_7:
+        bootstrapper = TestBootStrapper(metric, mean=True, std=True, quantile=torch.tensor([0.05, 0.95]), raw=True)
+    else:
+        bootstrapper = TestBootStrapper(metric, mean=True, std=True, raw=True)
+
+    # one bucket per bootstrap copy, sized from the wrapper rather than a hard-coded count
+    num_bootstraps = bootstrapper.num_bootstraps
+    collected_preds = [[] for _ in range(num_bootstraps)]
+    collected_target = [[] for _ in range(num_bootstraps)]
+    for p, t in zip(_preds, _target):
+        bootstrapper.update(p, t)
+
+        # ``out[i]`` is the (preds, target) pair that bootstrap ``i`` was updated with
+        for i, o in enumerate(bootstrapper.out):
+            collected_preds[i].append(o[0])
+            collected_target[i].append(o[1])
+
+    collected_preds = [torch.cat(cp) for cp in collected_preds]
+    collected_target = [torch.cat(ct) for ct in collected_target]
+
+    sk_scores = [sk_metric(ct, cp, average='micro') for ct, cp in zip(collected_target, collected_preds)]
+
+    output = bootstrapper.compute()
+    # quantile is only available for pytorch v1.7 and later
+    if _TORCH_GREATER_EQUAL_1_7:
+        pl_mean, pl_std, pl_quantile, pl_raw = output
+        assert np.allclose(pl_quantile[0], np.quantile(sk_scores, 0.05))
+        assert np.allclose(pl_quantile[1], np.quantile(sk_scores, 0.95))
+    else:
+        pl_mean, pl_std, pl_raw = output
+
+    assert np.allclose(pl_mean, np.mean(sk_scores))
+    # ddof=1: the wrapper's std is compared against the sample standard deviation
+    assert np.allclose(pl_std, np.std(sk_scores, ddof=1))
+    assert np.allclose(pl_raw, sk_scores)
@@ -49,3 +49,4 @@
     R2Score,
 )
 from torchmetrics.retrieval import RetrievalMAP  # noqa: F401 E402
+from torchmetrics.wrappers import BootStrapper  # noqa: F401 E402
diff --git a/torchmetrics/classification/matthews_corrcoef.py b/torchmetrics/classification/matthews_corrcoef.py
@@ -75,6 +75,7 @@ class MatthewsCorrcoef(Metric):
         tensor(0.5774)
 
     """
+
     def __init__(
         self,
         num_classes: int,

diff --git a/torchmetrics/classification/roc.py b/torchmetrics/classification/roc.py
@@ -110,6 +110,7 @@ class ROC(Metric):
          tensor([1.1837, 0.1837, 0.1338, 0.1183, 0.1138])]
 
     """
+
     def __init__(
         self,
         num_classes: Optional[int] = None,

diff --git a/torchmetrics/functional/classification/matthews_corrcoef.py b/torchmetrics/functional/classification/matthews_corrcoef.py
@@ -24,15 +24,10 @@ def _matthews_corrcoef_compute(confmat: Tensor) -> Tensor:
     pk = confmat.sum(dim=1).float()
     c = torch.trace(confmat).float()
     s = confmat.sum().float()
-    return (c * s - sum(tk * pk)) / (torch.sqrt(s ** 2 - sum(pk * pk)) * torch.sqrt(s ** 2 - sum(tk * tk)))
+    return (c * s - sum(tk * pk)) / (torch.sqrt(s**2 - sum(pk * pk)) * torch.sqrt(s**2 - sum(tk * tk)))
 
 
-def matthews_corrcoef(
-        preds: Tensor,
-        target: Tensor,
-        num_classes: int,
-        threshold: float = 0.5
-) -> Tensor:
+def matthews_corrcoef(preds: Tensor, target: Tensor, num_classes: int, threshold: float = 0.5) -> Tensor:
     r"""
     Calculates `Matthews correlation coefficient
     <https://en.wikipedia.org/wiki/Matthews_correlation_coefficient>`_ that measures

@@ -1,7 +1,47 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Import utilities"""
+import importlib
+import operator
 from distutils.version import LooseVersion
 
-import torch
+from pkg_resources import DistributionNotFound
 
-_TORCH_LOWER_1_4 = LooseVersion(torch.__version__) < LooseVersion("1.4.0")
-_TORCH_LOWER_1_5 = LooseVersion(torch.__version__) < LooseVersion("1.5.0")
-_TORCH_LOWER_1_6 = LooseVersion(torch.__version__) < LooseVersion("1.6.0")
+
+def _compare_version(package: str, op, version) -> bool:
+    """
+    Check the version of an importable package against a requirement.
+
+    Imports ``package``, wraps its ``__version__`` in ``LooseVersion`` and returns
+    ``op(installed_version, LooseVersion(version))``. Returns ``False`` when the package
+    cannot be imported or when reading its version raises ``AttributeError``.
+
+    >>> _compare_version("torch", operator.ge, "0.1")
+    True
+    """
+    try:
+        module = importlib.import_module(package)
+    except (ModuleNotFoundError, DistributionNotFound):
+        return False
+
+    try:
+        installed = LooseVersion(module.__version__)
+    except AttributeError:
+        return False
+
+    looks_like_real_version = hasattr(installed, "vstring") and hasattr(installed, "version")
+    if looks_like_real_version:
+        return op(installed, LooseVersion(version))
+
+    # the package is mocked by sphinx, so return True to generate all summaries
+    return True
+
+
+# Torch version gates, evaluated once at import time through ``_compare_version``.
+# ``LOWER`` flags use a strict ``<`` comparison, ``GREATER_EQUAL`` flags use ``>=``.
+_TORCH_LOWER_1_4 = _compare_version("torch", operator.lt, "1.4.0")
+_TORCH_LOWER_1_5 = _compare_version("torch", operator.lt, "1.5.0")
+_TORCH_LOWER_1_6 = _compare_version("torch", operator.lt, "1.6.0")
+_TORCH_GREATER_EQUAL_1_6 = _compare_version("torch", operator.ge, "1.6.0")
+_TORCH_GREATER_EQUAL_1_7 = _compare_version("torch", operator.ge, "1.7.0")
@@ -0,0 +1,14 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from torchmetrics.wrappers.bootstrapping import BootStrapper  # noqa: F401
-Original file line number
+Diff line change
@@ Expand Up / @@ -75,6 +75,7 @@ class MatthewsCorrcoef(Metric): @@
             tensor(0.5774)
         """
         def __init__(
             self,
             num_classes: int,
@@ Expand Down @@