From 9658d27e0ae02dcc28f7d9c07b461a209824f9eb Mon Sep 17 00:00:00 2001
From: Franck Mamalet <49721198+franckma31@users.noreply.github.com>
Date: Thu, 7 Nov 2024 14:42:48 +0100
Subject: [PATCH] update docstring in losses

---
 deel/torchlip/functional.py   |  38 +++----
 deel/torchlip/modules/loss.py | 191 +++++++++++++++++++++++++---------
 2 files changed, 160 insertions(+), 69 deletions(-)

diff --git a/deel/torchlip/functional.py b/deel/torchlip/functional.py
index f755480..9b120ea 100644
--- a/deel/torchlip/functional.py
+++ b/deel/torchlip/functional.py
@@ -287,9 +287,7 @@ def apply_reduction(val: torch.Tensor, reduction: str) -> torch.Tensor:
     return red(val)
 
 
-def kr_loss(
-    input: torch.Tensor, target: torch.Tensor, multi_gpu=False, true_values=None
-) -> torch.Tensor:
+def kr_loss(input: torch.Tensor, target: torch.Tensor, multi_gpu=False) -> torch.Tensor:
     r"""
     Loss to estimate the Wasserstein-1 distance using Kantorovich-Rubinstein
     duality, as per
 
     .. math::
         W_1(\mu, \nu) = \sup_{f\in{}Lip_1(\Omega)}
         \underset{\mathbf{x}\sim{}\mu}{\mathbb{E}}[f(\mathbf{x})]
         - \underset{\mathbf{x}\sim{}\nu}{\mathbb{E}}[f(\mathbf{x})]
 
     where :math:`\mu` and :math:`\nu` are the distributions corresponding to the
-    two possible labels as specific by ``true_values``.
+    two possible labels as specified by their sign.
+
+    `target` accepts label values in (0, 1) or (-1, 1), or labels pre-processed
+    with the `deel.torchlip.functional.process_labels_for_multi_gpu()` function.
+
+    Using a multi-GPU/TPU strategy requires setting `multi_gpu` to True and
+    pre-processing the labels `target` with the
+    `deel.torchlip.functional.process_labels_for_multi_gpu()` function.
 
     Args:
         input: Tensor of arbitrary shape.
         target: Tensor of the same shape as input.
-        true_values: depreciated (target>0 is used)
+        multi_gpu (bool): set to True when running on multi-GPU/TPU
 
     Returns:
         The Wasserstein-1 loss between ``input`` and ``target``.
@@ -316,9 +321,7 @@
     return kr_loss_standard(input, target)
 
 
-def kr_loss_standard(
-    input: torch.Tensor, target: torch.Tensor, true_values=None
-) -> torch.Tensor:
+def kr_loss_standard(input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
     r"""
     Loss to estimate the Wasserstein-1 distance using Kantorovich-Rubinstein
     duality, as per
 
     .. math::
         W_1(\mu, \nu) = \sup_{f\in{}Lip_1(\Omega)}
         \underset{\mathbf{x}\sim{}\mu}{\mathbb{E}}[f(\mathbf{x})]
         - \underset{\mathbf{x}\sim{}\nu}{\mathbb{E}}[f(\mathbf{x})]
 
     where :math:`\mu` and :math:`\nu` are the distributions corresponding to the
-    two possible labels as specific by ``true_values``.
+    two possible labels as specified by their sign.
+
+    `target` accepts label values in (0, 1) or (-1, 1).
 
     Args:
         input: Tensor of arbitrary shape.
         target: Tensor of the same shape as input.
-        true_values: depreciated (target>0 is used)
 
     Returns:
         The Wasserstein-1 loss between ``input`` and ``target``.
@@ -384,7 +388,6 @@ def neg_kr_loss(
     input: torch.Tensor,
     target: torch.Tensor,
     multi_gpu=False,
-    true_values=None,
 ) -> torch.Tensor:
     """
     Loss to estimate the negative Wasserstein-1 distance using Kantorovich-Rubinstein
     duality.
 
     Args:
         input: Tensor of arbitrary shape.
         target: Tensor of the same shape as input.
-        true_values: depreciated (target>0 is used)
+        multi_gpu (bool): set to True when running on multi-GPU/TPU
 
     Returns:
         The negative Wasserstein-1 loss between ``input`` and ``target``.
@@ -437,7 +440,6 @@ def hkr_loss(
     alpha: float,
     min_margin: float = 1.0,
     multi_gpu=False,
-    true_values=None,
 ) -> torch.Tensor:
     """
     Loss to estimate the Wasserstein-1 distance with a hinge regularization using
     Kantorovich-Rubinstein duality.
 
     Args:
         input: Tensor of arbitrary shape.
         target: Tensor of the same shape as input.
-        alpha: Regularization factor between the hinge and the KR loss.
+        alpha: Regularization factor ([0,1]) between the hinge and the KR loss.
         min_margin: Minimal margin for the hinge loss.
-        true_values: tuple containing the two label for each predicted class.
+        multi_gpu (bool): set to True when running on multi-GPU/TPU
 
     Returns:
         The regularized Wasserstein-1 loss.
@@ -478,7 +480,7 @@ def hinge_multiclass_loss(
     """
     Loss to estimate the Hinge loss in a multiclass setup. It computes the
     elementwise hinge term. Note that this formulation differs from the
-    one commonly found in tensorflow/pytorch (with marximise the difference
+    one commonly found in tensorflow/pytorch (which maximises the difference
     between the two largest logits). This formulation is consistent with the
     binary classification loss used in a multiclass fashion.
@@ -515,9 +517,9 @@ def hkr_multiclass_loss(
     Args:
         input: Tensor of arbitrary shape.
         target: Tensor of the same shape as input.
-        alpha: Regularization factor between the hinge and the KR loss.
+        alpha: Regularization factor ([0,1]) between the hinge and the KR loss.
         min_margin: Minimal margin for the hinge loss.
-        true_values: tuple containing the two label for each predicted class.
+        multi_gpu (bool): set to True when running on multi-GPU/TPU
 
     Returns:
         The regularized Wasserstein-1 loss.

diff --git a/deel/torchlip/modules/loss.py b/deel/torchlip/modules/loss.py
index 330796b..a23a3c7 100644
--- a/deel/torchlip/modules/loss.py
+++ b/deel/torchlip/modules/loss.py
@@ -33,16 +33,35 @@
 class KRLoss(torch.nn.Module):
-    """
-    Loss that estimates the Wasserstein-1 distance using the Kantorovich-Rubinstein
-    duality.
-    """
-
     def __init__(self, multi_gpu=False, reduction: str = "mean", true_values=None):
-        """
+        r"""
+        Loss that estimates the Wasserstein-1 distance using the
+        Kantorovich-Rubinstein duality, formulated as follows:
+
+        $$
+        W_1(\mu, \nu) =
+        \sup_{f \in Lip_1(\Omega)} \underset{\textbf{x} \sim \mu}{\mathbb{E}}
+        \left[f(\textbf{x})\right] -
+        \underset{\textbf{x} \sim \nu}{\mathbb{E}} \left[f(\textbf{x})\right]
+        $$
+
+        where :math:`\mu` and :math:`\nu` stand for the two distributions: the
+        distribution of samples with label 1, and the rest.
+
+        Note that `input` and `target` must be of rank 2: (batch_size, 1) or
+        (batch_size, C) for multilabel classification (with C categories).
+        `target` accepts label values in (0, 1) or (-1, 1), or labels
+        pre-processed with the
+        `deel.torchlip.functional.process_labels_for_multi_gpu()` function.
+
+        Using a multi-GPU/TPU strategy requires setting `multi_gpu` to True and
+        pre-processing the labels `target` with the
+        `deel.torchlip.functional.process_labels_for_multi_gpu()` function.
+
         Args:
             multi_gpu (bool): set to True when running on multi-GPU/TPU
-            reduction: passed to tf.keras.Loss constructor
+            reduction: type of reduction applied to the output. Possible values are
+                'none' | 'mean' | 'sum' | 'auto'; default is 'mean' ('auto' is 'mean')
             true_values: deprecated.
         """
         super().__init__()
@@ -58,16 +77,16 @@ def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
 
 
 class NegKRLoss(torch.nn.Module):
-    """
-    Loss that estimates the negative of the Wasserstein-1 distance using
-    the Kantorovich-Rubinstein duality.
-    """
-
     def __init__(self, multi_gpu=False, reduction: str = "mean", true_values=None):
         """
+        Loss that estimates the negative of the Wasserstein-1 distance using
+        the Kantorovich-Rubinstein duality. See `KRLoss` for more details.
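+
+        Example (illustrative sketch: the import path follows this repository's
+        layout, and the values are synthetic):
+
+        >>> import torch
+        >>> from deel.torchlip.modules.loss import NegKRLoss
+        >>> loss_fn = NegKRLoss()
+        >>> input = torch.tensor([[0.6], [-0.4], [0.8]])  # logits, rank 2
+        >>> target = torch.tensor([[1.0], [0.0], [1.0]])  # labels in (0, 1)
+        >>> loss = loss_fn(input, target)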
+
         Args:
             multi_gpu (bool): set to True when running on multi-GPU/TPU
-            reduction: passed to tf.keras.Loss constructor
+            reduction: type of reduction applied to the output. Possible values are
+                'none' | 'mean' | 'sum' | 'auto'; default is 'mean' ('auto' is 'mean')
             true_values: deprecated.
         """
         super().__init__()
@@ -83,14 +102,14 @@ def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
 
 
 class HingeMarginLoss(torch.nn.Module):
-    """
-    Hinge margin loss.
-    """
-
     def __init__(self, min_margin: float = 1.0, reduction: str = "mean"):
         """
+        Hinge margin loss.
+
         Args:
             min_margin: The minimal margin to enforce.
+            reduction: type of reduction applied to the output. Possible values are
+                'none' | 'mean' | 'sum' | 'auto'; default is 'mean' ('auto' is 'mean')
         """
         super().__init__()
         self.reduction = reduction
@@ -102,11 +121,6 @@ def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
 
 
 class HKRLoss(torch.nn.Module):
-    """
-    Loss that estimates the Wasserstein-1 distance using the Kantorovich-Rubinstein
-    duality with a hinge regularization.
-    """
-
     def __init__(
         self,
         alpha: float,
@@ -116,11 +130,32 @@ def __init__(
         min_margin: float = 1.0,
         multi_gpu=False,
         reduction: str = "mean",
         true_values=None,
     ):
         """
+        Loss that estimates the Wasserstein-1 distance using the Kantorovich-Rubinstein
+        duality with a hinge regularization.
+
+        [1] M. Serrurier, F. Mamalet, et al. «Achieving robustness in classification
+        using optimal transport with hinge regularization», 2021.
+
+        Note that `input` and `target` must be of rank 2: (batch_size, 1) or
+        (batch_size, C) for multilabel classification (with C categories).
+        `target` accepts label values in (0, 1) or (-1, 1), or labels
+        pre-processed with the
+        `deel.torchlip.functional.process_labels_for_multi_gpu()` function.
+
+        Using a multi-GPU/TPU strategy requires setting `multi_gpu` to True and
+        pre-processing the labels `target` with the
+        `deel.torchlip.functional.process_labels_for_multi_gpu()` function.
+
+        The regularization factor `alpha` is a value between 0 and 1. It controls
+        the trade-off between the hinge and the KR loss. When `alpha` is 0, the
+        loss is equivalent to the KR loss, and when `alpha` is 1, the loss is
+        equivalent to the hinge loss.
+
         Args:
             alpha: Regularization factor ([0,1]) between the hinge and the KR loss.
             min_margin: Minimal margin for the hinge loss.
             multi_gpu (bool): set to True when running on multi-GPU/TPU
-            reduction: passed to tf.keras.Loss constructor
+            reduction: type of reduction applied to the output. Possible values are
+                'none' | 'mean' | 'sum' | 'auto'; default is 'mean' ('auto' is 'mean')
             true_values: deprecated.
         """
         super().__init__()
@@ -147,11 +182,24 @@ def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
 
 
 class KRMulticlassLoss(torch.nn.Module):
-    """
-    The Wasserstein multiclass loss between ``input`` and ``target``.
-    """
-
     def __init__(self, multi_gpu=False, reduction: str = "mean"):
+        r"""
+        Loss to estimate the average Wasserstein-1 distance using the
+        Kantorovich-Rubinstein duality over the outputs. In this multiclass
+        setup, the KR term is computed for each class and then averaged.
+
+        Note that `target` should be one-hot encoded or pre-processed with the
+        `deel.torchlip.functional.process_labels_for_multi_gpu()` function.
+
+        Using a multi-GPU/TPU strategy requires setting `multi_gpu` to True and
+        pre-processing the labels `target` with the
+        `deel.torchlip.functional.process_labels_for_multi_gpu()` function.
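+
+        Example (illustrative sketch: one-hot targets of shape (batch_size, C),
+        as described above):
+
+        >>> import torch
+        >>> from deel.torchlip.modules.loss import KRMulticlassLoss
+        >>> loss_fn = KRMulticlassLoss()
+        >>> input = torch.randn(8, 3)  # logits for C = 3 classes
+        >>> target = torch.nn.functional.one_hot(
+        ...     torch.randint(3, (8,)), num_classes=3
+        ... ).float()
+        >>> loss = loss_fn(input, target)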
+
+        Args:
+            multi_gpu (bool): set to True when running on multi-GPU/TPU
+            reduction: type of reduction applied to the output. Possible values are
+                'none' | 'mean' | 'sum' | 'auto'; default is 'mean' ('auto' is 'mean')
+        """
         super().__init__()
         self.reduction = reduction
         self.multi_gpu = multi_gpu
@@ -162,16 +210,18 @@ def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
 
 
 class HingeMulticlassLoss(torch.nn.Module):
-    """
-    Loss to estimate the Hinge loss in a multiclass setup. It computes the
-    element-wise hinge term. This class use pytorch implementation:
-    torch.nn.functional.hinge_embedding_loss
-    """
-
     def __init__(self, min_margin: float = 1.0, reduction: str = "mean"):
-        """
+        r"""
+        Loss to estimate the Hinge loss in a multiclass setup. It computes the
+        element-wise hinge term. Note that this formulation differs from the
+        one commonly found in tensorflow/pytorch (which maximises the difference
+        between the two largest logits). This formulation is consistent with the
+        binary classification loss used in a multiclass fashion.
+
         Args:
             min_margin: The minimal margin to enforce.
+            reduction: type of reduction applied to the output. Possible values are
+                'none' | 'mean' | 'sum' | 'auto'; default is 'mean' ('auto' is 'mean')
         """
         super().__init__()
         self.min_margin = min_margin
@@ -183,11 +233,6 @@ def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
 
 
 class HKRMulticlassLoss(torch.nn.Module):
-    """
-    Loss that estimates the Wasserstein-1 distance using the Kantorovich-Rubinstein
-    duality with a hinge regularization.
-    """
-
     def __init__(
         self,
         alpha: float,
         min_margin: float = 1.0,
         multi_gpu=False,
@@ -196,9 +241,30 @@ def __init__(
         reduction: str = "mean",
     ):
         """
+        Loss that estimates the Wasserstein-1 distance using the Kantorovich-Rubinstein
+        duality with a hinge regularization.
+
+        [1] M. Serrurier, F. Mamalet, et al. «Achieving robustness in classification
+        using optimal transport with hinge regularization», 2021.
+
+        Note that `target` should be one-hot encoded or pre-processed with the
+        `deel.torchlip.functional.process_labels_for_multi_gpu()` function.
+
+        Using a multi-GPU/TPU strategy requires setting `multi_gpu` to True and
+        pre-processing the labels `target` with the
+        `deel.torchlip.functional.process_labels_for_multi_gpu()` function.
+
+        The regularization factor `alpha` is a value between 0 and 1. It controls
+        the trade-off between the hinge and the KR loss. When `alpha` is 0, the
+        loss is equivalent to the KR loss, and when `alpha` is 1, the loss is
+        equivalent to the hinge loss.
+
         Args:
             alpha: Regularization factor ([0,1]) between the hinge and the KR loss.
             min_margin: Minimal margin for the hinge loss.
+            multi_gpu (bool): set to True when running on multi-GPU/TPU
+            reduction: type of reduction applied to the output. Possible values are
+                'none' | 'mean' | 'sum' | 'auto'; default is 'mean' ('auto' is 'mean')
         """
         super().__init__()
         if (alpha >= 0) and (alpha <= 1):
@@ -233,21 +299,24 @@ def __init__(
         The multiclass version of HKR with softmax. This is done by computing
         the HKR term over each class and averaging the results.
 
-        Note that `y_true` could be either one-hot encoded, +/-1 values.
+        [2] M. Serrurier, F. Mamalet, T. Fel et al. "On the explainable properties
+        of 1-Lipschitz Neural Networks: An Optimal Transport Perspective", 2024.
+
+        Note that `target` should be one-hot encoded or +/-1 values.
+
+        The regularization factor `alpha` is a value between 0 and 1. It controls
+        the trade-off between the hinge and the KR loss.
+        When `alpha` is 0, the loss is equivalent to the KR loss, and when
+        `alpha` is 1, the loss is equivalent to the hinge loss.
 
         Args:
             alpha (float): regularization factor (0 <= alpha <= 1),
-                0 for KR only, 1 for hinge only
             min_margin (float): margin to enforce.
+            alpha_mean (float): geometric mean factor
             temperature (float): factor for softmax temperature
                 (higher value increases the weight of the highest non y_true logits)
-            alpha_mean (float): geometric mean factor
-            one_hot_ytrue (bool): set to True when y_true are one hot encoded (0 or 1),
-                and False when y_true already signed bases (for instance +/-1)
-            reduction: passed to tf.keras.Loss constructor
-            name (str): passed to tf.keras.Loss constructor
-
+            reduction: type of reduction applied to the output. Possible values are
+                'none' | 'mean' | 'sum' | 'auto'; default is 'mean' ('auto' is 'mean')
         """
         if (alpha >= 0) and (alpha <= 1):
             self.alpha = torch.tensor(alpha, dtype=torch.float32)
@@ -368,7 +437,7 @@ def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
 
 
 class TauCrossEntropyLoss(CrossEntropyLoss):
     def __init__(
         self,
-        tau,
+        tau: float,
         weight: Optional[Tensor] = None,
         size_average=None,
         ignore_index: int = -100,
         reduce=None,
         reduction: str = "mean",
         label_smoothing: float = 0.0,
     ) -> None:
+        """
+        The loss adds a temperature (tau) factor to the CrossEntropyLoss:
+        CrossEntropyLoss(tau * input, target).
+        See `CrossEntropyLoss` for more details on the other arguments.
+
+        Args:
+            tau (float): factor for temperature
+        """
+
         super().__init__(
             weight=weight,
             size_average=size_average,
@@ -396,13 +474,21 @@ def forward(self, input: Tensor, target: Tensor) -> Tensor:
 
 
 class TauBCEWithLogitsLoss(BCEWithLogitsLoss):
     def __init__(
         self,
-        tau,
+        tau: float,
         weight: Optional[Tensor] = None,
         size_average=None,
         reduce=None,
         reduction: str = "mean",
         pos_weight=None,
     ) -> None:
+        """
+        The loss adds a temperature (tau) factor to the BCEWithLogitsLoss:
+        BCEWithLogitsLoss(tau * input, target).
+        See `BCEWithLogitsLoss` for more details on the other arguments.
+
+        Args:
+            tau (float): factor for temperature
+        """
         super().__init__(
             weight=weight,
             size_average=size_average,
@@ -426,12 +512,15 @@ def __init__(
         """
         This implementation is slightly different from the pytorch MultiMarginLoss.
 
-        `y_true` and `y_pred` must be of shape (batch_size, # classes).
-        Note that `y_true` should be one-hot encoded
+        `target` and `input` must be of shape (batch_size, # classes).
+        Note that `target` should be one-hot encoded or +/-1 values.
+        ReLU(min_margin - (input[target>0] - max(input[target<=0])))
+        is computed element-wise and averaged over the batch.
 
         Args:
             min_margin (float): margin parameter.
-            reduction: reduction of the loss, passed to original loss.
+            reduction: type of reduction applied to the output. Possible values are
+                'none' | 'mean' | 'sum' | 'auto'; default is 'mean' ('auto' is 'mean')
         """
         super().__init__()
         self.min_margin = min_margin
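
Usage sketch for the binary HKRLoss documented above (a minimal sketch, not part
of the patch: the import path follows this repository's layout, and the data is
synthetic):

    import torch
    from deel.torchlip.modules.loss import HKRLoss

    # Synthetic binary task: logits of shape (batch_size, 1), labels in (0, 1).
    input = torch.randn(16, 1)
    target = torch.randint(0, 2, (16, 1)).float()

    # alpha in [0, 1]: 0 gives pure KR, 1 gives pure hinge (per the docstring).
    loss_fn = HKRLoss(alpha=0.9, min_margin=1.0)
    loss = loss_fn(input, target)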
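
The multi-GPU path referenced throughout these docstrings can be sketched as
follows (assumption: `process_labels_for_multi_gpu` takes the raw label tensor
and returns the pre-processed labels, as its pairing with `multi_gpu=True` in
the docstrings suggests):

    import torch
    from deel.torchlip import functional as F

    input = torch.randn(16, 1)
    target = torch.randint(0, 2, (16, 1)).float()

    # Pre-process the labels once, then pass multi_gpu=True to the loss.
    processed = F.process_labels_for_multi_gpu(target)
    loss = F.kr_loss(input, processed, multi_gpu=True)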
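
Since the docstrings define the temperature losses as
CrossEntropyLoss(tau * input, target), they can be sanity-checked against their
torch counterparts (sketch with synthetic data; the equality holds only if the
forward follows that definition exactly):

    import torch
    from torch.nn import CrossEntropyLoss
    from deel.torchlip.modules.loss import TauCrossEntropyLoss

    logits = torch.randn(8, 3)
    labels = torch.randint(0, 3, (8,))
    tau = 10.0

    # Scaling the logits by tau outside the loss should give the same value.
    a = TauCrossEntropyLoss(tau)(logits, labels)
    b = CrossEntropyLoss()(tau * logits, labels)
    print(a.item(), b.item())  # expected to match under the stated definition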