Add Trainer(gradient_clip_algorithm='value'|'norm') #6123

Merged
merged 59 commits into master from feat/clip_grad_by_value on Apr 6, 2021

Changes from 1 commit

Commits (59)
09ea112
add changelog
dhkim0225 Feb 22, 2021
c0e8064
add clip by value
dhkim0225 Feb 22, 2021
ca8e6fd
fix bug in training tricks.rst
dhkim0225 Feb 22, 2021
87f12c1
fix bug in trainer.rst
dhkim0225 Feb 22, 2021
8e43b8a
Update trainer.rst
dhkim0225 Feb 22, 2021
2bb9924
Update trainer.rst
dhkim0225 Feb 22, 2021
5b83f0d
Update CHANGELOG.md
dhkim0225 Feb 23, 2021
caafdf2
Update pytorch_lightning/plugins/precision/deepspeed_precision.py
dhkim0225 Feb 23, 2021
ca774b6
Update pytorch_lightning/utilities/enums.py
dhkim0225 Feb 23, 2021
5a741e2
yapf formatting
dhkim0225 Feb 23, 2021
0568e3a
update training tricks
dhkim0225 Feb 23, 2021
1a4e79e
Merge branch 'master' into feat/clip_grad_by_value
dhkim0225 Feb 26, 2021
4a813c1
Merge branch 'master' into feat/clip_grad_by_value
tchaton Feb 26, 2021
2f5cb3e
Merge branch 'master' into feat/clip_grad_by_value
dhkim0225 Mar 2, 2021
f4275a2
update based on comment
dhkim0225 Mar 2, 2021
e92ec69
update based on comment
dhkim0225 Mar 2, 2021
ac701ce
Update pytorch_lightning/trainer/trainer.py
dhkim0225 Mar 2, 2021
bc20fa4
update based on comment
dhkim0225 Mar 2, 2021
b842210
Merge branch 'feat/clip_grad_by_value' of https://github.com/dhkim022…
dhkim0225 Mar 2, 2021
5ec2ebd
pep8
dhkim0225 Mar 2, 2021
d37fbbc
mypy
dhkim0225 Mar 2, 2021
952c778
mypy
dhkim0225 Mar 2, 2021
b8fdbe1
Merge branch 'master' into feat/clip_grad_by_value
dhkim0225 Mar 2, 2021
c4cccf0
Merge branch 'master' into feat/clip_grad_by_value
dhkim0225 Mar 3, 2021
6bd4793
Update docs/source/advanced/training_tricks.rst
dhkim0225 Mar 4, 2021
3aeba85
Update sharded_native_amp.py
dhkim0225 Mar 4, 2021
902a33c
Update test_sharded_parity.py
dhkim0225 Mar 4, 2021
7467616
update test codes
dhkim0225 Mar 4, 2021
5463830
Update test_tpu.py
dhkim0225 Mar 4, 2021
2e933d4
Update pytorch_lightning/trainer/connectors/training_trick_connector.py
dhkim0225 Mar 4, 2021
b1e26e6
Update test_trainer.py
dhkim0225 Mar 4, 2021
cedf5f6
Update enums.py
dhkim0225 Mar 4, 2021
f5bb45d
Update enums.py
dhkim0225 Mar 4, 2021
42fc5f6
Merge branch 'master' into feat/clip_grad_by_value
Borda Mar 4, 2021
e55b90c
Merge branch 'master' into feat/clip_grad_by_value
dhkim0225 Mar 5, 2021
308ce38
Merge branch 'master' into feat/clip_grad_by_value
carmocca Mar 23, 2021
903f2e2
add super-class initialization to precision plugins.
dhkim0225 Mar 25, 2021
28c948a
add clip_grad horovod cpu test
dhkim0225 Mar 25, 2021
177a1c9
add clip_grad horovod cpu test
dhkim0225 Mar 25, 2021
fc23845
use subprocess check_call
dhkim0225 Mar 25, 2021
d99a650
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
dhkim0225 Mar 25, 2021
f80aa8d
change order of horovod tests
dhkim0225 Mar 25, 2021
fb895b6
set max_epochs 2 in horovod test
dhkim0225 Mar 25, 2021
caa0bbf
remove clip_grad_val test from horovod-cpu
dhkim0225 Mar 25, 2021
f1f9015
remove "type: ignore"
dhkim0225 Mar 25, 2021
5dfe5ef
divide clip grad val test in horovod
dhkim0225 Mar 25, 2021
50a6c74
update based on comments
dhkim0225 Mar 25, 2021
c337b12
add super-class initialization to precision plugins.
dhkim0225 Mar 25, 2021
f7a4fda
bugfix
dhkim0225 Mar 25, 2021
48c3dd8
bugfix
dhkim0225 Mar 25, 2021
e7e3b47
revert some changes
dhkim0225 Mar 26, 2021
2997536
revert some changes
dhkim0225 Mar 26, 2021
fb34e84
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
dhkim0225 Mar 26, 2021
8e665ec
Merge branch 'master' into feat/clip_grad_by_value
carmocca Mar 27, 2021
9575774
Update tests/models/test_horovod.py
carmocca Mar 27, 2021
7c16f6a
Merge branch 'master' into feat/clip_grad_by_value
carmocca Mar 29, 2021
fec189a
merge master
dhkim0225 Apr 6, 2021
1e80304
merge master
dhkim0225 Apr 6, 2021
4d5e05f
Delete signature test
carmocca Apr 6, 2021
add clip by value
dhkim0225 committed Feb 22, 2021
commit c0e80642609e68e3408a52109b10dc231850126a
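Before the file-by-file diff of this commit, here is a minimal sketch of the user-facing API the PR adds; the model and dataloader names are hypothetical placeholders, not part of the change:

```python
# Sketch of the new Trainer argument introduced by this PR. `MyModel` and
# `train_loader` are hypothetical placeholders.
import pytorch_lightning as pl

trainer = pl.Trainer(
    gradient_clip_val=0.5,            # existing argument: the clipping threshold
    gradient_clip_algorithm='value',  # new argument: 'norm' (default) or 'value'
)
# trainer.fit(MyModel(), train_loader)
```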
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -8,7 +8,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
## [UnReleased] - 2021-MM-DD

### Added
Added `gradient_clip_algorithm` argument to Trainer for gradient clipping by value ([#6121](https://github.com/PyTorchLightning/pytorch-lightning/pull/6121)).
Added `gradient_clip_algorithm` argument to Trainer for gradient clipping by value ([#6123](https://github.com/PyTorchLightning/pytorch-lightning/pull/6123)).

### Changed

24 changes: 24 additions & 0 deletions benchmarks/test_sharded_parity.py
@@ -115,6 +115,28 @@ def test_ddp_sharded_plugin_correctness_amp_multi_gpu_ddp(tmpdir, args=None):
    )


@pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available")
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
@pytest.mark.skipif(
    not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest"
)
@DDPLauncher.run("--accelerator ddp --gpus 2 --precision 16")
def test_ddp_sharded_plugin_clip_gradients(tmpdir, args=None):
    plugin_parity_test(
        gpus=args.gpus,
        precision=args.precision,
        model_cls=SeedTrainLoaderModel,
        gradient_clip_val=0.001,
    )
    plugin_parity_test(
        gpus=args.gpus,
        precision=args.precision,
        model_cls=SeedTrainLoaderModel,
        gradient_clip_val=0.001,
        gradient_clip_algorithm='value',
    )


@pytest.mark.skip(reason="Current issue with multiple optimizers and FairScale.")
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows")
@@ -245,6 +267,8 @@ def plugin_parity_test(
    gpus: int = 0,
    precision: int = 32,
    max_percent_speed_diff: float = 0.1,
    gradient_clip_val: float = 0,
    gradient_clip_algorithm: str = 'norm',
):
    """
    Ensures that the trained model is identical to the standard DDP implementation.
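Outside of the DDP-sharded harness, the property this new test is after can be sanity-checked with plain PyTorch: after clipping by value, no gradient element may exceed the threshold. A hedged, torch-only sketch:

```python
# Torch-only sketch of the invariant the sharded parity test exercises:
# after clip-by-value, every gradient element is within the threshold.
import torch

clip_val = 0.001
param = torch.nn.Parameter(torch.randn(4, 4))
loss = (param * 100).sum()  # deliberately produce large gradients
loss.backward()

torch.nn.utils.clip_grad_value_([param], clip_value=clip_val)
assert float(param.grad.abs().max()) <= clip_val
```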
8 changes: 6 additions & 2 deletions docs/source/advanced/training_tricks.rst
@@ -26,8 +26,10 @@ The effect is a large effective batch size of size KxN.

Gradient Clipping
-----------------
Gradient clipping may be enabled to avoid exploding gradients. Specifically, this will `clip the gradient
norm <https://pytorch.org/docs/stable/nn.html#torch.nn.utils.clip_grad_norm_>`_ computed over all model parameters together.
Gradient clipping may be enabled to avoid exploding gradients. By default, this will `clip the gradient norm
<https://pytorch.org/docs/stable/nn.html#torch.nn.utils.clip_grad_norm_>`_ computed over all model parameters together.
If the ``gradient_clip_algorithm`` option (default ``'norm'``) is set to ``'value'``, this will
`clip the gradient value <https://pytorch.org/docs/stable/nn.html#torch.nn.utils.clip_grad_value_>`_ for each parameter instead.

.. seealso:: :class:`~pytorch_lightning.trainer.trainer.Trainer`

@@ -39,6 +41,8 @@ norm <https://pytorch.org/docs/stable/nn.html#torch.nn.utils.clip_grad_norm_>`_
    # clip gradients with norm above 0.5
    trainer = Trainer(gradient_clip_val=0.5)

    # clip gradients with value above 0.5
    trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm='value')
----------

Stochastic Weight Averaging
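For readers skimming the docs change, here is a plain-PyTorch sketch (not Lightning code) of what the two settings map to under the hood; the tensors and thresholds are arbitrary:

```python
# Plain-PyTorch sketch of the two algorithms described in the docs above.
# clip_grad_norm_ rescales all gradients together so their total norm is at
# most the threshold; clip_grad_value_ clamps each gradient element.
import torch

params = [torch.nn.Parameter(torch.randn(10)) for _ in range(3)]
for p in params:
    p.grad = torch.randn_like(p) * 10

# gradient_clip_algorithm='norm' (the Trainer default)
torch.nn.utils.clip_grad_norm_(params, max_norm=0.5)

# gradient_clip_algorithm='value'
torch.nn.utils.clip_grad_value_(params, clip_value=0.5)
```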
13 changes: 13 additions & 0 deletions docs/source/common/trainer.rst
@@ -735,6 +735,19 @@ Gradient clipping value
    # default used by the Trainer
    trainer = Trainer(gradient_clip_val=0.0)

gradient_clip_algorithm
^^^^^^^^^^^^^^^^^^^^^^^

Gradient clipping algorithm

- Clip gradients by norm or value.

.. testcode::

    # default used by the Trainer
    trainer = Trainer(gradient_clip_algorithm='norm')


limit_train_batches
^^^^^^^^^^^^^^^^^^^

11 changes: 8 additions & 3 deletions pytorch_lightning/accelerators/accelerator.py
@@ -22,7 +22,7 @@
from pytorch_lightning.plugins.training_type import TrainingTypePlugin
from pytorch_lightning.utilities.apply_func import move_data_to_device
from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available
from pytorch_lightning.utilities.enums import AMPType, LightningEnum
from pytorch_lightning.utilities.enums import AMPType, GradClipAlgorithmType, LightningEnum


class Accelerator(object):
@@ -287,10 +287,15 @@ def optimizer_zero_grad(self, current_epoch: int, batch_idx: int, optimizer: Opt
        model_ref = self.lightning_module
        model_ref.optimizer_zero_grad(current_epoch, batch_idx, optimizer, opt_idx)

    def clip_gradients(self, optimizer: Optimizer, clip_val: Union[int, float]) -> None:
    def clip_gradients(
        self,
        optimizer: Optimizer,
        clip_val: Union[int, float],
        gradient_clip_algorithm: str = GradClipAlgorithmType.NORM,
    ) -> None:
        """clips all the optimizer parameters to the given value"""

        self.precision_plugin.clip_gradients(optimizer, clip_val)
        self.precision_plugin.clip_gradients(optimizer, clip_val, gradient_clip_algorithm)

    def on_train_epoch_end(self, outputs) -> None:
        """Hook to do something on the end of an training epoch
pytorch_lightning/plugins/precision/deepspeed_precision.py
@@ -5,6 +5,7 @@

from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin
from pytorch_lightning.utilities import GradClipAlgorithmType
from pytorch_lightning.utilities.model_helpers import is_overridden
from pytorch_lightning.utilities.warnings import WarningCache

@@ -54,7 +55,13 @@ def backward(

        return closure_loss

    def clip_gradients(self, optimizer: Optimizer, clip_val: Union[int, float], norm_type: float = float(2.0)):
    def clip_gradients(
        self,
        optimizer: Optimizer,
        clip_val: Union[int, float],
        gradient_clip_algorithm: str = GradClipAlgorithmType.NORM,
        norm_type: float = float(2.0),
    ):
        """
        DeepSpeed handles clipping gradients via the training type plugin.
        """
57 changes: 33 additions & 24 deletions pytorch_lightning/plugins/precision/precision_plugin.py
@@ -20,6 +20,7 @@

from pytorch_lightning.core import LightningModule
from pytorch_lightning.plugins.base_plugin import Plugin
from pytorch_lightning.utilities import GradClipAlgorithmType


class PrecisionPlugin(Plugin):
@@ -86,7 +87,13 @@ def pre_optimizer_step(
    def post_optimizer_step(self, optimizer: Optimizer, optimizer_idx: int) -> None:
        """Hook to do something after each optimizer step."""

    def clip_gradients(self, optimizer: Optimizer, clip_val: Union[int, float], norm_type: float = float(2.0)) -> None:
    def clip_gradients(
        self,
        optimizer: Optimizer,
        clip_val: Union[int, float],
        gradient_clip_algorithm: str = GradClipAlgorithmType.NORM,
        norm_type: float = float(2.0),
    ) -> None:
        """Clips the gradients to a specific value"""
        # TODO: separate TPU case from here
        if clip_val is None:
@@ -98,26 +105,28 @@ def clip_gradients(self, optimizer: Optimizer, clip_val: Union[int, float], norm
            return

        parameters = list(self.master_params(optimizer))

        max_norm = grad_clip_val

        if isinstance(parameters, torch.Tensor):
            parameters = [parameters]
        parameters = list(filter(lambda p: p.grad is not None, parameters))

        device = parameters[0].device

        if norm_type == math.inf:
            total_norm = max(p.grad.data.abs().max() for p in parameters)
        else:
            out = torch.empty(len(parameters), device=device)
            for i, p in enumerate(parameters):
                torch.norm(p.grad.data.to(device), norm_type, out=out[i])
            total_norm = torch.norm(out, norm_type)

        eps = self.EPSILON

        clip_coef = torch.tensor(max_norm, device=device) / (total_norm + eps)
        clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef))
        for p in parameters:
            p.grad.data.mul_(clip_coef.to(p.grad.data.device))
        if gradient_clip_algorithm == GradClipAlgorithmType.VALUE:
            torch.nn.utils.clip_grad_value_(parameters, clip_value=grad_clip_val)
        elif gradient_clip_algorithm == GradClipAlgorithmType.NORM:
            max_norm = grad_clip_val

            if isinstance(parameters, torch.Tensor):
                parameters = [parameters]
            parameters = list(filter(lambda p: p.grad is not None, parameters))

            device = parameters[0].device

            if norm_type == math.inf:
                total_norm = max(p.grad.data.abs().max() for p in parameters)
            else:
                out = torch.empty(len(parameters), device=device)
                for i, p in enumerate(parameters):
                    torch.norm(p.grad.data.to(device), norm_type, out=out[i])
                total_norm = torch.norm(out, norm_type)

            eps = self.EPSILON

            clip_coef = torch.tensor(max_norm, device=device) / (total_norm + eps)
            clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef))
            for p in parameters:
                p.grad.data.mul_(clip_coef.to(p.grad.data.device))
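The 'norm' branch above keeps the plugin's hand-rolled norm clipping. Below is a standalone sketch of that math (a hypothetical helper, not the Lightning plugin; `eps` stands in for the plugin's `self.EPSILON`), checked against `torch.nn.utils.clip_grad_norm_` on a toy set of parameters:

```python
# Standalone sketch of the 'norm' branch above. It should agree with
# torch.nn.utils.clip_grad_norm_ up to the small epsilon term.
import math
import torch


def clip_grad_norm_like_plugin(parameters, max_norm, norm_type=2.0, eps=1e-6):
    parameters = [p for p in parameters if p.grad is not None]
    device = parameters[0].device
    if norm_type == math.inf:
        total_norm = max(p.grad.data.abs().max() for p in parameters)
    else:
        per_param = torch.stack([torch.norm(p.grad.data.to(device), norm_type) for p in parameters])
        total_norm = torch.norm(per_param, norm_type)
    clip_coef = torch.tensor(max_norm, device=device) / (total_norm + eps)
    clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef))
    for p in parameters:
        p.grad.data.mul_(clip_coef.to(p.grad.data.device))


params_a = [torch.nn.Parameter(torch.randn(5)) for _ in range(2)]
params_b = [torch.nn.Parameter(p.detach().clone()) for p in params_a]
for pa, pb in zip(params_a, params_b):
    pa.grad = torch.randn(5) * 10
    pb.grad = pa.grad.clone()

clip_grad_norm_like_plugin(params_a, max_norm=1.0)
torch.nn.utils.clip_grad_norm_(params_b, max_norm=1.0)
assert all(torch.allclose(pa.grad, pb.grad, atol=1e-4) for pa, pb in zip(params_a, params_b))
```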
21 changes: 17 additions & 4 deletions pytorch_lightning/plugins/precision/sharded_native_amp.py
@@ -13,10 +13,11 @@
# limitations under the License.
from typing import cast, Union

import torch
from torch.optim import Optimizer

from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin
from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, _NATIVE_AMP_AVAILABLE
from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, _NATIVE_AMP_AVAILABLE, GradClipAlgorithmType

if _NATIVE_AMP_AVAILABLE and _FAIRSCALE_AVAILABLE:
    from fairscale.optim import OSS
@@ -31,6 +32,18 @@ def __init__(self):
        super().__init__()
        self.scaler = ShardedGradScaler()

    def clip_gradients(self, optimizer: Optimizer, clip_val: Union[int, float], norm_type: float = float(2.0)):
        optimizer = cast(OSS, optimizer)
        optimizer.clip_grad_norm(clip_val, norm_type=norm_type)
    def clip_gradients(
        self,
        optimizer: Optimizer,
        clip_val: Union[int, float],
        gradient_clip_algorithm: str = GradClipAlgorithmType.NORM,
        norm_type: float = float(2.0),
    ):
        if gradient_clip_algorithm == GradClipAlgorithmType.VALUE:
            parameters = list(self.master_params(optimizer))
            if isinstance(parameters, torch.Tensor):
                parameters = [parameters]
            torch.nn.utils.clip_grad_value_(parameters, clip_value=clip_val)
        elif gradient_clip_algorithm == GradClipAlgorithmType.NORM:
            optimizer = cast(OSS, optimizer)
            optimizer.clip_grad_norm(clip_val, norm_type=norm_type)
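A design note on the branch split above, as a sketch: value clipping is element-wise, so each rank can clamp its own gradient shard locally, whereas norm clipping needs the gradient norm across all shards, which is why the 'norm' path defers to fairscale's OSS.clip_grad_norm. A toy, fairscale-free illustration:

```python
# Toy illustration (no fairscale required): per-element clamping needs no
# cross-rank communication, so clip-by-value can run on the local shard alone.
import torch

local_shard_grads = [torch.randn(8) * 5, torch.randn(8) * 5]  # pretend: this rank's shard
for g in local_shard_grads:
    g.clamp_(-0.5, 0.5)  # equivalent to clip-by-value with clip_val=0.5
assert max(float(g.abs().max()) for g in local_shard_grads) <= 0.5
```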
pytorch_lightning/trainer/connectors/training_trick_connector.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from pytorch_lightning.callbacks import GradientAccumulationScheduler
from pytorch_lightning.utilities import GradClipAlgorithmType
from pytorch_lightning.utilities.exceptions import MisconfigurationException


@@ -23,6 +24,7 @@ def __init__(self, trainer):
    def on_trainer_init(
        self,
        gradient_clip_val,
        gradient_clip_algorithm,
        track_grad_norm,
        accumulate_grad_batches,
        truncated_bptt_steps,
@@ -32,7 +34,11 @@ def on_trainer_init(
        self.trainer.terminate_on_nan = terminate_on_nan

        # gradient clipping
        if gradient_clip_algorithm not in [GradClipAlgorithmType.VALUE, GradClipAlgorithmType.NORM]:
            raise MisconfigurationException(f"gradient_clip_algorithm should be "
                                            f"'{GradClipAlgorithmType.VALUE}' or '{GradClipAlgorithmType.NORM}'")
        self.trainer.gradient_clip_val = gradient_clip_val
        self.trainer.gradient_clip_algorithm = gradient_clip_algorithm

        # gradient norm tracking
        if not isinstance(track_grad_norm, (int, float)) and track_grad_norm != 'inf':
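A hedged sketch of the guard added above: constructing a Trainer with an unrecognized algorithm name should fail fast with MisconfigurationException. The test name and the invalid string are made up for illustration:

```python
# Sketch of a test for the validation added above; 'norm2' is an arbitrary
# invalid value, and the test name is illustrative.
import pytest

from pytorch_lightning import Trainer
from pytorch_lightning.utilities.exceptions import MisconfigurationException


def test_invalid_gradient_clip_algorithm(tmpdir):
    with pytest.raises(MisconfigurationException):
        Trainer(default_root_dir=tmpdir, gradient_clip_algorithm='norm2')
```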
10 changes: 9 additions & 1 deletion pytorch_lightning/trainer/trainer.py
@@ -90,6 +90,7 @@ def __init__(
        callbacks: Optional[Union[List[Callback], Callback]] = None,
        default_root_dir: Optional[str] = None,
        gradient_clip_val: float = 0,
        gradient_clip_algorithm: str = 'norm',
        process_position: int = 0,
        num_nodes: int = 1,
        num_processes: int = 1,
@@ -201,6 +202,8 @@ def __init__(

            gradient_clip_val: 0 means don't clip.

            gradient_clip_algorithm: 'value' means clip_by_value, 'norm' means clip_by_norm. Defualt: 'norm'
Review comment (Contributor), suggested change:
-            gradient_clip_algorithm: 'value' means clip_by_value, 'norm' means clip_by_norm. Defualt: 'norm'
+            gradient_clip_algorithm: 'value' means clip_by_value, 'norm' means clip_by_norm. Default: 'norm'

dhkim0225 (Contributor, Author) commented on Mar 2, 2021:
@ananthsub All modifications have been completed based on comments. Can you check the last changes?
Thanks.


            limit_train_batches: How much of training dataset to check (floats = percent, int = num_batches)

            limit_val_batches: How much of validation dataset to check (floats = percent, int = num_batches)
@@ -355,7 +358,12 @@ def __init__(

        # init training tricks
        self.training_tricks_connector.on_trainer_init(
            gradient_clip_val, track_grad_norm, accumulate_grad_batches, truncated_bptt_steps, terminate_on_nan
            gradient_clip_val,
            gradient_clip_algorithm,
            track_grad_norm,
            accumulate_grad_batches,
            truncated_bptt_steps,
            terminate_on_nan,
        )

        # init train loop related flags
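As a minimal end-to-end check of the wiring above (assuming the attributes are stored on the trainer exactly as shown in this commit's connector change):

```python
# Minimal sketch: the Trainer argument flows through on_trainer_init and is
# stored on the trainer instance; the algorithm defaults to 'norm'.
from pytorch_lightning import Trainer

trainer = Trainer(gradient_clip_val=0.5)
assert trainer.gradient_clip_val == 0.5
assert trainer.gradient_clip_algorithm == 'norm'
```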
8 changes: 7 additions & 1 deletion pytorch_lightning/utilities/__init__.py
@@ -22,7 +22,13 @@
    rank_zero_only,
    rank_zero_warn,
)
from pytorch_lightning.utilities.enums import AMPType, DeviceType, DistributedType, LightningEnum  # noqa: F401
from pytorch_lightning.utilities.enums import (  # noqa: F401
    AMPType,
    DeviceType,
    DistributedType,
    GradClipAlgorithmType,
    LightningEnum,
)
from pytorch_lightning.utilities.imports import (  # noqa: F401
    _APEX_AVAILABLE,
    _BOLTS_AVAILABLE,
9 changes: 9 additions & 0 deletions pytorch_lightning/utilities/enums.py
@@ -84,3 +84,12 @@ class DeviceType(LightningEnum):
    CPU = 'CPU'
    GPU = 'GPU'
    TPU = 'TPU'


class GradClipAlgorithmType(LightningEnum):
    """ Define gradient_clip_algorithm types - training-tricks.
    >>> GradClipAlgorithmType.VALUE in ('value', 'norm')
    True
"""
VALUE = 'value'
NORM = 'norm'
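The doctest above works because LightningEnum members are string-backed and compare equal to plain strings. A toy enum (not the real LightningEnum) showing the same behaviour:

```python
# Toy stand-in for LightningEnum to illustrate the doctest above; the real
# class lives in pytorch_lightning/utilities/enums.py.
from enum import Enum


class ToyClipAlgorithm(str, Enum):
    VALUE = 'value'
    NORM = 'norm'


assert ToyClipAlgorithm.VALUE == 'value'            # str-backed members equal plain strings
assert ToyClipAlgorithm.VALUE in ('value', 'norm')  # hence the doctest's membership check
```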
13 changes: 13 additions & 0 deletions tests/models/test_horovod.py
@@ -17,6 +17,7 @@
import shlex
import subprocess
import sys
from copy import deepcopy

import numpy as np
import pytest
@@ -66,6 +67,13 @@ def _run_horovod(trainer_options, on_gpu=False):
    assert exit_code == 0


def _run_horovod_clip_grad_by_value(trainer_options, on_gpu=False):
    # clip_grad_by_value test
    trainer_options_clip_grad_val = deepcopy(trainer_options)
    trainer_options_clip_grad_val.update({'gradient_clip_algorithm': 'value'})
    _run_horovod(trainer_options_clip_grad_val, on_gpu)


@pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
def test_horovod_cpu(tmpdir):
    """Test Horovod running multi-process on CPU."""
@@ -81,6 +89,7 @@ def test_horovod_cpu(tmpdir):
        deterministic=True,
    )
    _run_horovod(trainer_options)
    _run_horovod_clip_grad_by_value(trainer_options)


@pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
@@ -97,6 +106,7 @@ def test_horovod_cpu_implicit(tmpdir):
        deterministic=True,
    )
    _run_horovod(trainer_options)
    _run_horovod_clip_grad_by_value(trainer_options)


@pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
@@ -117,6 +127,7 @@ def test_horovod_multi_gpu(tmpdir):
        accelerator='horovod',
    )
    _run_horovod(trainer_options, on_gpu=True)
    _run_horovod_clip_grad_by_value(trainer_options, on_gpu=True)


@pytest.mark.skip(reason="Horovod has a problem with broadcast when using apex?")
@@ -141,6 +152,7 @@ def test_horovod_apex(tmpdir):
        precision=16,
    )
    _run_horovod(trainer_options, on_gpu=True)
    _run_horovod_clip_grad_by_value(trainer_options, on_gpu=True)


@pytest.mark.skip(reason="Skip till Horovod fixes integration with Native torch.cuda.amp")
@@ -165,6 +177,7 @@ def test_horovod_amp(tmpdir):
        precision=16,
    )
    _run_horovod(trainer_options, on_gpu=True)
    _run_horovod_clip_grad_by_value(trainer_options, on_gpu=True)


@pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
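The Horovod tests above reuse existing trainer options and flip the algorithm via the deepcopy helper. An alternative, not part of the PR, would be to parametrize a dedicated test over both algorithms; the trainer options below are illustrative placeholders, and `_run_horovod` refers to the helper defined in tests/models/test_horovod.py above.

```python
# Alternative sketch (not in the PR): parametrize over both clipping algorithms
# instead of mutating a copy of the options. Trainer options are placeholders.
import pytest


@pytest.mark.parametrize("gradient_clip_algorithm", ["norm", "value"])
def test_horovod_cpu_clip_gradients(tmpdir, gradient_clip_algorithm):
    trainer_options = dict(
        default_root_dir=str(tmpdir),
        gradient_clip_val=1.0,
        gradient_clip_algorithm=gradient_clip_algorithm,
        max_epochs=1,
        limit_train_batches=0.4,
        limit_val_batches=0.2,
    )
    _run_horovod(trainer_options)
```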