From 12d076f7476d949ac614cd30235dfa17c62f7151 Mon Sep 17 00:00:00 2001
From: Sean Naren
Date: Wed, 25 Aug 2021 15:44:29 +0100
Subject: [PATCH] [docs] Add Mixed Precision detailed docs (#9104)

---
 docs/source/advanced/mixed_precision.rst | 83 ++++++++++++++++++++++++
 docs/source/conf.py                      |  2 +
 docs/source/guides/speed.rst             |  4 +-
 docs/source/index.rst                    |  1 +
 4 files changed, 88 insertions(+), 2 deletions(-)
 create mode 100644 docs/source/advanced/mixed_precision.rst

diff --git a/docs/source/advanced/mixed_precision.rst b/docs/source/advanced/mixed_precision.rst
new file mode 100644
index 0000000000000..ea784f0894a00
--- /dev/null
+++ b/docs/source/advanced/mixed_precision.rst
@@ -0,0 +1,83 @@
+.. testsetup:: *
+
+    from pytorch_lightning import Trainer
+
+
+.. _amp:
+
+Mixed Precision Training
+========================
+
+Mixed precision combines the use of both FP32 and lower-precision floating point formats (such as FP16) to reduce the memory footprint during model training, resulting in improved performance.
+
+Lightning offers mixed precision training for GPUs and CPUs, as well as bfloat16 mixed precision training for TPUs.
+
+.. note::
+
+    In some cases it is important to remain in FP32 for numerical stability, so keep this in mind when using mixed precision.
+
+    For example, when running scatter operations during the forward pass (such as torchpoint3d), computation must remain in FP32.
+
+FP16 Mixed Precision
+--------------------
+
+In most cases, mixed precision uses FP16. Supported PyTorch operations are automatically run in FP16, saving memory and improving throughput on GPU and TPU accelerators.
+
+Since computation happens in FP16, there is a chance of numerical instability. This is handled internally by a dynamic gradient scaler, which skips invalid steps and adjusts the scale to ensure subsequent steps fall within a finite range. For more information, `see the autocast docs <https://pytorch.org/docs/stable/amp.html>`__.
+
+.. note::
+
+    When using TPUs, setting ``precision=16`` will enable bfloat16, which is the only supported precision type on TPUs.
+
+.. testcode::
+    :skipif: not _APEX_AVAILABLE and not _NATIVE_AMP_AVAILABLE or not torch.cuda.is_available()
+
+    Trainer(gpus=1, precision=16)
+
+BFloat16 Mixed Precision
+------------------------
+
+.. warning::
+
+    BFloat16 requires PyTorch 1.10 or later. Currently this requires installing `PyTorch Nightly <https://pytorch.org/get-started/locally/>`__.
+
+    BFloat16 is also experimental and may not provide large speedups or memory improvements, but it offers better numerical stability.
+
+    Do note that for GPUs, the largest benefits require `Ampere <https://en.wikipedia.org/wiki/Ampere_(microarchitecture)>`__ based GPUs, such as A100s or 3090s.
+
+BFloat16 mixed precision is similar to FP16 mixed precision; however, we maintain more of the "dynamic range" that FP32 has to offer. This means we are able to improve numerical stability compared to FP16 mixed precision. For more information, see `this TPU performance blog post <https://cloud.google.com/blog/products/ai-machine-learning/bfloat16-the-secret-to-high-performance-on-cloud-tpus>`__.
+
+Since BFloat16 is more stable than FP16 during training, we do not need to worry about any gradient scaling or NaN gradient values that come with using FP16 mixed precision.
+
+.. testcode::
+    :skipif: not _TORCH_BFLOAT_AVAILABLE
+
+    Trainer(gpus=1, precision="bf16")
+
+It is also possible to use BFloat16 mixed precision on the CPU, relying on MKLDNN under the hood.
+
+.. testcode::
+    :skipif: not _TORCH_CPU_AMP_AVAILABLE
+
+    Trainer(precision="bf16")
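+
+Putting this together, a minimal sketch of an end-to-end BFloat16 run needs nothing beyond the flag shown above; ``MyLightningModule`` below is a placeholder for your own ``LightningModule``, and ``gpus=1`` assumes a single GPU is available (drop it to train on the CPU):
+
+.. code-block:: python
+
+    from pytorch_lightning import Trainer
+
+    # MyLightningModule is a placeholder for your own LightningModule.
+    model = MyLightningModule()
+
+    # No manual autocast context or gradient scaler is required;
+    # the Trainer applies the precision setting internally.
+    trainer = Trainer(gpus=1, precision="bf16")
+    trainer.fit(model)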
+
+NVIDIA APEX Mixed Precision
+---------------------------
+
+.. warning::
+
+    We strongly recommend using the native mixed precision described above rather than NVIDIA APEX, unless you require finer control.
+
+`NVIDIA APEX <https://github.com/NVIDIA/apex>`__ offers some additional flexibility in setting mixed precision. This can be useful when trying out different precision configurations, such as keeping most of your weights in FP16 as well as running computation in FP16.
+
+.. testcode::
+    :skipif: not _APEX_AVAILABLE and not _NATIVE_AMP_AVAILABLE or not torch.cuda.is_available()
+
+    Trainer(gpus=1, amp_backend="apex")
+
+Set the `NVIDIA optimization level <https://nvidia.github.io/apex/amp.html#opt-levels>`__ via the Trainer.
+
+.. testcode::
+    :skipif: not _APEX_AVAILABLE and not _NATIVE_AMP_AVAILABLE or not torch.cuda.is_available()
+
+    Trainer(gpus=1, amp_backend="apex", amp_level="O2")
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 8ddc896b6e912..4adbacd4cf60c 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -370,6 +370,8 @@ def package_list_from_file(file):
     _XLA_AVAILABLE,
     _TPU_AVAILABLE,
     _TORCHVISION_AVAILABLE,
+    _TORCH_BFLOAT_AVAILABLE,
+    _TORCH_CPU_AMP_AVAILABLE,
     _module_available,
 )
 _JSONARGPARSE_AVAILABLE = _module_available("jsonargparse")
diff --git a/docs/source/guides/speed.rst b/docs/source/guides/speed.rst
index 4e3ed0b1de801..fd245e741b9aa 100644
--- a/docs/source/guides/speed.rst
+++ b/docs/source/guides/speed.rst
@@ -186,7 +186,7 @@ Read more in our :ref:`accelerators` and :ref:`plugins` guides.
 
 -----------
 
-.. _amp:
+.. _speed_amp:
 
 *********************************
 Mixed precision (16-bit) training
@@ -210,7 +210,7 @@ Mixed precision (16-bit) training
 
 Mixed precision combines the use of both 32 and 16 bit floating points to reduce memory footprint during model training, resulting in improved performance, achieving +3X speedups on modern GPUs.
 
-Lightning offers mixed precision or 16-bit training for GPUs and TPUs.
+Lightning offers mixed precision training for GPUs and CPUs, as well as bfloat16 mixed precision training for TPUs.
 
 .. testcode::
 
diff --git a/docs/source/index.rst b/docs/source/index.rst
index f3c154a7d257b..e1de1ed30defa 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -54,6 +54,7 @@ PyTorch Lightning Documentation
    common/loggers
    advanced/multi_gpu
    advanced/advanced_gpu
+   advanced/mixed_precision
    common/weights_loading
    advanced/checkpoint_io
    common/optimizers