Update hapi to support AMP #31417

Merged · 23 commits · Apr 15, 2021

Commits
fbd9374
make hapi support amp, and add unittest
LiuChiachi Mar 3, 2021
15e1f1b
make unittest only support GPU
LiuChiachi Mar 4, 2021
3e596c0
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
LiuChiachi Mar 16, 2021
9a04619
update parameters for amp in hapi.Model
LiuChiachi Mar 16, 2021
85277bb
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
LiuChiachi Mar 16, 2021
f21ad97
update hapi.Model.prepare interface, and update unittest
LiuChiachi Mar 25, 2021
6724f68
fix test_model.py unittest bug
LiuChiachi Mar 25, 2021
8ded59d
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
LiuChiachi Mar 25, 2021
f4859ac
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
LiuChiachi Mar 31, 2021
ea476c7
add grad clear in dygraph
LiuChiachi Mar 31, 2021
e1eee29
use_fp16_guard defaults to True, which could avoid nan
LiuChiachi Apr 8, 2021
ec09e24
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
LiuChiachi Apr 8, 2021
94d6562
add input check, and add internal doc link to low level api
LiuChiachi Apr 10, 2021
4f41806
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
LiuChiachi Apr 10, 2021
bdffa79
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
LiuChiachi Apr 12, 2021
ed29314
update doc, and decrease the sample num of dataset to avoid timeout
LiuChiachi Apr 12, 2021
12aedea
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
LiuChiachi Apr 12, 2021
d950b9f
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
LiuChiachi Apr 12, 2021
d779d22
make hapi amp param support str 'O1' or 'O2'
LiuChiachi Apr 12, 2021
6d2bded
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
LiuChiachi Apr 13, 2021
614d3a0
resume calling , modify the code of the check part
LiuChiachi Apr 13, 2021
fa57be7
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
LiuChiachi Apr 14, 2021
e0a4d62
upgrade the usage of Fleet API, and disable 'pure_fp16' param
LiuChiachi Apr 14, 2021
2 changes: 1 addition & 1 deletion python/paddle/hapi/callbacks.py
@@ -364,7 +364,7 @@ def on_train_begin(self, logs=None):
}
if self._is_print():
print(
"The loss value printed in the log is the current step, and the metric is the average value of previous step."
"The loss value printed in the log is the current step, and the metric is the average value of previous steps."
)

def on_epoch_begin(self, epoch=None, logs=None):
232 changes: 213 additions & 19 deletions python/paddle/hapi/model.py
@@ -41,15 +41,15 @@
from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
from paddle.fluid.layers.utils import flatten
from paddle.fluid.layers import collective
from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
from paddle.fluid.incubate.fleet.base import role_maker

from paddle.io import DataLoader, Dataset, DistributedBatchSampler
from paddle.fluid.executor import scope_guard, Executor
from paddle.fluid.dygraph.layers import Layer
from paddle.metric import Metric
from paddle.static import InputSpec as Input
import paddle.distributed as dist
import paddle.distributed.fleet as fleet
from paddle.distributed.fleet.base import role_maker

from .callbacks import config_callbacks, EarlyStopping
from .model_summary import summary
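
For context, the imports above move hapi from the old `paddle.fluid.incubate.fleet` collective API to `paddle.distributed.fleet`. Below is a minimal sketch of the upgraded collective AMP setup that the static-graph adapter builds on; it is illustrative only (not part of the changed files) and assumes a `paddle.distributed.launch` environment and Paddle 2.1-era APIs:

import paddle
import paddle.distributed.fleet as fleet
from paddle.distributed.fleet.base import role_maker

paddle.enable_static()

# Collective role discovery relies on environment variables set by
# `paddle.distributed.launch`.
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)

strategy = fleet.DistributedStrategy()
strategy.amp = True                          # enable the mixed precision pass
strategy.amp_configs = {
    'use_pure_fp16': False,                  # True corresponds to level 'O2'
    'custom_white_list': ['conv2d'],
}

optimizer = paddle.optimizer.Momentum(learning_rate=1e-3)
# AMP and gradient all-reduce are applied when minimize() is later called
# on a loss inside the main program.
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)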
@@ -252,6 +252,11 @@ def __init__(self, model):
self._nranks = ParallelEnv().nranks
self._local_rank = ParallelEnv().local_rank

self._amp_level = "O0"
self._amp_configs = {}
self._amp_custom_lists = {}
self._use_fp16_guard = True

@property
def mode(self):
return self.model.mode
@@ -550,11 +555,26 @@ def _make_program(self, mode):
if self._nranks > 1:
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
dist_strategy = DistributedStrategy()
dist_strategy.mode = "collective"
dist_strategy.collective_mode = "grad_allreduce"
dist_strategy = fleet.DistributedStrategy()
if self._amp_level != 'O0':
dist_strategy.amp = True
dist_strategy.amp_configs = self._amp_configs.copy()
dist_strategy.amp_configs.update(self._amp_custom_lists)
dist_strategy.amp_configs[
'use_pure_fp16'] = self._amp_level == 'O2'
self.model._optimizer = fleet.distributed_optimizer(
self.model._optimizer, strategy=dist_strategy)
elif self._amp_level != "O0" and core.is_compiled_with_cuda:
amp_lists = paddle.static.amp.AutoMixedPrecisionLists(
**self.
_amp_custom_lists) if self._amp_custom_lists else None

self.model._optimizer = paddle.static.amp.decorate(
self.model._optimizer,
amp_lists=amp_lists,
use_pure_fp16=self._amp_level == "O2",
use_fp16_guard=self._use_fp16_guard,
**self._amp_configs)

self.model._optimizer.minimize(self._loss_endpoint)

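
In the single-process static-graph branch above, the optimizer is wrapped by `paddle.static.amp.decorate`. The following standalone sketch of that wrapping is illustrative only (not part of the diff) and assumes Paddle 2.1-era static AMP APIs:

import paddle

paddle.enable_static()

# multi_precision keeps a float32 master copy of the weights, which the class
# docstring below recommends for pure float16 training.
optimizer = paddle.optimizer.Momentum(learning_rate=1e-3, multi_precision=True)

# Optional custom lists, mirroring self._amp_custom_lists.
amp_lists = paddle.static.amp.AutoMixedPrecisionLists(
    custom_white_list={'conv2d'})

optimizer = paddle.static.amp.decorate(
    optimizer,
    amp_lists=amp_lists,
    use_pure_fp16=True,     # corresponds to amp level 'O2'; False means 'O1'
    use_fp16_guard=True,    # only regions inside paddle.static.amp.fp16_guard() run in fp16
    init_loss_scaling=2.**15)

# optimizer.minimize(loss) is then called as usual; for pure float16,
# optimizer.amp_init(place) must also run after the startup program
# (see the _compile_and_initialize change below).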
@@ -598,6 +618,10 @@ def _compile_and_initialize(self, prog, mode):
startup_prog = self._startup_prog._prune(uninitialized)
self._executor.run(startup_prog)

if self._amp_level == "O2" and mode == 'train' and core.is_compiled_with_cuda(
):
self.model._optimizer.amp_init(place)

if self._nranks < 2:
compiled_prog = fluid.CompiledProgram(prog)
else:
@@ -620,6 +644,11 @@ def __init__(self, model):
}

self._input_info = None
self._amp_level = "O0"
self._amp_configs = {}
self._amp_custom_lists = {}
self._use_fp16_guard = True

if self._nranks > 1:
dist.init_parallel_env()
stradegy = fluid.dygraph.parallel.ParallelStrategy()
@@ -649,19 +678,30 @@ def train_batch(self, inputs, labels=None):
labels = labels or []
labels = [to_variable(l) for l in to_list(labels)]

if self._nranks > 1:
outputs = self.ddp_model.forward(* [to_variable(x) for x in inputs])
else:
outputs = self.model.network.forward(
* [to_variable(x) for x in inputs])
if self._amp_level != "O0":
scaler = paddle.amp.GradScaler(**self._amp_configs)
with paddle.amp.auto_cast(
enable=self._amp_level != 'O0', **self._amp_custom_lists):
if self._nranks > 1:
outputs = self.ddp_model.forward(
* [to_variable(x) for x in inputs])
else:
outputs = self.model.network.forward(
* [to_variable(x) for x in inputs])

losses = self.model._loss(*(to_list(outputs) + labels))
losses = to_list(losses)
final_loss = fluid.layers.sum(losses)
final_loss.backward()
losses = self.model._loss(*(to_list(outputs) + labels))
losses = to_list(losses)
final_loss = fluid.layers.sum(losses)

self.model._optimizer.minimize(final_loss)
self.model.network.clear_gradients()
if self._amp_level != "O0":
Review comment (Contributor):

dygraph does not support O2 yet?

Reply (Contributor Author):

When O2 is chosen in dygraph, it is treated as O1 and a warning is issued to the user. Is this correct?
scaled = scaler.scale(final_loss)
scaled.backward()
scaler.minimize(self.model._optimizer, scaled)
self.model.network.clear_gradients()
else:
final_loss.backward()
self.model._optimizer.minimize(final_loss)
self.model.network.clear_gradients()

metrics = []
for metric in self.model._metrics:
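
For reference, the dygraph branch above follows the standard `paddle.amp` pattern: run the forward pass under auto_cast, then scale the loss with a GradScaler before backward and minimize. A minimal self-contained sketch of that pattern (not part of the diff), assuming a GPU device and Paddle 2.1-era dygraph APIs:

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

net = nn.Linear(10, 2)
optimizer = paddle.optimizer.SGD(learning_rate=1e-3, parameters=net.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=2.**15)

x = paddle.rand([4, 10])
label = paddle.randint(0, 2, [4, 1])

# Level 'O1': whitelisted ops run in float16, the rest stay in float32.
with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}):
    logits = net(x)
    loss = F.cross_entropy(logits, label)

scaled = scaler.scale(loss)          # scale the loss to avoid fp16 underflow
scaled.backward()
scaler.minimize(optimizer, scaled)   # unscale gradients, then optimizer step
net.clear_gradients()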
@@ -816,6 +856,16 @@ class Model(object):
instantiating a Model. The input description, i.e, paddle.static.InputSpec,
must be required for static graph.

When training on GPU, auto mixed precision (AMP) training is supported, and
pure float16 training is also supported in static mode when using the Adam,
AdamW or Momentum optimizer. Before using pure float16 training,
`multi_precision` could be set to True when creating the optimizer, which
helps avoid poor accuracy or slow convergence, and inputs of float dtype
should be cast to float16 by the user. The user should also use the
`paddle.static.amp.fp16_guard` API to limit the scope of pure float16
training; otherwise, 'use_fp16_guard' should be set to False. Note that
limiting the scope with `fp16_guard` is not supported during AMP (O1) training.

Args:
network (paddle.nn.Layer): The network is an instance of
paddle.nn.Layer.
@@ -830,6 +880,8 @@ class Model(object):


Examples:
1. A common example

.. code-block:: python

import paddle
Expand All @@ -838,7 +890,7 @@ class Model(object):
from paddle.static import InputSpec

device = paddle.set_device('cpu') # or 'gpu'

net = nn.Sequential(
nn.Flatten(1),
nn.Linear(784, 200),
@@ -852,6 +904,7 @@ class Model(object):
model = paddle.Model(net, input, label)
optim = paddle.optimizer.SGD(learning_rate=1e-3,
parameters=model.parameters())

model.prepare(optim,
paddle.nn.CrossEntropyLoss(),
paddle.metric.Accuracy())
@@ -862,6 +915,43 @@ class Model(object):
])
data = paddle.vision.datasets.MNIST(mode='train', transform=transform)
model.fit(data, epochs=2, batch_size=32, verbose=1)


2. An example using mixed precision training.

.. code-block:: python

import paddle
import paddle.nn as nn
import paddle.vision.transforms as T

def run_example_code():
device = paddle.set_device('gpu')

net = nn.Sequential(nn.Flatten(1), nn.Linear(784, 200), nn.Tanh(),
nn.Linear(200, 10))

model = paddle.Model(net)
optim = paddle.optimizer.SGD(learning_rate=1e-3, parameters=model.parameters())

amp_configs = {
"level": "O1",
"custom_white_list": {'conv2d'},
"use_dynamic_loss_scaling": True
}
model.prepare(optim,
paddle.nn.CrossEntropyLoss(),
paddle.metric.Accuracy(),
amp_configs=amp_configs)

transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
data = paddle.vision.datasets.MNIST(mode='train', transform=transform)
model.fit(data, epochs=2, batch_size=32, verbose=1)

# mixed precision training is only supported on GPU now.
if paddle.is_compiled_with_cuda():
run_example_code()

"""

def __init__(self, network, inputs=None, labels=None):
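
As an aside to the class docstring above, here is a hypothetical sketch of the pure float16 ('O2') static-mode path it describes, using the amp_configs interface added by this PR; the network, optimizer settings, and data pipeline are illustrative assumptions and a CUDA build is required:

import paddle
import paddle.nn as nn
import paddle.vision.transforms as T
from paddle.static import InputSpec

paddle.enable_static()              # pure float16 is a static-mode feature here
paddle.set_device('gpu')

net = nn.Sequential(nn.Flatten(1), nn.Linear(784, 200), nn.ReLU(),
                    nn.Linear(200, 10))
inputs = InputSpec([None, 1, 28, 28], 'float32', 'image')
labels = InputSpec([None, 1], 'int64', 'label')
model = paddle.Model(net, inputs, labels)

# multi_precision keeps a float32 master copy of the parameters.
optim = paddle.optimizer.Momentum(learning_rate=1e-3,
                                  parameters=net.parameters(),
                                  multi_precision=True)

# use_fp16_guard is set to False because this plain Sequential network does
# not wrap any region in paddle.static.amp.fp16_guard.
model.prepare(optim, paddle.nn.CrossEntropyLoss(), paddle.metric.Accuracy(),
              amp_configs={'level': 'O2', 'use_fp16_guard': False})

transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
data = paddle.vision.datasets.MNIST(mode='train', transform=transform)
model.fit(data, epochs=1, batch_size=32, verbose=1)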
@@ -1241,7 +1331,94 @@ def parameters(self, *args, **kwargs):
"""
return self._adapter.parameters()

def prepare(self, optimizer=None, loss=None, metrics=None):
def _prepare_amp(self, amp_configs):
def _check_pure_fp16_configs():
# pure float16 training has some restrictions now
if self._adapter._amp_level == "O2":
if in_dygraph_mode():
warnings.warn("Pure float16 training is not supported in dygraph mode now, "\
"and it will be supported in future version.")
else:
# grad clip is not supported in pure fp16 training now
assert self._optimizer._grad_clip is None, \
"Grad clip is not supported in pure float16 training now, and it will be supported in future version."

self._adapter._amp_custom_lists = {}
self._adapter._amp_configs = {}

# check and get level of mixed precision training
if not amp_configs:
self._adapter._amp_level = 'O0'
return
elif isinstance(amp_configs, str):
if amp_configs not in ('O0', 'O1', 'O2'):
raise ValueError(
"The level of amp_configs should be 'O0', 'O1' or 'O2'.")
self._adapter._amp_level = amp_configs
_check_pure_fp16_configs()
return
else:
if 'level' not in amp_configs:
self._adapter._amp_level = 'O1'
elif amp_configs['level'] not in ('O0', 'O1', 'O2'):
raise ValueError(
"amp_configs['level'] should be 'O0', 'O1' or 'O2'.")
else:
self._adapter._amp_level = amp_configs['level']
amp_config_key_set = set(amp_configs.keys()) - {'level'}
if not amp_config_key_set or self._adapter._amp_level == 'O0':
return

if 'use_pure_fp16' in amp_configs:
raise ValueError(
"''use_pure_fp16' is an invalid parameter, "
"the level of mixed precision training only depends on 'O1' or 'O2'."
)

_check_pure_fp16_configs()

# construct amp_custom_lists
if self._adapter._amp_level != 'O0' and amp_config_key_set:
for param_name in [
'custom_white_list', 'custom_black_list',
'custom_black_varnames'
]:
if param_name in amp_config_key_set:
self._adapter._amp_custom_lists[param_name] = amp_configs[
param_name]
amp_config_key_set -= {param_name}

def _check_amp_configs(amp_config_key_set):
accepted_param_set = {
'init_loss_scaling',
'incr_ratio',
'decr_ratio',
'incr_every_n_steps',
'decr_every_n_nan_or_inf',
'use_dynamic_loss_scaling',
'use_fp16_guard',
}
if amp_config_key_set - accepted_param_set:
raise ValueError(
"Except for 'level', the keys of 'amp_configs' must be accepted by mixed precision APIs, "
"but {} could not be recognized.".format(
tuple(amp_config_key_set - accepted_param_set)))

if 'use_fp16_guard' in amp_config_key_set:
if in_dygraph_mode():
raise ValueError(
"'use_fp16_guard' is supported in static mode only.")
self._adapter._use_fp16_guard = amp_configs['use_fp16_guard']
amp_config_key_set.remove('use_fp16_guard')

return amp_config_key_set

amp_configs_set = _check_amp_configs(amp_config_key_set)
for key in amp_configs_set:
self._adapter._amp_configs[key] = amp_configs[key]

def prepare(self, optimizer=None, loss=None, metrics=None,
amp_configs=None):
"""
Configures the model before running.

@@ -1255,7 +1432,23 @@ def prepare(self, optimizer=None, loss=None, metrics=None):
It can be None when there is no loss.
metrics (Metric|list of Metric|None): If metrics is set, all
metrics will be calculated and output in train/eval mode.

amp_configs (str|dict|None): AMP configurations. If AMP or pure
float16 training is used, the key 'level' of 'amp_configs'
should be set to 'O1' or 'O2' respectively. Otherwise, the
value of 'level' defaults to 'O0', which means float32
training. In addition to 'level', users could pass in more
parameters consistent with the mixed precision API. The
supported keys are: 'init_loss_scaling', 'incr_ratio',
'decr_ratio', 'incr_every_n_steps', 'decr_every_n_nan_or_inf',
'use_dynamic_loss_scaling', 'custom_white_list',
'custom_black_list', and 'custom_black_varnames'.
'use_fp16_guard' is additionally supported, but only in static
mode. Users could refer to the mixed precision API
documentation :ref:`api_paddle_amp_auto_cast` and
:ref:`api_paddle_amp_GradScaler` for details. For convenience,
'amp_configs' could be set to 'O1' or 'O2' if no extra
parameters are needed. 'amp_configs' could be None for float32
training. Default: None.
Returns:
None
"""
@@ -1292,6 +1485,7 @@ def prepare(self, optimizer=None, loss=None, metrics=None):
"{} is not sub class of Metric".format(
metric.__class__.__name__)
self._metrics = to_list(metrics)
self._prepare_amp(amp_configs)

if not in_dygraph_mode():
self._adapter.prepare()