From b37c2d86b637c04ae2198457b085ef2cdc665fff Mon Sep 17 00:00:00 2001
From: JamesLim-sy
Date: Tue, 12 Oct 2021 11:49:29 +0000
Subject: [PATCH] fix bugs on v100.

---
 .../operators/optimizers/lars_momentum_op.cu  |  4 +-
 .../tests/test_multi_precision_fp16_train.py | 69 ++++++++++++++++++-
 2 files changed, 69 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu
index 867a9f209a183..c6de3bdd15dde 100644
--- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu
+++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu
@@ -38,8 +38,8 @@ class LarsThreadConfig {
  public:
   int grid_for_norm;
   int grid_for_lars;
-
 #if CUDA_VERSION >= 11000
+
  private:
   int grid_stride;
 
@@ -55,7 +55,7 @@ class LarsThreadConfig {
     return (numel + grid_stride - 1) / grid_stride - 1;
   }
 #else
-  const int repeat_times;
+  int repeat_times;
   explicit LarsThreadConfig(const int64_t numel) {
     int grid = (numel + LARS_BLOCK_SIZE - 1) / LARS_BLOCK_SIZE;
     grid_for_norm = std::min(grid, LARS_BLOCK_SIZE);
diff --git a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py
index 92786f2835277..5832a17d70fb8 100644
--- a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py
+++ b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py
@@ -97,7 +97,10 @@ def layer_warp(block_func, input, ch_in, ch_out, count, stride):
     return pool
 
 
-def train(use_pure_fp16=True, use_nesterov=False, optimizer=""):
+def train(use_pure_fp16=True,
+          use_nesterov=False,
+          optimizer="",
+          open_merge_option=False):
     classdim = 10
     data_shape = [3, 32, 32]
     PASS_NUM = 1
@@ -129,7 +132,8 @@ def train(use_pure_fp16=True, use_nesterov=False, optimizer=""):
             optimizer = paddle.fluid.optimizer.LarsMomentumOptimizer(
                 learning_rate=0.001,
                 momentum=0.9,
-                multi_precision=use_pure_fp16)
+                multi_precision=use_pure_fp16,
+                merge_option=open_merge_option)
         else:
             optimizer = paddle.optimizer.Momentum(
                 learning_rate=0.001,
@@ -238,10 +242,71 @@ def do_test(use_nesterov=False, optimizer=""):
                     equal_nan=True),
                 msg='Failed to test in pure FP16.')
 
+        def do_merge_test(optimizer=""):
+            if optimizer == "Lars":
+                suffix = "use Lars "
+            with self.scope_prog_guard():
+                print("-----------------FP16 Merged Train {}-----------------".
+                      format(suffix))
+                train_loss_fp16_merge, test_loss_fp16_merge = train(
+                    use_pure_fp16=True,
+                    open_merge_option=True,
+                    optimizer=optimizer)
+            with self.scope_prog_guard():
+                print("-----------------FP32 Merged Train {}-----------------".
+                      format(suffix))
+                train_loss_fp32_merge, test_loss_fp32_merge = train(
+                    use_pure_fp16=False,
+                    open_merge_option=True,
+                    optimizer=optimizer)
+
+            with self.scope_prog_guard():
+                print("-----------------FP32 Validation {}-----------------".
+                      format(suffix))
+                train_loss_fp32, test_loss_fp32 = train(
+                    use_pure_fp16=False,
+                    open_merge_option=False,
+                    optimizer=optimizer)
+
+            self.assertTrue(
+                np.allclose(
+                    np.array(train_loss_fp16_merge),
+                    np.array(train_loss_fp32),
+                    rtol=1e-02,
+                    atol=1e-05,
+                    equal_nan=True),
+                msg='Failed to train in merged FP16.')
+            self.assertTrue(
+                np.allclose(
+                    np.array(train_loss_fp32_merge),
+                    np.array(train_loss_fp32),
+                    rtol=1e-02,
+                    atol=1e-05,
+                    equal_nan=True),
+                msg='Failed to train in merged FP32.')
+
+            self.assertTrue(
+                np.allclose(
+                    np.array(test_loss_fp16_merge),
+                    np.array(test_loss_fp32),
+                    rtol=1e-02,
+                    atol=1e-05,
+                    equal_nan=True),
+                msg='Failed to test in merged FP16.')
+            self.assertTrue(
+                np.allclose(
+                    np.array(test_loss_fp32_merge),
+                    np.array(test_loss_fp32),
+                    rtol=1e-02,
+                    atol=1e-05,
+                    equal_nan=True),
+                msg='Failed to test in merged FP32.')
+
         do_test(use_nesterov=False)
         do_test(use_nesterov=True)
         do_test(optimizer="Adam")
         do_test(optimizer="Lars")
+        do_merge_test(optimizer="Lars")
 
     @contextlib.contextmanager
     def scope_prog_guard(self):
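
For reviewers, a minimal standalone sketch of the configuration the new do_merge_test path exercises: a tiny static-graph program minimized with LarsMomentumOptimizer using multi_precision together with the merge_option flag that the patched train() forwards. The toy network, feed data, and variable names are illustrative only, and the sketch assumes a build of this branch whose Python-side LarsMomentumOptimizer accepts the merge_option keyword; it is not the test itself.

    # Illustrative sketch only (not part of the patch): exercises LarsMomentumOptimizer
    # with multi_precision and the merge_option flag used by the new test path.
    import numpy as np
    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()

    main_prog, startup_prog = fluid.Program(), fluid.Program()
    with fluid.program_guard(main_prog, startup_prog):
        x = fluid.data(name='x', shape=[None, 16], dtype='float32')
        y = fluid.data(name='y', shape=[None, 1], dtype='float32')
        pred = fluid.layers.fc(input=x, size=1)
        loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))

        # merge_option is assumed to be the knob added on this branch; released
        # Paddle versions may not accept this keyword.
        optimizer = paddle.fluid.optimizer.LarsMomentumOptimizer(
            learning_rate=0.001,
            momentum=0.9,
            multi_precision=True,
            merge_option=True)
        optimizer.minimize(loss)

    place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)
    feed = {'x': np.random.rand(8, 16).astype('float32'),
            'y': np.random.rand(8, 1).astype('float32')}
    loss_val, = exe.run(main_prog, feed=feed, fetch_list=[loss])
    print("loss:", float(loss_val))

The test added above runs this kind of program three times (merged FP16, merged FP32, and an unmerged FP32 baseline) and asserts that the losses agree within rtol=1e-02 / atol=1e-05.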