Commit b37c2d8

fix bugs on v100.
JamesLim-sy committed Oct 12, 2021
1 parent ed74877 commit b37c2d8
Showing 2 changed files with 69 additions and 4 deletions.
4 changes: 2 additions & 2 deletions paddle/fluid/operators/optimizers/lars_momentum_op.cu
@@ -38,8 +38,8 @@ class LarsThreadConfig {
public:
int grid_for_norm;
int grid_for_lars;

#if CUDA_VERSION >= 11000

private:
int grid_stride;

@@ -55,7 +55,7 @@ class LarsThreadConfig {
return (numel + grid_stride - 1) / grid_stride - 1;
}
#else
const int repeat_times;
int repeat_times;
explicit LarsThreadConfig(const int64_t numel) {
int grid = (numel + LARS_BLOCK_SIZE - 1) / LARS_BLOCK_SIZE;
grid_for_norm = std::min(grid, LARS_BLOCK_SIZE);
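Reading note for the hunks above: the substantive change in this file is that repeat_times loses its const qualifier in the pre-CUDA-11 #else branch. A const member that is only assigned inside the constructor body (which the truncated constructor suggests) would not compile, so the relaxation fits the commit's "fix bugs" intent. The sketch below restates the grid-sizing arithmetic visible in the hunks in plain Python; the LARS_BLOCK_SIZE value of 512 and the way grid_stride is derived from grid_for_norm are assumptions for illustration, not taken from this diff.

# Hedged sketch of the LarsThreadConfig sizing arithmetic shown above.
# Assumptions (not in the diff): LARS_BLOCK_SIZE = 512 and
# grid_stride = LARS_BLOCK_SIZE * grid_for_norm.
LARS_BLOCK_SIZE = 512


def lars_thread_config(numel):
    # grid = ceil(numel / LARS_BLOCK_SIZE), as in the #else constructor above
    grid = (numel + LARS_BLOCK_SIZE - 1) // LARS_BLOCK_SIZE
    grid_for_norm = min(grid, LARS_BLOCK_SIZE)
    # assumed stride covered by one launch of grid_for_norm blocks
    grid_stride = LARS_BLOCK_SIZE * grid_for_norm
    # mirrors the CUDA >= 11 branch above: ceil(numel / grid_stride) - 1,
    # i.e. the number of extra grid-stride passes each thread makes
    repeat_times = (numel + grid_stride - 1) // grid_stride - 1
    return grid_for_norm, repeat_times


print(lars_thread_config(10_000_000))  # (512, 38) under these assumptions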
69 changes: 67 additions & 2 deletions (second changed file: a Python FP16 training unit test)
@@ -97,7 +97,10 @@ def layer_warp(block_func, input, ch_in, ch_out, count, stride):
    return pool


def train(use_pure_fp16=True, use_nesterov=False, optimizer=""):
def train(use_pure_fp16=True,
          use_nesterov=False,
          optimizer="",
          open_merge_option=False):
    classdim = 10
    data_shape = [3, 32, 32]
    PASS_NUM = 1
@@ -129,7 +132,8 @@ def train(use_pure_fp16=True, use_nesterov=False, optimizer=""):
        optimizer = paddle.fluid.optimizer.LarsMomentumOptimizer(
            learning_rate=0.001,
            momentum=0.9,
            multi_precision=use_pure_fp16)
            multi_precision=use_pure_fp16,
            merge_option=open_merge_option)
    else:
        optimizer = paddle.optimizer.Momentum(
            learning_rate=0.001,
            momentum=0.9,
@@ -238,10 +242,71 @@ def do_test(use_nesterov=False, optimizer=""):
                    equal_nan=True),
                msg='Failed to test in pure FP16.')

def do_merge_test(optimizer=""):
if optimizer is "Lars":
suffix = "use Lars "
with self.scope_prog_guard():
print("-----------------FP16 Merged Train {}-----------------".
format(suffix))
train_loss_fp16_merge, test_loss_fp16_merge = train(
use_pure_fp16=True,
open_merge_option=True,
optimizer=optimizer)
with self.scope_prog_guard():
print("-----------------FP32 Merged Train {}-----------------".
format(suffix))
train_loss_fp32_merge, test_loss_fp32_merge = train(
use_pure_fp16=False,
open_merge_option=True,
optimizer=optimizer)

with self.scope_prog_guard():
print("-----------------FP32 Validation {}-----------------".
format(suffix))
train_loss_fp32, test_loss_fp32 = train(
use_pure_fp16=False,
open_merge_option=False,
optimizer=optimizer)

self.assertTrue(
np.allclose(
np.array(train_loss_fp16_merge),
np.array(train_loss_fp32),
rtol=1e-02,
atol=1e-05,
equal_nan=True),
msg='Failed to train in merged FP16.')
self.assertTrue(
np.allclose(
np.array(train_loss_fp32_merge),
np.array(train_loss_fp32),
rtol=1e-02,
atol=1e-05,
equal_nan=True),
msg='Failed to train in merged FP32.')

self.assertTrue(
np.allclose(
np.array(test_loss_fp16_merge),
np.array(test_loss_fp32),
rtol=1e-02,
atol=1e-05,
equal_nan=True),
msg='Failed to test in pure FP16.')
self.assertTrue(
np.allclose(
np.array(test_loss_fp32_merge),
np.array(test_loss_fp32),
rtol=1e-02,
atol=1e-05,
equal_nan=True),
msg='Failed to test in pure FP32.')

do_test(use_nesterov=False)
do_test(use_nesterov=True)
do_test(optimizer="Adam")
do_test(optimizer="Lars")
do_merge_test(optimizer="Lars")

    @contextlib.contextmanager
    def scope_prog_guard(self):
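The new do_merge_test helper mirrors do_test: it trains the merged FP16 and merged FP32 paths plus an unmerged FP32 baseline, then requires the loss curves to agree within loose tolerances. Below is a condensed sketch of that comparison pattern using the same tolerances as the assertions above; the helper name and the numbers in the usage line are illustrative, not from the test file.

import numpy as np


def assert_losses_close(candidate, baseline, msg):
    # Same tolerances as the test: 1% relative, 1e-5 absolute, NaNs compare equal.
    assert np.allclose(
        np.array(candidate),
        np.array(baseline),
        rtol=1e-02,
        atol=1e-05,
        equal_nan=True), msg


# Illustrative usage: a merged run is accepted if it stays within roughly 1% of
# the plain FP32 baseline produced with open_merge_option=False.
assert_losses_close([0.9, 0.5], [0.902, 0.498],
                    'Failed to train in merged FP16.')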
