From 81111d9bf9778fa3368627fbdb5221edda676559 Mon Sep 17 00:00:00 2001 From: Anirudh Acharya Date: Sat, 31 Aug 2019 01:00:23 +0000 Subject: [PATCH 1/3] fix update rules --- src/operator/optimizer_op-inl.h | 41 +++++++++---------------- tests/python/unittest/test_optimizer.py | 16 +++++----- 2 files changed, 22 insertions(+), 35 deletions(-) diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 3c03f8061d87..af412b7fa667 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -1066,21 +1066,14 @@ struct NAGMomKernel { const DType param_lr, const DType param_wd, const DType param_rescale_grad, const OpReqType req) { if (param_clip_gradient >= 0.0f) { - mom_data[i] = param_momentum*mom_data[i] - + mshadow_op::clip::Map(param_rescale_grad*grad_data[i], - param_clip_gradient) - + (param_wd*weight_data[i]); - KERNEL_ASSIGN(out_data[i], req, weight_data[i] - - param_lr*(param_momentum*mom_data[i] - + mshadow_op::clip::Map(param_rescale_grad*grad_data[i], - param_clip_gradient))); + mom_data[i] = param_momentum*mom_data[i]; + KERNEL_ASSIGN(out_data[i], req, weight_data[i]-mom_data[i]+(param_momentum+1) + *(mom_data[i]-(param_lr*(mshadow_op::clip::Map(param_rescale_grad + *grad_data[i],param_clip_gradient)+(param_wd*weight_data[i]))))); } else { - mom_data[i] = param_momentum*mom_data[i] - + param_rescale_grad*grad_data[i] - + (param_wd*weight_data[i]); - KERNEL_ASSIGN(out_data[i], req, weight_data[i] - - param_lr*(param_momentum*mom_data[i] - + param_rescale_grad*grad_data[i])); + mom_data[i] = param_momentum*mom_data[i]; + KERNEL_ASSIGN(out_data[i], req, weight_data[i]-mom_data[i]+(param_momentum+1) + *(mom_data[i]-(param_lr*(param_rescale_grad*grad_data[i]+param_wd*weight_data[i])))); } } }; @@ -1119,22 +1112,16 @@ struct MP_NAGMomKernel { const OpReqType req) { float w = weight32[i]; if (param_clip_gradient >= 0.0f) { - mom_data[i] = param_momentum*mom_data[i] - + mshadow_op::clip::Map(param_rescale_grad - *static_cast(grad_data[i]), param_clip_gradient) - + (param_wd*w); - w = w - param_lr*(param_momentum*mom_data[i] - + mshadow_op::clip::Map(param_rescale_grad - *static_cast(grad_data[i]), - param_clip_gradient)); + mom_data[i] = param_momentum*mom_data[i]; + w = w-mom_data[i]+(param_momentum+1)*(mom_data[i]-param_lr + *(mshadow_op::clip::Map(param_rescale_grad*static_cast(grad_data[i]), + param_clip_gradient)+(param_wd*w))); weight32[i] = w; KERNEL_ASSIGN(out_data[i], req, w); } else { - mom_data[i] = param_momentum*mom_data[i] - + param_rescale_grad*static_cast(grad_data[i]) - + (param_wd*w); - w = w - param_lr*(param_momentum*mom_data[i] - + param_rescale_grad*static_cast(grad_data[i])); + mom_data[i] = param_momentum*mom_data[i]; + w = w-mom_data[i]+(param_momentum+1)*(mom_data[i]-param_lr + *(param_rescale_grad*static_cast(grad_data[i])+(param_wd*w))); weight32[i] = w; KERNEL_ASSIGN(out_data[i], req, w); } diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py index 3e6cdd0997ce..d829e6a511f9 100644 --- a/tests/python/unittest/test_optimizer.py +++ b/tests/python/unittest/test_optimizer.py @@ -385,10 +385,10 @@ def update(self, index, weight, grad, state): else: mom = state mom[:] *= self.momentum - mom[:] += grad - mom[:] += wd * weight - grad[:] += self.momentum * mom - weight[:] -= lr * grad + weight[:] -= self.momentum * mom[:] + grad += wd * weight + grad *= lr + weight[:] -= (self.momentum + 1) * grad else: grad32 = array(grad, ctx=grad.context, dtype=np.float32) grad32 = grad32 * self.rescale_grad @@ -400,10 +400,10 @@ def update(self, index, weight, grad, state): weight32[:] += -lr * (grad32 + wd * weight32) else: mom[:] *= self.momentum - mom[:] += grad32 - mom[:] += wd * weight32 - grad32[:] += self.momentum * mom - weight32[:] -= lr * grad32 + weight32[:] -= self.momentum * mom[:] + grad32 += wd * weight32 + grad32 *= lr + weight32[:] -= (self.momentum + 1) * grad32 tmp = weight32.astype(weight.dtype) tmp.copyto(weight) From d2479d662b0442ec2691a65aea8aaf596a886679 Mon Sep 17 00:00:00 2001 From: Anirudh Acharya Date: Wed, 4 Sep 2019 18:11:00 +0000 Subject: [PATCH 2/3] readable updates in unit test --- tests/python/unittest/test_optimizer.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py index d829e6a511f9..b30f0f36950b 100644 --- a/tests/python/unittest/test_optimizer.py +++ b/tests/python/unittest/test_optimizer.py @@ -384,11 +384,7 @@ def update(self, index, weight, grad, state): weight[:] += -lr * (grad + wd * weight) else: mom = state - mom[:] *= self.momentum - weight[:] -= self.momentum * mom[:] - grad += wd * weight - grad *= lr - weight[:] -= (self.momentum + 1) * grad + weight[:] += (self.momentum**2 * mom) - lr*(self.momentum + 1)*(grad + wd*weight) else: grad32 = array(grad, ctx=grad.context, dtype=np.float32) grad32 = grad32 * self.rescale_grad @@ -399,11 +395,7 @@ def update(self, index, weight, grad, state): if self.momentum == 0.0: weight32[:] += -lr * (grad32 + wd * weight32) else: - mom[:] *= self.momentum - weight32[:] -= self.momentum * mom[:] - grad32 += wd * weight32 - grad32 *= lr - weight32[:] -= (self.momentum + 1) * grad32 + weight32[:] += (self.momentum**2 * mom) - lr*(self.momentum+1)*(grad32 + wd*weight32) tmp = weight32.astype(weight.dtype) tmp.copyto(weight) From a58c8d43bbbd215f4211c3a25b14ab703bbca7ab Mon Sep 17 00:00:00 2001 From: Anirudh Acharya Date: Wed, 4 Sep 2019 21:41:27 +0000 Subject: [PATCH 3/3] mom update --- src/operator/optimizer_op-inl.h | 11 ++++++++++- tests/python/unittest/test_optimizer.py | 2 ++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index af412b7fa667..d22bd644d21f 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -1069,11 +1069,15 @@ struct NAGMomKernel { mom_data[i] = param_momentum*mom_data[i]; KERNEL_ASSIGN(out_data[i], req, weight_data[i]-mom_data[i]+(param_momentum+1) *(mom_data[i]-(param_lr*(mshadow_op::clip::Map(param_rescale_grad - *grad_data[i],param_clip_gradient)+(param_wd*weight_data[i]))))); + *grad_data[i], param_clip_gradient)+(param_wd*weight_data[i]))))); + mom_data[i] = mom_data[i] - (param_lr*((mshadow_op::clip::Map(param_rescale_grad*grad_data[i], + param_clip_gradient))+(param_wd*weight_data[i]))); } else { mom_data[i] = param_momentum*mom_data[i]; KERNEL_ASSIGN(out_data[i], req, weight_data[i]-mom_data[i]+(param_momentum+1) *(mom_data[i]-(param_lr*(param_rescale_grad*grad_data[i]+param_wd*weight_data[i])))); + mom_data[i] = mom_data[i] - param_lr*((param_rescale_grad*grad_data[i]) + +(param_wd*weight_data[i])); } } }; @@ -1116,12 +1120,17 @@ struct MP_NAGMomKernel { w = w-mom_data[i]+(param_momentum+1)*(mom_data[i]-param_lr *(mshadow_op::clip::Map(param_rescale_grad*static_cast(grad_data[i]), param_clip_gradient)+(param_wd*w))); + mom_data[i] = mom_data[i] - param_lr + *((mshadow_op::clip::Map(param_rescale_grad*static_cast(grad_data[i]), + param_clip_gradient))+(param_wd*w)); weight32[i] = w; KERNEL_ASSIGN(out_data[i], req, w); } else { mom_data[i] = param_momentum*mom_data[i]; w = w-mom_data[i]+(param_momentum+1)*(mom_data[i]-param_lr *(param_rescale_grad*static_cast(grad_data[i])+(param_wd*w))); + mom_data[i] = mom_data[i] - param_lr + *((param_rescale_grad*static_cast(grad_data[i]))+(param_wd*w)); weight32[i] = w; KERNEL_ASSIGN(out_data[i], req, w); } diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py index b30f0f36950b..cea469960f64 100644 --- a/tests/python/unittest/test_optimizer.py +++ b/tests/python/unittest/test_optimizer.py @@ -385,6 +385,7 @@ def update(self, index, weight, grad, state): else: mom = state weight[:] += (self.momentum**2 * mom) - lr*(self.momentum + 1)*(grad + wd*weight) + mom[:] = (self.momentum*mom) - lr*(grad + wd*weight) else: grad32 = array(grad, ctx=grad.context, dtype=np.float32) grad32 = grad32 * self.rescale_grad @@ -396,6 +397,7 @@ def update(self, index, weight, grad, state): weight32[:] += -lr * (grad32 + wd * weight32) else: weight32[:] += (self.momentum**2 * mom) - lr*(self.momentum+1)*(grad32 + wd*weight32) + mom[:] = (self.momentum*mom) - lr*(grad32 + wd*weight32) tmp = weight32.astype(weight.dtype) tmp.copyto(weight)