diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py
index d290a3f2fea2..6ffbbcffc384 100644
--- a/python/mxnet/optimizer/optimizer.py
+++ b/python/mxnet/optimizer/optimizer.py
@@ -978,10 +978,10 @@ def update(self, index, weight, grad, state):
         if state is not None:
             mom = state
             mom[:] *= self.momentum
-            grad += wd * weight
             mom[:] += grad
+            mom[:] += wd * weight
             grad[:] += self.momentum * mom
-            weight[:] += -lr * grad
+            weight[:] -= lr * grad
         else:
             assert self.momentum == 0.0
             weight[:] += -lr * (grad + wd * weight)
diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index 935bd9ab1823..3fdd1cd6bb87 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -385,10 +385,10 @@ def update(self, index, weight, grad, state):
             else:
                 mom = state
                 mom[:] *= self.momentum
-                grad += wd * weight
                 mom[:] += grad
+                mom[:] += wd * weight
                 grad[:] += self.momentum * mom
-                weight[:] += -lr * grad
+                weight[:] -= lr * grad
         else:
             grad32 = array(grad, ctx=grad.context, dtype=np.float32)
             grad32 = grad32 * self.rescale_grad
@@ -400,10 +400,10 @@ def update(self, index, weight, grad, state):
                 weight32[:] += -lr * (grad32 + wd * weight32)
             else:
                 mom[:] *= self.momentum
-                grad32 += wd * weight32
                 mom[:] += grad32
+                mom[:] += wd * weight32
                 grad32[:] += self.momentum * mom
-                weight32[:] += -lr * grad32
+                weight32[:] -= lr * grad32
             tmp = weight32.astype(weight.dtype)
             tmp.copyto(weight)
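
Note: the patch changes two things in the Nesterov-momentum update. First, weight decay
is folded into the momentum buffer after the raw gradient instead of being added to
`grad` up front, so the in-place mutation of `grad` no longer leaks the decay term into
the lookahead step. Second, `weight[:] += -lr * grad` is rewritten as the equivalent
but clearer `weight[:] -= lr * grad`. Below is a minimal NumPy sketch of the corrected
order; the standalone function name and signature are illustrative, not part of the patch:

    import numpy as np

    def nag_step(weight, grad, mom, lr, wd, momentum):
        # In-place Nesterov-momentum step, mirroring the patched order above.
        mom *= momentum
        mom += grad                   # raw gradient enters the buffer first
        mom += wd * weight            # weight decay enters via the buffer only
        step = grad + momentum * mom  # lookahead gradient
        weight -= lr * step           # same result as the old += -lr * step

    # Example: one step on a toy parameter vector.
    w = np.array([1.0, -2.0])
    g = np.array([0.1, 0.3])
    m = np.zeros_like(w)
    nag_step(w, g, m, lr=0.01, wd=1e-4, momentum=0.9)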