From 4328ad4468b247eef6a1505405d0e6a45c57ded0 Mon Sep 17 00:00:00 2001
From: dhc <8158859858@qq.com>
Date: Mon, 4 Dec 2017 00:44:11 -0800
Subject: [PATCH] Add new update method for Adam

---
 python/mxnet/optimizer.py       |  5 +++--
 src/operator/convolution.cu     |  2 +-
 src/operator/optimizer_op-inl.h | 24 ++++++++++++++++++++----
 3 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index 1ef9cc845036..dee09bf386a9 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -555,12 +555,13 @@ class Adam(Optimizer):
     epsilon : float, optional
         Small value to avoid division by 0.
     """
-    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
+    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, use_tusimple_update=False,
                  **kwargs):
         super(Adam, self).__init__(learning_rate=learning_rate, **kwargs)
         self.beta1 = beta1
         self.beta2 = beta2
         self.epsilon = epsilon
+        self.use_tusimple_update = use_tusimple_update
 
     def create_state(self, index, weight):
         return (zeros(weight.shape, weight.context, dtype=weight.dtype),  # mean
@@ -579,7 +580,7 @@ def update(self, index, weight, grad, state):
             lr *= math.sqrt(coef2)/coef1
 
         kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon,
-                  'rescale_grad': self.rescale_grad}
+                  'rescale_grad': self.rescale_grad, 'use_tusimple_update': self.use_tusimple_update}
         if self.clip_gradient:
             kwargs['clip_gradient'] = self.clip_gradient
 
diff --git a/src/operator/convolution.cu b/src/operator/convolution.cu
index f5777c1714a4..f3204eb312cb 100644
--- a/src/operator/convolution.cu
+++ b/src/operator/convolution.cu
@@ -103,7 +103,7 @@ Operator* CreateOp<gpu>(ConvolutionParam param, int dtype,
                                               backward_compute_type, ctx);
   }
   if (!convolutionIsSupported) {
-    LOG(WARNING) << "This convolution is not supported by cudnn, MXNET convolution is applied.";
+    // LOG(WARNING) << "This convolution is not supported by cudnn, MXNET convolution is applied.";
     op = new ConvolutionOp<DType>(param);
   } else {
     if (forward_compute_type != desired_forward_compute_type)
diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h
index 70759b15251a..aa4af9edcba1 100644
--- a/src/operator/optimizer_op-inl.h
+++ b/src/operator/optimizer_op-inl.h
@@ -283,6 +283,7 @@ struct AdamParam : public dmlc::Parameter<AdamParam> {
   float wd;
   float rescale_grad;
   float clip_gradient;
+  bool use_tusimple_update;
   DMLC_DECLARE_PARAMETER(AdamParam) {
     DMLC_DECLARE_FIELD(lr)
     .describe("Learning rate");
@@ -308,6 +309,9 @@ struct AdamParam : public dmlc::Parameter<AdamParam> {
     .describe("Clip gradient to the range of [-clip_gradient, clip_gradient] "
               "If clip_gradient <= 0, gradient clipping is turned off. "
               "grad = max(min(grad, clip_gradient), -clip_gradient).");
+    DMLC_DECLARE_FIELD(use_tusimple_update)
+    .set_default(false)
+    .describe("If true, decouple weight decay from the gradient used to update mean and var.");
   }
 };
 
@@ -328,10 +332,13 @@ inline void AdamUpdate(const nnvm::NodeAttrs& attrs,
     Tensor<xpu, 2, DType> mean = inputs[2].FlatTo2D<xpu, DType>(s);
     Tensor<xpu, 2, DType> var = inputs[3].FlatTo2D<xpu, DType>(s);
     Tensor<xpu, 2, DType> out = outputs[0].FlatTo2D<xpu, DType>(s);
-
-    grad = scalar<DType>(param.rescale_grad) * grad +
-      scalar<DType>(param.wd) * weight;
-
+    if (!param.use_tusimple_update) {
+      grad = scalar<DType>(param.rescale_grad) * grad +
+        scalar<DType>(param.wd) * weight;
+    }
+    else {
+      grad = scalar<DType>(param.rescale_grad) * grad;
+    }
     if (param.clip_gradient >= 0.0f) {
       mean = scalar<DType>(param.beta1)*mean + scalar<DType>(1.f-param.beta1) *
           F<clip>(grad, DType(param.clip_gradient));
@@ -341,10 +348,19 @@ inline void AdamUpdate(const nnvm::NodeAttrs& attrs,
       mean = scalar<DType>(param.beta1)*mean + scalar<DType>(1.f-param.beta1) * grad;
       var = scalar<DType>(param.beta2)*var + scalar<DType>(1.f-param.beta2) * F<square>(grad);
     }
+    if (!param.use_tusimple_update) {
     Assign(out, req[0],
            weight -
            scalar<DType>(param.lr) * mean /
           (F<square_root>(var) + scalar<DType>(param.epsilon)));
+    }
+    else {
+      Assign(out, req[0],
+             scalar<DType>(1.f - param.lr*param.wd) * weight -
+             scalar<DType>(param.lr) * mean /
+             (F<square_root>(var) + scalar<DType>(param.epsilon)));
+
+    }
   });
 }