This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

[Large Tensor] Add LT support for NN optimizers and 1 activation function #17444

Merged · 4 commits · Feb 10, 2020
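The diff below is mechanical but load-bearing: every per-element optimizer kernel in optimizer_op-inl.h takes the element index as the first argument of its Map() method, and that index was previously declared as int. A 32-bit signed index cannot address elements beyond 2^31 - 1, so kernels indexed with int silently stop short on large tensors. Switching the parameter to index_t (64-bit in mshadow when MXNet is built with large-tensor/int64 support) removes that limit. The following is a minimal standalone sketch of the underlying problem, not MXNet code; the index_t alias here is an assumption standing in for mshadow::index_t.

// Minimal illustration (not MXNet code) of why the kernel Map() signatures
// move from `int i` to `index_t i`. Assumes index_t is 64-bit, as in
// mshadow when MXNet is built with large-tensor (int64) support.
#include <cstdint>
#include <iostream>

using index_t = int64_t;  // stand-in for mshadow::index_t (assumption)

int main() {
  // A tensor with more elements than a signed 32-bit int can represent.
  const int64_t num_elements = (1LL << 31) + 5;  // 2,147,483,653

  // Casting such an element index to int loses information, so a kernel
  // whose Map() takes `int i` can never address elements past 2^31 - 1.
  const int as_int = static_cast<int>(num_elements - 1);   // wraps
  const index_t as_index_t = num_elements - 1;             // exact

  std::cout << "last element via int:     " << as_int << "\n";
  std::cout << "last element via index_t: " << as_index_t << "\n";
  return 0;
}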
79 changes: 41 additions & 38 deletions src/operator/optimizer_op-inl.h
@@ -225,10 +225,10 @@ struct MultiSGDKernelParam {
template <typename MPDType, bool has_momentum, bool has_mixed_precision>
struct MultiSGDKernel {
template<typename DType>
MSHADOW_XINLINE static void Map(int i, const MultiSGDKernelParam<DType, MPDType>& param,
MSHADOW_XINLINE static void Map(index_t i, const MultiSGDKernelParam<DType, MPDType>& param,
const OpReqType req) {
for (int index = 0; index < param.count; ++index) {
if ((size_t)i < param.sizes[index]) {
if (i < static_cast<index_t>(param.sizes[index])) {
MPDType w = has_mixed_precision ? param.weights32[index][i] :
MPDType(param.weights[index][i]);
MPDType mom = has_momentum ? param.mom[index][i] : MPDType(0);
@@ -381,7 +381,7 @@ inline void MultiSGDMomUpdate(const nnvm::NodeAttrs& attrs,

struct SGDKernel {
template<typename DType>
MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* weight_data,
MSHADOW_XINLINE static void Map(index_t i, DType* out_data, const DType* weight_data,
const DType* grad_data, const DType param_clip_gradient,
const DType param_lr, const DType param_wd, const DType param_rescale_grad,
const OpReqType req) {
@@ -429,9 +429,9 @@ struct SGDDnsRspKernel<req, gpu> {
// IType is row sparse idx type
// i is the ith element in row sparse gradient
template<typename DType, typename IType>
MSHADOW_XINLINE static void Map(int i, const index_t row_length, DType* out, const DType* weight,
const IType* grad_idx, const DType *grad_val,
const DType clip_gradient, const DType lr,
MSHADOW_XINLINE static void Map(index_t i, const index_t row_length, DType* out,
const DType* weight, const IType* grad_idx,
const DType *grad_val, const DType clip_gradient, const DType lr,
const DType wd, const DType rescale_grad) {
using nnvm::dim_t;
using namespace mshadow_op;
@@ -457,9 +457,9 @@ struct SGDDnsRspKernel<req, cpu> {
// IType is row sparse idx type
// i is the ith row in row sparse gradient
template<typename DType, typename IType>
MSHADOW_XINLINE static void Map(int i, const index_t row_length, DType* out, const DType* weight,
const IType* grad_idx, const DType *grad_val,
const DType clip_gradient, const DType lr,
MSHADOW_XINLINE static void Map(index_t i, const index_t row_length, DType* out,
const DType* weight, const IType* grad_idx,
const DType *grad_val, const DType clip_gradient, const DType lr,
const DType wd, const DType rescale_grad) {
for (index_t j = 0; j < row_length; j++) {
index_t data_i = grad_idx[i] * row_length + j;
@@ -600,10 +600,11 @@ struct SGDMomParam : public dmlc::Parameter<SGDMomParam> {

struct SGDMomKernel {
template<typename DType>
MSHADOW_XINLINE static void Map(int i, DType* out_data, DType* mom_data, const DType* weight_data,
const DType* grad_data, const DType param_clip_gradient, const DType param_momentum,
const DType param_lr, const DType param_wd, const DType param_rescale_grad,
const OpReqType req) {
MSHADOW_XINLINE static void Map(index_t i, DType* out_data, DType* mom_data,
const DType* weight_data, const DType* grad_data,
const DType param_clip_gradient, const DType param_momentum,
const DType param_lr, const DType param_wd,
const DType param_rescale_grad, const OpReqType req) {
if (param_clip_gradient >= 0.0f) {
mom_data[i] = param_momentum*mom_data[i]
- param_lr*param_wd*weight_data[i]
@@ -654,7 +655,7 @@ inline bool MP_InferType(const nnvm::NodeAttrs& attrs,

struct MP_SGDKernel {
template<typename DType>
MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* weight_data,
MSHADOW_XINLINE static void Map(index_t i, DType* out_data, const DType* weight_data,
const DType* grad_data, float* weight32, const float param_clip_gradient,
const float param_lr, const float param_wd, const float param_rescale_grad,
const OpReqType req) {
@@ -698,7 +699,7 @@ inline void MP_SGDUpdate(const nnvm::NodeAttrs& attrs,

struct MP_SGDMomKernel {
template<typename DType>
MSHADOW_XINLINE static void Map(int i, DType* out_data, float* mom_data,
MSHADOW_XINLINE static void Map(index_t i, DType* out_data, float* mom_data,
const DType* weight_data, const DType* grad_data, float* weight32,
const float param_clip_gradient, const float param_momentum, const float param_lr,
const float param_wd, const float param_rescale_grad, const OpReqType req) {
@@ -749,7 +750,7 @@ struct SGDMomDnsRspDnsKernel;
template<int req>
struct SGDMomDnsRspDnsKernel<req, cpu> {
template<typename DType, typename IType>
MSHADOW_XINLINE static void Map(int i, index_t row_length, DType* out_data,
MSHADOW_XINLINE static void Map(index_t i, index_t row_length, DType* out_data,
DType* mom_data, const DType* weight_data, const IType* grad_idx,
const DType* grad_data, const DType clip_gradient, const DType momentum,
const DType lr, const DType wd, const DType rescale_grad) {
@@ -776,7 +777,7 @@ struct SGDMomDnsRspDnsKernel<req, cpu> {
template<int req>
struct SGDMomDnsRspDnsKernel<req, gpu> {
template<typename DType, typename IType>
MSHADOW_XINLINE static void Map(int i, index_t row_length, DType* out_data,
MSHADOW_XINLINE static void Map(index_t i, index_t row_length, DType* out_data,
DType* mom_data, const DType* weight_data, const IType* grad_idx,
const DType* grad_data, const DType clip_gradient, const DType momentum,
const DType lr, const DType wd, const DType rescale_grad) {
@@ -1060,7 +1061,7 @@ struct NAGMomParam : public dmlc::Parameter<NAGMomParam> {

struct NAGMomKernel {
template<typename DType>
MSHADOW_XINLINE static void Map(int i, DType* out_data, DType* mom_data,
MSHADOW_XINLINE static void Map(index_t i, DType* out_data, DType* mom_data,
const DType* weight_data, const DType* grad_data,
const DType param_clip_gradient, const DType param_momentum,
const DType param_lr, const DType param_wd,
@@ -1107,7 +1108,7 @@ inline void NAGMomUpdate(const nnvm::NodeAttrs& attrs,

struct MP_NAGMomKernel {
template<typename DType>
MSHADOW_XINLINE static void Map(int i, DType* out_data,
MSHADOW_XINLINE static void Map(index_t i, DType* out_data,
float* mom_data, const DType* weight_data,
const DType* grad_data, float* weight32,
const float param_clip_gradient,
@@ -1204,7 +1205,7 @@ struct FTMLParam : public dmlc::Parameter<FTMLParam> {

struct FTMLKernel {
template<typename DType>
MSHADOW_XINLINE static void Map(int i, DType* out, DType* weight, DType* grad,
MSHADOW_XINLINE static void Map(index_t i, DType* out, DType* weight, DType* grad,
DType* d, DType* v, DType* z, const DType lr, const DType beta1,
const DType beta2, const DType epsilon, const DType t,
const DType wd, const DType rescale_grad, const DType clip_grad,
@@ -1291,7 +1292,7 @@ struct AdamParam : public dmlc::Parameter<AdamParam> {

struct AdamUpdateKernel {
template<typename DType>
MSHADOW_XINLINE static void Map(int i, DType* out_data,
MSHADOW_XINLINE static void Map(index_t i, DType* out_data,
DType* mean_data, DType* var_data, const DType* weight_data, const DType* grad_data,
const DType clip_gradient, const DType rescale_grad,
const DType beta1, const DType beta2,
@@ -1350,7 +1351,7 @@ struct AdamDnsRspDnsKernel;
template<int req>
struct AdamDnsRspDnsKernel<req, cpu> {
template<typename DType, typename IType>
MSHADOW_XINLINE static void Map(int i, const nnvm::dim_t row_length, DType* out_data,
MSHADOW_XINLINE static void Map(index_t i, const nnvm::dim_t row_length, DType* out_data,
DType* mean_data, DType* var_data, const DType* weight_data, const IType* grad_idx,
const DType* grad_data, const DType clip_gradient, const DType beta1, const DType beta2,
const DType lr, const DType wd, const DType epsilon, const DType rescale_grad) {
@@ -1383,7 +1384,7 @@ struct AdamDnsRspDnsKernel<req, cpu> {
template<int req>
struct AdamDnsRspDnsKernel<req, gpu> {
template<typename DType, typename IType>
MSHADOW_XINLINE static void Map(int i, const nnvm::dim_t row_length, DType* out_data,
MSHADOW_XINLINE static void Map(index_t i, const nnvm::dim_t row_length, DType* out_data,
DType* mean_data, DType* var_data, const DType* weight_data, const IType* grad_idx,
const DType* grad_data, const DType clip_gradient, const DType beta1, const DType beta2,
const DType lr, const DType wd, const DType epsilon, const DType rescale_grad) {
@@ -1620,7 +1621,7 @@ struct LambUpdatePhaseTwoParam : public dmlc::Parameter<LambUpdatePhaseTwoParam>

struct LambUpdatePhaseOneKernel {
template<typename DType>
MSHADOW_XINLINE static void Map(int i, DType* out_data,
MSHADOW_XINLINE static void Map(index_t i, DType* out_data,
DType* mean_data, DType* var_data, const DType* weight_data, const DType* grad_data,
const DType clip_gradient, const DType rescale_grad,
const DType beta1, const DType beta1_t, const DType beta2, const DType beta2_t,
@@ -1704,7 +1705,7 @@ inline bool LambUpdatePhaseTwoShape(const nnvm::NodeAttrs& attrs,

struct LambUpdatePhaseTwoKernel {
template<typename DType>
MSHADOW_XINLINE static void Map(int i, DType* out_data,
MSHADOW_XINLINE static void Map(index_t i, DType* out_data,
const DType* weight_data, const DType* g,
const DType* r1, const DType* r2,
DType lr, const DType lower_bound,
@@ -1771,7 +1772,7 @@ inline bool MPLambPhaseOneType(const nnvm::NodeAttrs& attrs,

struct MPLambUpdatePhaseOneKernel {
template<typename DType>
MSHADOW_XINLINE static void Map(int i, float* out_data,
MSHADOW_XINLINE static void Map(index_t i, float* out_data,
float* mean_data, float* var_data, const DType* weight_data,
const DType* grad_data, const float* weight32_data,
const float clip_gradient, const float rescale_grad,
@@ -1861,7 +1862,7 @@ inline bool MPLambUpdatePhaseTwoShape(const nnvm::NodeAttrs& attrs,

struct MPLambUpdatePhaseTwoKernel {
template<typename DType>
MSHADOW_XINLINE static void Map(int i, DType* out_data,
MSHADOW_XINLINE static void Map(index_t i, DType* out_data,
const DType* weight_data, const float* g,
const float* r1, const float* r2, const float* weight32_data,
float lr, const float lower_bound,
@@ -1952,7 +1953,7 @@ struct RMSPropAlexParam : public dmlc::Parameter<RMSPropAlexParam> {

struct RMSPropAlexUpdateKernel {
template<typename DType>
MSHADOW_XINLINE static void Map(int i, DType* out_data,
MSHADOW_XINLINE static void Map(index_t i, DType* out_data,
DType* state_n_data, DType* state_g_data, DType* delta_data,
const DType* weight_data, const DType* grad_data,
const DType clip_gradient, const DType rescale_grad,
@@ -2051,7 +2052,7 @@ struct RMSPropParam : public dmlc::Parameter<RMSPropParam> {

struct RMSPropUpdateKernel {
template<typename DType>
MSHADOW_XINLINE static void Map(int i,
MSHADOW_XINLINE static void Map(index_t i,
DType* out_data, DType* state_n_data,
const DType* weight_data, const DType* grad_data,
const DType clip_gradient, const DType rescale_grad,
@@ -2132,7 +2133,7 @@ struct FtrlParam : public dmlc::Parameter<FtrlParam> {

struct FtrlUpdateKernel {
template<typename DType>
MSHADOW_XINLINE static void Map(int i, DType* out_data,
MSHADOW_XINLINE static void Map(index_t i, DType* out_data,
DType* n_data, DType* z_data, const DType* weight_data, const DType* grad_data,
const DType clip_gradient, const DType rescale_grad,
const DType beta, const DType lamda1,
@@ -2185,7 +2186,7 @@ inline void FtrlUpdate(const nnvm::NodeAttrs& attrs,
template<int req>
struct FtrlDnsRspDnsKernel {
template<typename DType, typename IType>
MSHADOW_XINLINE static void Map(int i, const nnvm::dim_t row_length, DType* out_data,
MSHADOW_XINLINE static void Map(index_t i, const nnvm::dim_t row_length, DType* out_data,
DType* z_data, DType* n_data, const DType* weight_data, const IType* grad_idx,
const DType* grad_data, const DType clip_gradient, const DType lamda1, const DType beta,
const DType lr, const DType wd, const DType rescale_grad) {
@@ -2343,7 +2344,7 @@ struct SignSGDParam : public dmlc::Parameter<SignSGDParam> {

struct SignSGDKernel {
template<typename DType>
MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* weight_data,
MSHADOW_XINLINE static void Map(index_t i, DType* out_data, const DType* weight_data,
const DType* grad_data, const DType param_clip_gradient,
const DType param_lr, const DType param_wd, const DType param_rescale_grad,
const OpReqType req) {
@@ -2411,10 +2412,12 @@ struct SignumParam : public dmlc::Parameter<SignumParam> {

struct SignumKernel {
template<typename DType>
MSHADOW_XINLINE static void Map(int i, DType* out_data, DType* mom_data, const DType* weight_data,
const DType* grad_data, const DType param_clip_gradient, const DType param_momentum,
const DType param_lr, const DType param_wd, const DType param_rescale_grad,
const DType param_wd_lh, const OpReqType req) {
MSHADOW_XINLINE static void Map(index_t i, DType* out_data, DType* mom_data,
const DType* weight_data, const DType* grad_data,
const DType param_clip_gradient, const DType param_momentum,
const DType param_lr, const DType param_wd,
const DType param_rescale_grad, const DType param_wd_lh,
const OpReqType req) {
if (param_clip_gradient >= 0.0f) {
mom_data[i] = param_momentum*mom_data[i]
- (1-param_momentum)*param_wd*weight_data[i]
@@ -2506,7 +2509,7 @@ struct AdagradDnsRspDnsKernel;
template<>
struct AdagradDnsRspDnsKernel<cpu> {
template<typename DType, typename IType>
MSHADOW_XINLINE static void Map(int i, index_t row_length, DType* out_data,
MSHADOW_XINLINE static void Map(index_t i, index_t row_length, DType* out_data,
DType* state_data, const DType* weight_data, const IType* grad_idx,
const DType* grad_data, const DType clip_gradient, const DType epsilon,
const DType lr, const DType rescale_grad) {
@@ -2533,7 +2536,7 @@ struct AdagradDnsRspDnsKernel<cpu> {
template<>
struct AdagradDnsRspDnsKernel<gpu> {
template<typename DType, typename IType>
MSHADOW_XINLINE static void Map(int i, index_t row_length, DType* out_data,
MSHADOW_XINLINE static void Map(index_t i, index_t row_length, DType* out_data,
DType* state_data, const DType* weight_data, const IType* grad_idx,
const DType* grad_data, const DType clip_gradient, const DType epsilon,
const DType lr, const DType rescale_grad) {
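For context on how these kernels are consumed: each Map() above is called once per element by MXNet's element-wise kernel launcher, so the launcher's loop variable and Map()'s first parameter must both be wide enough for the tensor size. The sketch below imitates that pattern under stated assumptions; the real mxnet_op::Kernel<OP, xpu>::Launch adds OpenMP and GPU dispatch, so treat this as an illustration rather than the actual implementation.

// Simplified sketch of the element-wise launch pattern the kernels in this
// file rely on. Not the real mxnet_op::Kernel; it only shows why the type
// of the index threaded through Launch() and Map() matters.
#include <cstdint>
#include <vector>

using index_t = int64_t;  // assumed 64-bit, as with large-tensor builds

template <typename OP>
struct SimpleKernel {
  template <typename... Args>
  static void Launch(index_t N, Args... args) {
    // With a 64-bit loop variable, every i handed to OP::Map() is exact,
    // even when N exceeds 2^31 - 1.
    for (index_t i = 0; i < N; ++i) {
      OP::Map(i, args...);
    }
  }
};

// Toy SGD-style update with the same Map() shape as the kernels above
// (hypothetical example, not one of the operators in this PR).
struct ToySGDKernel {
  static void Map(index_t i, float* out, const float* weight,
                  const float* grad, float lr) {
    out[i] = weight[i] - lr * grad[i];
  }
};

int main() {
  std::vector<float> w(10, 1.0f), g(10, 0.5f), out(10);
  SimpleKernel<ToySGDKernel>::Launch(static_cast<index_t>(w.size()),
                                     out.data(), w.data(), g.data(), 0.1f);
  return 0;
}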
4 changes: 2 additions & 2 deletions src/operator/tensor/elemwise_unary_op.h
@@ -495,7 +495,7 @@ struct HardSigmoidParam : public dmlc::Parameter<HardSigmoidParam> {
template<int req>
struct hard_sigmoid_forward {
template<typename DType>
MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* in_data,
MSHADOW_XINLINE static void Map(index_t i, DType* out_data, const DType* in_data,
const real_t alpha, const real_t beta) {
DType result = DType(alpha * in_data[i] + beta);
result = (DType(1) < result) ? DType(1) : result;
@@ -507,7 +507,7 @@ struct hard_sigmoid_forward {
template<int req>
struct hard_sigmoid_backward {
template<typename DType>
MSHADOW_XINLINE static void Map(int i, DType* in_grad, const DType* in_data,
MSHADOW_XINLINE static void Map(index_t i, DType* in_grad, const DType* in_data,
const DType* out_grad, const real_t alpha, const real_t beta) {
DType out_val = DType(alpha) * in_data[i] + DType(beta);
DType grad = (out_val > DType(0) && out_val < DType(1)) ?
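The hard_sigmoid change above covers the one activation-function kernel in this PR. For reference, the forward pass computes y = clip(alpha * x + beta, 0, 1), and the backward pass lets alpha * out_grad through only where the pre-clip value lies strictly inside (0, 1), as in the kernels above. A standalone sketch of that arithmetic, using alpha = 0.2 and beta = 0.5 as assumed defaults:

// Standalone sketch of the hard-sigmoid forward/backward arithmetic used by
// the kernels above. alpha = 0.2f and beta = 0.5f are assumed defaults here.
#include <iostream>

float hard_sigmoid(float x, float alpha, float beta) {
  float y = alpha * x + beta;
  if (y > 1.0f) y = 1.0f;   // clamp to [0, 1], as in hard_sigmoid_forward
  if (y < 0.0f) y = 0.0f;
  return y;
}

float hard_sigmoid_grad(float x, float out_grad, float alpha, float beta) {
  float y = alpha * x + beta;
  // Gradient flows only where the pre-clamp value lies strictly in (0, 1).
  return (y > 0.0f && y < 1.0f) ? alpha * out_grad : 0.0f;
}

int main() {
  const float alpha = 0.2f, beta = 0.5f;
  for (float x : {-4.0f, 0.0f, 4.0f}) {
    std::cout << "x=" << x
              << " y=" << hard_sigmoid(x, alpha, beta)
              << " dy/dx=" << hard_sigmoid_grad(x, 1.0f, alpha, beta) << "\n";
  }
  return 0;
}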