From 47da99f38a8101a797ccdf37fd3872bdd3dc202f Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Fri, 25 May 2018 18:26:46 +0000 Subject: [PATCH 1/4] Proximal Group Adagrad optimizer --- docs/api/python/index.md | 1 + docs/api/python/optimization/contrib.md | 52 +++ python/mxnet/optimizer/__init__.py | 23 ++ python/mxnet/optimizer/contrib.py | 145 ++++++++ python/mxnet/{ => optimizer}/optimizer.py | 20 +- python/mxnet/test_utils.py | 41 +++ src/operator/contrib/optimizer_op-inl.h | 323 ++++++++++++++++++ src/operator/contrib/optimizer_op.cc | 94 +++++ src/operator/contrib/optimizer_op.cu | 36 ++ .../python/unittest/test_contrib_optimizer.py | 122 +++++++ tests/python/unittest/test_optimizer.py | 37 -- 11 files changed, 850 insertions(+), 44 deletions(-) create mode 100644 docs/api/python/optimization/contrib.md create mode 100644 python/mxnet/optimizer/__init__.py create mode 100644 python/mxnet/optimizer/contrib.py rename python/mxnet/{ => optimizer}/optimizer.py (98%) create mode 100644 src/operator/contrib/optimizer_op-inl.h create mode 100644 src/operator/contrib/optimizer_op.cc create mode 100644 src/operator/contrib/optimizer_op.cu create mode 100644 tests/python/unittest/test_contrib_optimizer.py diff --git a/docs/api/python/index.md b/docs/api/python/index.md index 8f60bcd0f13c..de86aedff691 100644 --- a/docs/api/python/index.md +++ b/docs/api/python/index.md @@ -136,6 +136,7 @@ Code examples are placed throughout the API documentation and these can be run a :maxdepth: 1 optimization/optimization.md + optimization/contrib.md ``` ## Profiler API diff --git a/docs/api/python/optimization/contrib.md b/docs/api/python/optimization/contrib.md new file mode 100644 index 000000000000..9d3f3483113e --- /dev/null +++ b/docs/api/python/optimization/contrib.md @@ -0,0 +1,52 @@ +# Contrib Optimization API + +```eval_rst + .. currentmodule:: mxnet.optimizer.contrib +``` + +## Overview + +This document summaries the contrib APIs used to initialize and update the model +weights during training + +```eval_rst +.. autosummary:: + :nosignatures: + + mxnet.optimizer.contrib +``` + +The `Contrib Optimization` API, defined in the `optimizer.contrib` package, provides +many useful experimental APIs for new features. +This is a place for the community to try out the new features, +so that feature contributors can receive feedback. + +```eval_rst +.. warning:: This package contains experimental APIs and may change in the near future. +``` + +In the rest of this document, we list routines provided by the `optimizer.contrib` package. + +## Contrib + +```eval_rst +.. currentmodule:: mxnet.optimizer.contrib + +.. autosummary:: + :nosignatures: + + ProximalGroupAdaGrad +``` + +## API Reference + + + +```eval_rst + +.. automodule:: mxnet.optimizer.contrib + :members: + +``` + + diff --git a/python/mxnet/optimizer/__init__.py b/python/mxnet/optimizer/__init__.py new file mode 100644 index 000000000000..4840413ccaa6 --- /dev/null +++ b/python/mxnet/optimizer/__init__.py @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Optimizer API of MXNet.""" + +from . import optimizer, contrib +from .optimizer import * +# pylint: enable=wildcard-import + +__all__ = optimizer.__all__ + ['contrib'] diff --git a/python/mxnet/optimizer/contrib.py b/python/mxnet/optimizer/contrib.py new file mode 100644 index 000000000000..8cf48261036e --- /dev/null +++ b/python/mxnet/optimizer/contrib.py @@ -0,0 +1,145 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""Contrib optimizers.""" +from ..ndarray import (NDArray, clip, contrib, full, mean, norm, sparse, sqrt, + square, zeros) +from .optimizer import Optimizer + +# convenience wrapper for Optimizer.Register +register = Optimizer.register # pylint: disable=invalid-name + +__all__ = ['ProximalGroupAdaGrad'] + + +@register +class ProximalGroupAdaGrad(Optimizer): + """Proximal Adagrad optimizer with row-wise learning rates. + + This class implements the AdaGrad optimizer described in *Adaptive + Subgradient Methods for Online Learning and Stochastic Optimization*, and + available at http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf but + uses only a single learning rate for every row of the parameter array. + + This optimizer updates each weight by:: + + grad = clip(grad * rescale_grad, clip_gradient) + history += mean(square(grad), axis=1, keepdims=True) + div = grad / sqrt(history + float_stable_eps) + weight -= div * lr + + If `l2_regularization_strength > 0` a proximal operator is used to optimize + with group lasso objective. Weights are updated lazily if the gradient is + sparse. In particular, before using a set of weights for a forward pass, + you may want to ensure that the lazily accumulated group lasso + regularization is applied. This can be achieved by creating a sparse + gradient array that contains explicit 0 data for the indices to be updated: + + fake_grad = mx.nd.sparse.row_sparse_array( + (mx.nd.zeros((len(indices), dim)), indices)) + weight.grad()[:] = fake_grad + weight.data()._fresh_grad = True + trainer._optimizer._index_update_count[0] -= 1 + trainer._optimizer.num_update -= 1 + trainer.step(batch_size=1) + + For details of the update algorithm see + :class:`~mxnet.ndarray.contrib.proximal_group_adagrad_update`. + + This optimizer accepts the following parameters in addition to those + accepted by :class:`.Optimizer`. Weight decay is not supported. 
+ + Parameters + ---------- + l2_regularization_strength : float + Strength of group lasso L2 regularization. + eps: float, optional + Initial value of the history accumulator. Avoids division by 0. + + """ + + def __init__(self, l2_regularization_strength=0.0, eps=1e-5, **kwargs): + super(ProximalGroupAdaGrad, self).__init__(**kwargs) + self.l2_regularization_strength = l2_regularization_strength + self.float_stable_eps = eps + + def create_state(self, index, weight): + assert len(weight.shape) == 2 + history = zeros( + (weight.shape[0], 1), weight.context, stype=weight.stype) + last_update = None + if self.l2_regularization_strength > 0: + last_update = full( + shape=(weight.shape[0], ), + val=self.num_update, + ctx=weight.context) + else: + last_update = zeros(1, ctx=weight.context) + return (history, last_update) + + def update(self, index, weight, grad, state): + assert (isinstance(weight, NDArray)) + assert (isinstance(grad, NDArray)) + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + assert wd == 0, 'Weight decay is not supported for ProximalGroupAdaGrad' + + is_sparse = grad.stype == 'row_sparse' + history = state[0] + last_update = state[1] + if is_sparse: + kwargs = { + 'epsilon': self.float_stable_eps, + 'rescale_grad': self.rescale_grad + } + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + if self.l2_regularization_strength: + kwargs['l2_regularization_strength'] = \ + self.l2_regularization_strength + contrib.proximal_group_adagrad_update( + weight, + grad, + history, + out=weight, + last_update=last_update, + lr=lr, + current_update=self.num_update, + **kwargs) + elif self.l2_regularization_strength > 0: + grad = grad * self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + history[:] += mean(square(grad), axis=1, keepdims=True) + div = lr * grad / sqrt(history + self.float_stable_eps) + num_skipped = (self.num_update - last_update).expand_dims(1) + scaled_l2 = lr / sqrt(history + self.float_stable_eps) \ + * self.l2_regularization_strength * num_skipped + nrm = norm(weight - div, ord=2, axis=1, keepdims=True) + weight[:] = (weight - div) * (1 - scaled_l2 / nrm) + weight[:] *= nrm > scaled_l2 + last_update[:] = self.num_update + else: + grad = grad * self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + history[:] += mean(square(grad), axis=1, keepdims=True) + div = lr * grad / sqrt(history + self.float_stable_eps) + weight[:] -= div diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer/optimizer.py similarity index 98% rename from python/mxnet/optimizer.py rename to python/mxnet/optimizer/optimizer.py index b69d0c9af0dc..8f9cf366f09b 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer/optimizer.py @@ -23,13 +23,19 @@ import pickle import warnings import numpy -from .base import py_str -from .ndarray import (NDArray, zeros, clip, sqrt, cast, maximum, abs as NDabs, array, multiply) -from .ndarray import (sgd_update, sgd_mom_update, adam_update, rmsprop_update, rmspropalex_update, - mp_sgd_update, mp_sgd_mom_update, square, ftrl_update, ftml_update, - signsgd_update, signum_update) -from .ndarray import sparse -from .random import normal +from ..base import py_str +from ..ndarray import (NDArray, zeros, clip, sqrt, cast, maximum, abs as NDabs, array, multiply) +from ..ndarray import (sgd_update, sgd_mom_update, adam_update, rmsprop_update, rmspropalex_update, + 
mp_sgd_update, mp_sgd_mom_update, square, ftrl_update, ftml_update, + signsgd_update, signum_update) +from ..ndarray import sparse +from ..random import normal + +__all__ = [ + 'AdaDelta', 'AdaGrad', 'Adam', 'Adamax', 'DCASGD', 'FTML', 'Ftrl', 'LBSGD', + 'NAG', 'NDArray', 'NDabs', 'Nadam', 'Optimizer', 'RMSProp', 'SGD', 'SGLD', + 'Signum', 'Test', 'Updater', 'ccSGD', 'create', 'get_updater', 'register' +] class Optimizer(object): diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index c555b2fdfaf8..0bb28a0ef13a 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -1957,3 +1957,44 @@ def verify_generator(generator, buckets, probs, nsamples=1000000, nrepeat=5, suc % (str(cs_ret_l), str(obs_freq_l), str(expected_freq_l), str(buckets), str(probs))) return cs_ret_l + +def compare_ndarray_tuple(t1, t2, rtol=None, atol=None): + """Compare ndarray tuple.""" + if t1 is not None and t2 is not None: + if isinstance(t1, tuple): + for s1, s2 in zip(t1, t2): + compare_ndarray_tuple(s1, s2, rtol, atol) + else: + assert_almost_equal(t1.asnumpy(), t2.asnumpy(), rtol=rtol, atol=atol) + + +def compare_optimizer(opt1, opt2, shape, dtype, w_stype='default', g_stype='default', + rtol=1e-4, atol=1e-5, compare_states=True): + """Compare opt1 and opt2.""" + if w_stype == 'default': + w2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) + w1 = w2.copyto(default_context()) + elif w_stype == 'row_sparse' or w_stype == 'csr': + w2 = rand_ndarray(shape, w_stype, density=1, dtype=dtype) + w1 = w2.copyto(default_context()).tostype('default') + else: + raise Exception("type not supported yet") + if g_stype == 'default': + g2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) + g1 = g2.copyto(default_context()) + elif g_stype == 'row_sparse' or g_stype == 'csr': + g2 = rand_ndarray(shape, g_stype, dtype=dtype) + g1 = g2.copyto(default_context()).tostype('default') + else: + raise Exception("type not supported yet") + + state1 = opt1.create_state_multi_precision(0, w1) + state2 = opt2.create_state_multi_precision(0, w2) + if compare_states: + compare_ndarray_tuple(state1, state2) + + opt1.update_multi_precision(0, w1, g1, state1) + opt2.update_multi_precision(0, w2, g2, state2) + if compare_states: + compare_ndarray_tuple(state1, state2, rtol=rtol, atol=atol) + assert_almost_equal(w1.asnumpy(), w2.asnumpy(), rtol=rtol, atol=atol) diff --git a/src/operator/contrib/optimizer_op-inl.h b/src/operator/contrib/optimizer_op-inl.h new file mode 100644 index 000000000000..0bbe9cf7d1f4 --- /dev/null +++ b/src/operator/contrib/optimizer_op-inl.h @@ -0,0 +1,323 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2018 by Contributors + * \file optimizer_op-inl.h + * \brief Optimizer operators + * \author Leonard Lausen + */ +#ifndef MXNET_OPERATOR_CONTRIB_OPTIMIZER_OP_INL_H_ +#define MXNET_OPERATOR_CONTRIB_OPTIMIZER_OP_INL_H_ +#include +#include +#include +#include +#include +#include +#include +#include +#include "../elemwise_op_common.h" +#include "../mshadow_op.h" +#include "../mxnet_op.h" +#include "../operator_common.h" +#include "../tensor/init_op.h" +#include "../tensor/util/tensor_util-inl.h" + +namespace mxnet { +namespace op { + +struct ProximalGroupAdagradParam + : public dmlc::Parameter { + float lr; + float epsilon; + float rescale_grad; + float clip_gradient; + float l2_regularization_strength; + float current_update; + DMLC_DECLARE_PARAMETER(ProximalGroupAdagradParam) { + DMLC_DECLARE_FIELD(lr).describe("Learning rate"); + DMLC_DECLARE_FIELD(rescale_grad) + .set_default(1.0f) + .describe("Rescale gradient to grad = rescale_grad*grad."); + DMLC_DECLARE_FIELD(clip_gradient) + .set_default(-1.0f) + .describe( + "Clip gradient to the range of [-clip_gradient, clip_gradient] " + "If clip_gradient <= 0, gradient clipping is turned off. " + "grad = max(min(grad, clip_gradient), -clip_gradient)."); + DMLC_DECLARE_FIELD(l2_regularization_strength) + .set_default(0.0f) + .describe("Lambda term for group lasso objective."); + DMLC_DECLARE_FIELD(epsilon).set_default(1.0e-5).describe( + "Epsilon for numerical stability"); + DMLC_DECLARE_FIELD(current_update) + .set_default(0.0f) + .describe("Current update iteration for lazy update with group lasso " + "objective."); + } +}; + +inline bool ProximalGroupAdagradStorageType(const nnvm::NodeAttrs &attrs, + const int dev_mask, + DispatchMode *dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 4U); + CHECK_EQ(out_attrs->size(), 1U); + const int weight_stype = in_attrs->at(0); + const int grad_stype = in_attrs->at(1); + const int state_stype = in_attrs->at(2); + const int counter_stype = in_attrs->at(3); + bool dispatched = false; + if (!dispatched && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) { + // dns, ... -> dns + dispatched = storage_type_assign(out_attrs, kDefaultStorage, dispatch_mode, + DispatchMode::kFCompute); + } + if (!dispatched && grad_stype == kRowSparseStorage && + counter_stype == kDefaultStorage && + (weight_stype == kRowSparseStorage || weight_stype == kDefaultStorage) && + state_stype == weight_stype) { + // weight and state share stype, grad's stype = rsp + dispatched = storage_type_assign( + out_attrs, static_cast(weight_stype), dispatch_mode, + DispatchMode::kFComputeEx); + } + return dispatched; +} + +/*! 
\brief kernel for sparse adagrad update with group sparsity regularization + */ +template struct ProximalGroupAdagradDnsRspKernel { + template + MSHADOW_XINLINE static void + Map(int i, const index_t row_length, DType *out_data, DType *state_data, + DType *weight_data, const IType *grad_idx, const DType *grad_data, + DType *last_update_data, const DType current_update, + const DType clip_gradient, const DType rescale_grad, + const DType l2_regularization_strength, const DType lr, const DType eps) { + using namespace mshadow_op; + + // Helper to obtain index into weight / state arrays + auto get_data_j = [&i, &grad_idx, &row_length](index_t j) -> index_t { + return grad_idx[i] * row_length + j; + }; + // Helper to obtain explicit rescaled and clipped grad + auto get_grad_rescaled = [&i, &row_length, &grad_data, &rescale_grad, + &clip_gradient](index_t j) -> DType { + index_t grad_j = i * row_length + j; + DType grad_rescaled = grad_data[grad_j] * rescale_grad; + if (clip_gradient >= 0.0f) { + grad_rescaled = clip::Map(grad_rescaled, clip_gradient); + } + return grad_rescaled; + }; + + // Update history states + DType grad_ssq = 0; + for (index_t j = 0; j < row_length; j++) { + const DType grad_rescaled = get_grad_rescaled(j); + grad_ssq += grad_rescaled * grad_rescaled; + } + state_data[grad_idx[i]] += grad_ssq / row_length; + + // Number of weight updates skipped due to lazy_update + DType delay{0}; + if (l2_regularization_strength > 0) { + // last_update_data[grad_idx[i]] is only valid if + // l2_regularization_strength > 0. Otherwise may be out of bounds read. + delay = current_update - last_update_data[grad_idx[i]]; + last_update_data[grad_idx[i]] = current_update; + } + + if (l2_regularization_strength <= 0 || delay < 0) { + if (delay < 0) { + std::printf("Got invalid last_update in proximal_adagrad_update. 
" + "Using standard Adagrad update.\n"); + } + + // Standard Adagrad Update + for (index_t j = 0; j < row_length; j++) { + // clang-format off + const DType grad_rescaled = get_grad_rescaled(j); + index_t data_j = get_data_j(j); + const DType div = lr * grad_rescaled / square_root::Map(state_data[grad_idx[i]] + eps); + out_data[data_j] = weight_data[data_j] - div; + // clang-format on + } + } else { + // Compute L2 norm of updated parameter using scaled sum of squares + DType norm, scale; + mshadow_op::nrm2::SetInitValue(norm, scale); + for (index_t j = 0; j < row_length; j++) { + const DType grad_rescaled = get_grad_rescaled(j); + index_t data_j = get_data_j(j); + const DType val = + (weight_data[data_j] - + lr / std::sqrt(state_data[grad_idx[i]] + eps) * grad_rescaled); + mshadow_op::nrm2::Reduce(norm, val, scale); + } + mshadow_op::nrm2::Finalize(norm, scale); + + // Compute regularization lambda + DType lambda = l2_regularization_strength * lr / + square_root::Map(state_data[grad_idx[i]] + eps); + DType l2_scale = 1 - lambda / norm; + if (l2_scale < 0) { + l2_scale = 0; + } else if (l2_scale > 0) { + scale = math::pow(scale, delay); + } + + if (l2_scale == 0) { + // Soft threshold weights (proximal map for group lasso) + for (index_t j = 0; j < row_length; j++) { + index_t data_j = get_data_j(j); + out_data[data_j] = 0; + } + } else { + for (index_t j = 0; j < row_length; j++) { + // clang-format off + const DType grad_rescaled = get_grad_rescaled(j); + index_t data_j = get_data_j(j); + const DType div = lr * grad_rescaled / square_root::Map(state_data[grad_idx[i]] + eps); + out_data[data_j] = (weight_data[data_j] - div) * l2_scale; + // clang-format on + } + } + } + } +}; + +/* + * \brief Proximal Group Adagrad update implementation for dense weight and row_sparse grad. 
+ */ +template +inline void ProximalGroupAdagradUpdateDnsRspDnsImpl( + const ProximalGroupAdagradParam ¶m, const OpContext &ctx, + const TBlob &weight, const NDArray &grad, const TBlob &state, + const TBlob &last_update, const OpReqType &req, TBlob *out) { + using namespace mshadow; + using namespace mshadow::expr; + using namespace mshadow_op; + using namespace mxnet_op; + Stream *s = ctx.get_stream(); + CHECK_EQ(grad.storage_type(), kRowSparseStorage); + // if gradients are zeros, no weights are updated + if (req == kNullOp) { + return; + } + CHECK_EQ(req, kWriteInplace) + << "kWriteInplace is expected for sparse proximal_adagrad_update"; + CHECK_GT(weight.shape_.Size(), 0); + CHECK_GT(state.shape_.Size(), 0); + + MSHADOW_REAL_TYPE_SWITCH(weight.type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(grad.aux_type(rowsparse::kIdx), IType, { + DType *weight_data = weight.dptr(); + DType *out_data = out->dptr(); + const IType *grad_idx = grad.aux_data(rowsparse::kIdx).dptr(); + const DType *grad_val = grad.data().dptr(); + DType *state_data = state.dptr(); + DType *last_update_data = last_update.dptr(); + const nnvm::dim_t num_grad = grad.aux_shape(rowsparse::kIdx)[0]; + const auto row_length = weight.shape_.ProdShape(1, weight.ndim()); + + if (!grad.storage_initialized()) { + // Lazy update with 0 gradient + return; + } + + Kernel, xpu>::Launch( + s, num_grad, row_length, out_data, state_data, weight_data, grad_idx, + grad_val, last_update_data, static_cast(param.current_update), + static_cast(param.clip_gradient), + static_cast(param.rescale_grad), + static_cast(param.l2_regularization_strength), + static_cast(param.lr), static_cast(param.epsilon)); + }); + }); +} + +/* + * \brief Proximal adagrad update implementation for row_sparse grad. + * Both standard update and lazy update are supported. 
+ */ +template +inline void ProximalGroupAdagradUpdateRspRspRspImpl( + const ProximalGroupAdagradParam ¶m, const OpContext &ctx, + const NDArray &weight, const NDArray &grad, const NDArray &state, + const NDArray &last_update_buffer, const OpReqType &req, NDArray *out) { + using namespace mshadow; + using namespace mxnet_op; + using namespace rowsparse; + CheckAllRowsPresent(weight, "ProximalGroupAdagradUpdate", "weights"); + Stream *s = ctx.get_stream(); + // fill history with zero values + if (!state.storage_initialized()) { + NDArray state_zeros = state; + FillDnsZerosRspImpl(s, &state_zeros); + } else { + CheckAllRowsPresent(state, "ProximalGroupAdagradUpdate", "states"); + } + // reuse dns rsp implementation when storage_shape == shape + TBlob out_blob = out->data(); + ProximalGroupAdagradUpdateDnsRspDnsImpl( + param, ctx, weight.data(), grad, state.data(), last_update_buffer.data(), + req, &out_blob); +} + +template +inline void ProximalGroupAdagradUpdateEx(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + const ProximalGroupAdagradParam ¶m = + nnvm::get(attrs.parsed); + const auto weight_stype = inputs[0].storage_type(); + const auto grad_stype = inputs[1].storage_type(); + const auto state_stype = inputs[2].storage_type(); + const auto counter_stype = inputs[3].storage_type(); + const auto output_stype = outputs[0].storage_type(); + + if (state_stype == weight_stype && output_stype == weight_stype && + weight_stype == kRowSparseStorage && grad_stype == kRowSparseStorage && + counter_stype == kDefaultStorage) { + NDArray out = outputs[0]; + ProximalGroupAdagradUpdateRspRspRspImpl( + param, ctx, inputs[0], inputs[1], inputs[2], inputs[3], req[0], &out); + } else if (state_stype == weight_stype && output_stype == weight_stype && + weight_stype == kDefaultStorage && + grad_stype == kRowSparseStorage && + counter_stype == kDefaultStorage) { + TBlob out_blob = outputs[0].data(); + ProximalGroupAdagradUpdateDnsRspDnsImpl( + param, ctx, inputs[0].data(), inputs[1], inputs[2].data(), + inputs[3].data(), req[0], &out_blob); + } else { + LogUnimplementedOp(attrs, ctx, inputs, req, outputs); + } +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_CONTRIB_OPTIMIZER_OP_INL_H_ diff --git a/src/operator/contrib/optimizer_op.cc b/src/operator/contrib/optimizer_op.cc new file mode 100644 index 000000000000..278ec62eab63 --- /dev/null +++ b/src/operator/contrib/optimizer_op.cc @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2018 by Contributors + * \file optimizer_op.cc + * \brief Optimizer operators + * \author Leonard Lausen + */ +#include "./optimizer_op-inl.h" +#include "../elemwise_op_common.h" + +namespace mxnet { +namespace op { + +DMLC_REGISTER_PARAMETER(ProximalGroupAdagradParam); + +/*! + * \brief Shape inference function for Proximal Group AdaGrad. + */ +inline bool ProximalGroupAdagradShape(const nnvm::NodeAttrs &attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 4U); + CHECK_EQ(out_attrs->size(), 1U); + + SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(1)); + SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); + SHAPE_ASSIGN_CHECK(*in_attrs, 1, out_attrs->at(0)); + + return out_attrs->at(0).ndim() != 0U && out_attrs->at(0).Size() != 0U && + (in_attrs->at(0)[0] == in_attrs->at(1)[0]) && + (in_attrs->at(0)[0] == in_attrs->at(2)[0]); +} + +NNVM_REGISTER_OP(_contrib_proximal_group_adagrad_update) +.describe(R"code(Update function for Proximal Group AdaGrad optimizer. + +Referenced from *Adaptive Subgradient Methods for Online Learning and Stochastic Optimization*, +and available at http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf but +uses only a single learning rate for every row of the parameter array. + +Updates are applied by:: + + grad = clip(grad * rescale_grad, clip_gradient) + history += mean(square(grad), axis=1, keepdims=True) + div = grad / sqrt(history + float_stable_eps) + weight -= div * lr + +If `l2_regularization_strength > 0` a proximal operator is used to optimize with +group lasso objective. Weights are updated lazily if the gradient is sparse. +In particular, before using a set of weights for a forward pass, you may +want to ensure that the lazily accumulated group lasso regularization is +applied. + +Note that non-zero values for the weight decay option are not supported. + +)code" ADD_FILELINE) +.set_num_inputs(4) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", ProximalGroupAdagradShape) +.set_attr("FInferType", ElemwiseType<4, 1>) +.set_attr("FInferStorageType", ProximalGroupAdagradStorageType) +.set_attr("FMutateInputs", + [](const nnvm::NodeAttrs& attrs) { + return std::vector{2, 3}; + }) +.set_attr("FComputeEx", ProximalGroupAdagradUpdateEx) +.add_argument("weight", "NDArray-or-Symbol", "Weight") +.add_argument("grad", "NDArray-or-Symbol", "Gradient") +.add_argument("history", "NDArray-or-Symbol", "History") +.add_argument("last_update", "NDArray-or-Symbol", "Array storing last update counter for each row.") +.add_arguments(ProximalGroupAdagradParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/optimizer_op.cu b/src/operator/contrib/optimizer_op.cu new file mode 100644 index 000000000000..49221e17c42c --- /dev/null +++ b/src/operator/contrib/optimizer_op.cu @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2018 by Contributors + * \file optimizer_op.cu + * \brief Optimizer operators + * \author Leonard Lausen + */ +#include "./optimizer_op-inl.h" +#include + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_contrib_proximal_group_adagrad_update) +.set_attr("FComputeEx", ProximalGroupAdagradUpdateEx); + +} // namespace op +} // namespace mxnet diff --git a/tests/python/unittest/test_contrib_optimizer.py b/tests/python/unittest/test_contrib_optimizer.py new file mode 100644 index 000000000000..71a50d8dc065 --- /dev/null +++ b/tests/python/unittest/test_contrib_optimizer.py @@ -0,0 +1,122 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import itertools + +import numpy as np + +import mxnet as mx +from mxnet.test_utils import * + + +# ProximalGroupAdaGrad +class PyProximalGroupAdaGrad(mx.optimizer.Optimizer): + """The python reference of Proximal Group AdaGrad optimizer. + + Parameters + ---------- + l2_regularization_strength : float + Strength of group lasso L2 regularization. + eps: float, optional + Small value to avoid division by 0. 
+ + """ + + def __init__(self, l2_regularization_strength=0.0, eps=1e-5, **kwargs): + super(PyProximalGroupAdaGrad, self).__init__(**kwargs) + self.l2_regularization_strength = l2_regularization_strength + self.float_stable_eps = eps + + def create_state(self, index, weight): + assert len(weight.shape) == 2 + history = mx.nd.zeros( + (weight.shape[0], 1), weight.context, stype=weight.stype) + return history + + def update(self, index, weight, grad, state): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + assert wd == 0 + + history = state + grad = grad * self.rescale_grad + if self.clip_gradient is not None: + grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) + history[:] += mx.nd.mean(mx.nd.square(grad), axis=1, keepdims=True) + div = lr * grad / mx.nd.sqrt(history + self.float_stable_eps) + + if self.l2_regularization_strength > 0: + scaled_l2 = lr / mx.nd.sqrt(history + self.float_stable_eps) \ + * self.l2_regularization_strength + norm = mx.nd.norm(weight - div, ord=2, axis=1, keepdims=True) + weight[:] = (weight - div) * \ + (1 - scaled_l2 / norm) + weight[:] *= norm > scaled_l2 + else: + weight[:] -= div + + +def test_proximal_group_adagrad(): + mx.random.seed(0) + opt1 = PyProximalGroupAdaGrad + opt2 = mx.optimizer.contrib.ProximalGroupAdaGrad + shape = (3, 4) + eps_options = [{}, {'eps': 1e-8}] + cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + l2_options = [{ + 'l2_regularization_strength': 0.0 + }, { + 'l2_regularization_strength': 0.05 + }] + for dtype in [np.float32]: + for options in itertools.product(eps_options, cg_options, rg_options, + l2_options): + kwarg = dict(wd=0.0) + for option in options: + kwarg.update(option) + compare_optimizer( + opt1(**kwarg), + opt2(**kwarg), + shape, + dtype, + compare_states=False) + if kwarg.get('l2_regularization_strength', 0.0) == 0.0: + # By design results for PyOp which always performs + # dense update will differ if + # l2_regularization_strength > 0 + compare_optimizer( + opt1(**kwarg), + opt2(**kwarg), + shape, + dtype, + w_stype='row_sparse', + g_stype='row_sparse', + compare_states=False) + compare_optimizer( + opt1(**kwarg), + opt2(**kwarg), + shape, + dtype, + g_stype='row_sparse', + compare_states=False) + + +if __name__ == '__main__': + import nose + nose.runmodule() diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py index 496a61f356b3..334b7d4c0fdb 100644 --- a/tests/python/unittest/test_optimizer.py +++ b/tests/python/unittest/test_optimizer.py @@ -71,43 +71,6 @@ def test_lr_wd_mult(): assert not mx.test_utils.almost_equal(args1['fc1_bias'], args2['fc1_bias'], 1e-1) assert not mx.test_utils.almost_equal(args1['fc2_weight'], args2['fc2_weight'], 1e-1) -def compare_ndarray_tuple(t1, t2, rtol=None, atol=None): - if t1 is not None and t2 is not None: - if isinstance(t1, tuple): - for s1, s2 in zip(t1, t2): - compare_ndarray_tuple(s1, s2, rtol, atol) - else: - assert_almost_equal(t1.asnumpy(), t2.asnumpy(), rtol=rtol, atol=atol) - - -def compare_optimizer(opt1, opt2, shape, dtype, w_stype='default', g_stype='default', - rtol=1e-4, atol=1e-5): - if w_stype == 'default': - w2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) - w1 = w2.copyto(default_context()) - elif w_stype == 'row_sparse' or w_stype == 'csr': - w2 = rand_ndarray(shape, w_stype, density=1, dtype=dtype) - w1 = w2.copyto(default_context()).tostype('default') - else: - 
raise Exception("type not supported yet") - if g_stype == 'default': - g2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) - g1 = g2.copyto(default_context()) - elif g_stype == 'row_sparse' or g_stype == 'csr': - g2 = rand_ndarray(shape, g_stype, dtype=dtype) - g1 = g2.copyto(default_context()).tostype('default') - else: - raise Exception("type not supported yet") - - state1 = opt1.create_state_multi_precision(0, w1) - state2 = opt2.create_state_multi_precision(0, w2) - compare_ndarray_tuple(state1, state2) - - opt1.update_multi_precision(0, w1, g1, state1) - opt2.update_multi_precision(0, w2, g2, state2) - compare_ndarray_tuple(state1, state2, rtol=rtol, atol=atol) - assert_almost_equal(w1.asnumpy(), w2.asnumpy(), rtol=rtol, atol=atol) - # SGD class PySGD(mx.optimizer.Optimizer): From be20af222228037c4fd3e4627694fe4389a42707 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Mon, 1 Oct 2018 11:30:05 +0000 Subject: [PATCH 2/4] Remove proximal implementation and rename to GroupAdagrad --- docs/api/python/optimization/contrib.md | 2 +- python/mxnet/optimizer/__init__.py | 1 + python/mxnet/optimizer/contrib.py | 69 ++----- src/operator/contrib/optimizer_op-inl.h | 182 +++++------------- src/operator/contrib/optimizer_op.cc | 42 ++-- src/operator/contrib/optimizer_op.cu | 4 +- .../python/unittest/test_contrib_optimizer.py | 72 +++---- 7 files changed, 120 insertions(+), 252 deletions(-) diff --git a/docs/api/python/optimization/contrib.md b/docs/api/python/optimization/contrib.md index 9d3f3483113e..8fc261f4f052 100644 --- a/docs/api/python/optimization/contrib.md +++ b/docs/api/python/optimization/contrib.md @@ -35,7 +35,7 @@ In the rest of this document, we list routines provided by the `optimizer.contri .. autosummary:: :nosignatures: - ProximalGroupAdaGrad + GroupAdaGrad ``` ## API Reference diff --git a/python/mxnet/optimizer/__init__.py b/python/mxnet/optimizer/__init__.py index 4840413ccaa6..72eb5a741520 100644 --- a/python/mxnet/optimizer/__init__.py +++ b/python/mxnet/optimizer/__init__.py @@ -17,6 +17,7 @@ """Optimizer API of MXNet.""" from . import optimizer, contrib +# pylint: disable=wildcard-import from .optimizer import * # pylint: enable=wildcard-import diff --git a/python/mxnet/optimizer/contrib.py b/python/mxnet/optimizer/contrib.py index 8cf48261036e..1baf2ff1020a 100644 --- a/python/mxnet/optimizer/contrib.py +++ b/python/mxnet/optimizer/contrib.py @@ -18,19 +18,18 @@ # pylint: disable=too-many-lines """Contrib optimizers.""" -from ..ndarray import (NDArray, clip, contrib, full, mean, norm, sparse, sqrt, - square, zeros) +from ..ndarray import (NDArray, clip, contrib, mean, sqrt, square, zeros) from .optimizer import Optimizer # convenience wrapper for Optimizer.Register register = Optimizer.register # pylint: disable=invalid-name -__all__ = ['ProximalGroupAdaGrad'] +__all__ = ['GroupAdaGrad'] @register -class ProximalGroupAdaGrad(Optimizer): - """Proximal Adagrad optimizer with row-wise learning rates. +class GroupAdaGrad(Optimizer): + """Adagrad optimizer with row-wise learning rates. This class implements the AdaGrad optimizer described in *Adaptive Subgradient Methods for Online Learning and Stochastic Optimization*, and @@ -44,12 +43,11 @@ class ProximalGroupAdaGrad(Optimizer): div = grad / sqrt(history + float_stable_eps) weight -= div * lr - If `l2_regularization_strength > 0` a proximal operator is used to optimize - with group lasso objective. Weights are updated lazily if the gradient is - sparse. 
In particular, before using a set of weights for a forward pass, - you may want to ensure that the lazily accumulated group lasso - regularization is applied. This can be achieved by creating a sparse - gradient array that contains explicit 0 data for the indices to be updated: + Weights are updated lazily if the gradient is sparse. In particular, before + using a set of weights for a forward pass, you may want to ensure that the + lazily accumulated group lasso regularization is applied. This can be + achieved by creating a sparse gradient array that contains explicit 0 data + for the indices to be updated: fake_grad = mx.nd.sparse.row_sparse_array( (mx.nd.zeros((len(indices), dim)), indices)) @@ -60,38 +58,27 @@ class ProximalGroupAdaGrad(Optimizer): trainer.step(batch_size=1) For details of the update algorithm see - :class:`~mxnet.ndarray.contrib.proximal_group_adagrad_update`. + :class:`~mxnet.ndarray.contrib.group_adagrad_update`. This optimizer accepts the following parameters in addition to those accepted by :class:`.Optimizer`. Weight decay is not supported. Parameters ---------- - l2_regularization_strength : float - Strength of group lasso L2 regularization. eps: float, optional Initial value of the history accumulator. Avoids division by 0. """ - def __init__(self, l2_regularization_strength=0.0, eps=1e-5, **kwargs): - super(ProximalGroupAdaGrad, self).__init__(**kwargs) - self.l2_regularization_strength = l2_regularization_strength + def __init__(self, eps=1e-5, **kwargs): + super(GroupAdaGrad, self).__init__(**kwargs) self.float_stable_eps = eps def create_state(self, index, weight): assert len(weight.shape) == 2 history = zeros( (weight.shape[0], 1), weight.context, stype=weight.stype) - last_update = None - if self.l2_regularization_strength > 0: - last_update = full( - shape=(weight.shape[0], ), - val=self.num_update, - ctx=weight.context) - else: - last_update = zeros(1, ctx=weight.context) - return (history, last_update) + return history def update(self, index, weight, grad, state): assert (isinstance(weight, NDArray)) @@ -99,11 +86,9 @@ def update(self, index, weight, grad, state): self._update_count(index) lr = self._get_lr(index) wd = self._get_wd(index) - assert wd == 0, 'Weight decay is not supported for ProximalGroupAdaGrad' + assert wd == 0, 'Weight decay is not supported for GroupAdaGrad' is_sparse = grad.stype == 'row_sparse' - history = state[0] - last_update = state[1] if is_sparse: kwargs = { 'epsilon': self.float_stable_eps, @@ -111,35 +96,17 @@ def update(self, index, weight, grad, state): } if self.clip_gradient: kwargs['clip_gradient'] = self.clip_gradient - if self.l2_regularization_strength: - kwargs['l2_regularization_strength'] = \ - self.l2_regularization_strength - contrib.proximal_group_adagrad_update( + contrib.group_adagrad_update( weight, grad, - history, + state, out=weight, - last_update=last_update, lr=lr, - current_update=self.num_update, **kwargs) - elif self.l2_regularization_strength > 0: - grad = grad * self.rescale_grad - if self.clip_gradient is not None: - grad = clip(grad, -self.clip_gradient, self.clip_gradient) - history[:] += mean(square(grad), axis=1, keepdims=True) - div = lr * grad / sqrt(history + self.float_stable_eps) - num_skipped = (self.num_update - last_update).expand_dims(1) - scaled_l2 = lr / sqrt(history + self.float_stable_eps) \ - * self.l2_regularization_strength * num_skipped - nrm = norm(weight - div, ord=2, axis=1, keepdims=True) - weight[:] = (weight - div) * (1 - scaled_l2 / nrm) - weight[:] *= nrm > scaled_l2 
- last_update[:] = self.num_update else: grad = grad * self.rescale_grad if self.clip_gradient is not None: grad = clip(grad, -self.clip_gradient, self.clip_gradient) - history[:] += mean(square(grad), axis=1, keepdims=True) - div = lr * grad / sqrt(history + self.float_stable_eps) + state[:] += mean(square(grad), axis=1, keepdims=True) + div = lr * grad / sqrt(state + self.float_stable_eps) weight[:] -= div diff --git a/src/operator/contrib/optimizer_op-inl.h b/src/operator/contrib/optimizer_op-inl.h index 0bbe9cf7d1f4..fd556a4231cb 100644 --- a/src/operator/contrib/optimizer_op-inl.h +++ b/src/operator/contrib/optimizer_op-inl.h @@ -43,15 +43,12 @@ namespace mxnet { namespace op { -struct ProximalGroupAdagradParam - : public dmlc::Parameter { +struct GroupAdagradParam : public dmlc::Parameter { float lr; float epsilon; float rescale_grad; float clip_gradient; - float l2_regularization_strength; - float current_update; - DMLC_DECLARE_PARAMETER(ProximalGroupAdagradParam) { + DMLC_DECLARE_PARAMETER(GroupAdagradParam) { DMLC_DECLARE_FIELD(lr).describe("Learning rate"); DMLC_DECLARE_FIELD(rescale_grad) .set_default(1.0f) @@ -62,29 +59,21 @@ struct ProximalGroupAdagradParam "Clip gradient to the range of [-clip_gradient, clip_gradient] " "If clip_gradient <= 0, gradient clipping is turned off. " "grad = max(min(grad, clip_gradient), -clip_gradient)."); - DMLC_DECLARE_FIELD(l2_regularization_strength) - .set_default(0.0f) - .describe("Lambda term for group lasso objective."); DMLC_DECLARE_FIELD(epsilon).set_default(1.0e-5).describe( "Epsilon for numerical stability"); - DMLC_DECLARE_FIELD(current_update) - .set_default(0.0f) - .describe("Current update iteration for lazy update with group lasso " - "objective."); } }; -inline bool ProximalGroupAdagradStorageType(const nnvm::NodeAttrs &attrs, - const int dev_mask, - DispatchMode *dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - CHECK_EQ(in_attrs->size(), 4U); +inline bool GroupAdagradStorageType(const nnvm::NodeAttrs &attrs, + const int dev_mask, + DispatchMode *dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 3U); CHECK_EQ(out_attrs->size(), 1U); const int weight_stype = in_attrs->at(0); const int grad_stype = in_attrs->at(1); const int state_stype = in_attrs->at(2); - const int counter_stype = in_attrs->at(3); bool dispatched = false; if (!dispatched && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) { // dns, ... -> dns @@ -92,7 +81,6 @@ inline bool ProximalGroupAdagradStorageType(const nnvm::NodeAttrs &attrs, DispatchMode::kFCompute); } if (!dispatched && grad_stype == kRowSparseStorage && - counter_stype == kDefaultStorage && (weight_stype == kRowSparseStorage || weight_stype == kDefaultStorage) && state_stype == weight_stype) { // weight and state share stype, grad's stype = rsp @@ -105,14 +93,13 @@ inline bool ProximalGroupAdagradStorageType(const nnvm::NodeAttrs &attrs, /*! 
\brief kernel for sparse adagrad update with group sparsity regularization */ -template struct ProximalGroupAdagradDnsRspKernel { +template struct GroupAdagradDnsRspKernel { template MSHADOW_XINLINE static void Map(int i, const index_t row_length, DType *out_data, DType *state_data, DType *weight_data, const IType *grad_idx, const DType *grad_data, - DType *last_update_data, const DType current_update, - const DType clip_gradient, const DType rescale_grad, - const DType l2_regularization_strength, const DType lr, const DType eps) { + const DType clip_gradient, const DType rescale_grad, const DType lr, + const DType eps) { using namespace mshadow_op; // Helper to obtain index into weight / state arrays @@ -138,82 +125,26 @@ template struct ProximalGroupAdagradDnsRspKernel { } state_data[grad_idx[i]] += grad_ssq / row_length; - // Number of weight updates skipped due to lazy_update - DType delay{0}; - if (l2_regularization_strength > 0) { - // last_update_data[grad_idx[i]] is only valid if - // l2_regularization_strength > 0. Otherwise may be out of bounds read. - delay = current_update - last_update_data[grad_idx[i]]; - last_update_data[grad_idx[i]] = current_update; - } - - if (l2_regularization_strength <= 0 || delay < 0) { - if (delay < 0) { - std::printf("Got invalid last_update in proximal_adagrad_update. " - "Using standard Adagrad update.\n"); - } - - // Standard Adagrad Update - for (index_t j = 0; j < row_length; j++) { - // clang-format off - const DType grad_rescaled = get_grad_rescaled(j); - index_t data_j = get_data_j(j); - const DType div = lr * grad_rescaled / square_root::Map(state_data[grad_idx[i]] + eps); - out_data[data_j] = weight_data[data_j] - div; - // clang-format on - } - } else { - // Compute L2 norm of updated parameter using scaled sum of squares - DType norm, scale; - mshadow_op::nrm2::SetInitValue(norm, scale); - for (index_t j = 0; j < row_length; j++) { - const DType grad_rescaled = get_grad_rescaled(j); - index_t data_j = get_data_j(j); - const DType val = - (weight_data[data_j] - - lr / std::sqrt(state_data[grad_idx[i]] + eps) * grad_rescaled); - mshadow_op::nrm2::Reduce(norm, val, scale); - } - mshadow_op::nrm2::Finalize(norm, scale); - - // Compute regularization lambda - DType lambda = l2_regularization_strength * lr / - square_root::Map(state_data[grad_idx[i]] + eps); - DType l2_scale = 1 - lambda / norm; - if (l2_scale < 0) { - l2_scale = 0; - } else if (l2_scale > 0) { - scale = math::pow(scale, delay); - } - - if (l2_scale == 0) { - // Soft threshold weights (proximal map for group lasso) - for (index_t j = 0; j < row_length; j++) { - index_t data_j = get_data_j(j); - out_data[data_j] = 0; - } - } else { - for (index_t j = 0; j < row_length; j++) { - // clang-format off - const DType grad_rescaled = get_grad_rescaled(j); - index_t data_j = get_data_j(j); - const DType div = lr * grad_rescaled / square_root::Map(state_data[grad_idx[i]] + eps); - out_data[data_j] = (weight_data[data_j] - div) * l2_scale; - // clang-format on - } - } + // Standard Adagrad Update + for (index_t j = 0; j < row_length; j++) { + // clang-format off + const DType grad_rescaled = get_grad_rescaled(j); + index_t data_j = get_data_j(j); + const DType div = lr * grad_rescaled / square_root::Map(state_data[grad_idx[i]] + eps); + out_data[data_j] = weight_data[data_j] - div; + // clang-format on } } }; /* - * \brief Proximal Group Adagrad update implementation for dense weight and row_sparse grad. 
+ * \brief Group Adagrad update implementation for dense weight and row_sparse + * grad. */ template -inline void ProximalGroupAdagradUpdateDnsRspDnsImpl( - const ProximalGroupAdagradParam ¶m, const OpContext &ctx, - const TBlob &weight, const NDArray &grad, const TBlob &state, - const TBlob &last_update, const OpReqType &req, TBlob *out) { +inline void GroupAdagradUpdateDnsRspDnsImpl( + const GroupAdagradParam ¶m, const OpContext &ctx, const TBlob &weight, + const NDArray &grad, const TBlob &state, const OpReqType &req, TBlob *out) { using namespace mshadow; using namespace mshadow::expr; using namespace mshadow_op; @@ -225,7 +156,7 @@ inline void ProximalGroupAdagradUpdateDnsRspDnsImpl( return; } CHECK_EQ(req, kWriteInplace) - << "kWriteInplace is expected for sparse proximal_adagrad_update"; + << "kWriteInplace is expected for sparse group_adagrad_update"; CHECK_GT(weight.shape_.Size(), 0); CHECK_GT(state.shape_.Size(), 0); @@ -236,7 +167,6 @@ inline void ProximalGroupAdagradUpdateDnsRspDnsImpl( const IType *grad_idx = grad.aux_data(rowsparse::kIdx).dptr(); const DType *grad_val = grad.data().dptr(); DType *state_data = state.dptr(); - DType *last_update_data = last_update.dptr(); const nnvm::dim_t num_grad = grad.aux_shape(rowsparse::kIdx)[0]; const auto row_length = weight.shape_.ProdShape(1, weight.ndim()); @@ -245,73 +175,67 @@ inline void ProximalGroupAdagradUpdateDnsRspDnsImpl( return; } - Kernel, xpu>::Launch( + Kernel, xpu>::Launch( s, num_grad, row_length, out_data, state_data, weight_data, grad_idx, - grad_val, last_update_data, static_cast(param.current_update), - static_cast(param.clip_gradient), - static_cast(param.rescale_grad), - static_cast(param.l2_regularization_strength), - static_cast(param.lr), static_cast(param.epsilon)); + grad_val, static_cast(param.clip_gradient), + static_cast(param.rescale_grad), static_cast(param.lr), + static_cast(param.epsilon)); }); }); } /* - * \brief Proximal adagrad update implementation for row_sparse grad. - * Both standard update and lazy update are supported. + * \brief AdaGrad update implementation for row_sparse grad. Both standard + * update and lazy update are supported. 
*/ template -inline void ProximalGroupAdagradUpdateRspRspRspImpl( - const ProximalGroupAdagradParam ¶m, const OpContext &ctx, - const NDArray &weight, const NDArray &grad, const NDArray &state, - const NDArray &last_update_buffer, const OpReqType &req, NDArray *out) { +inline void +GroupAdagradUpdateRspRspRspImpl(const GroupAdagradParam ¶m, + const OpContext &ctx, const NDArray &weight, + const NDArray &grad, const NDArray &state, + const OpReqType &req, NDArray *out) { using namespace mshadow; using namespace mxnet_op; using namespace rowsparse; - CheckAllRowsPresent(weight, "ProximalGroupAdagradUpdate", "weights"); + CheckAllRowsPresent(weight, "GroupAdagradUpdate", "weights"); Stream *s = ctx.get_stream(); // fill history with zero values if (!state.storage_initialized()) { NDArray state_zeros = state; FillDnsZerosRspImpl(s, &state_zeros); } else { - CheckAllRowsPresent(state, "ProximalGroupAdagradUpdate", "states"); + CheckAllRowsPresent(state, "GroupAdagradUpdate", "states"); } // reuse dns rsp implementation when storage_shape == shape TBlob out_blob = out->data(); - ProximalGroupAdagradUpdateDnsRspDnsImpl( - param, ctx, weight.data(), grad, state.data(), last_update_buffer.data(), - req, &out_blob); + GroupAdagradUpdateDnsRspDnsImpl(param, ctx, weight.data(), grad, + state.data(), req, &out_blob); } template -inline void ProximalGroupAdagradUpdateEx(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - const ProximalGroupAdagradParam ¶m = - nnvm::get(attrs.parsed); +inline void GroupAdagradUpdateEx(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + const GroupAdagradParam ¶m = nnvm::get(attrs.parsed); const auto weight_stype = inputs[0].storage_type(); const auto grad_stype = inputs[1].storage_type(); const auto state_stype = inputs[2].storage_type(); - const auto counter_stype = inputs[3].storage_type(); const auto output_stype = outputs[0].storage_type(); if (state_stype == weight_stype && output_stype == weight_stype && - weight_stype == kRowSparseStorage && grad_stype == kRowSparseStorage && - counter_stype == kDefaultStorage) { + weight_stype == kRowSparseStorage && grad_stype == kRowSparseStorage) { NDArray out = outputs[0]; - ProximalGroupAdagradUpdateRspRspRspImpl( - param, ctx, inputs[0], inputs[1], inputs[2], inputs[3], req[0], &out); + GroupAdagradUpdateRspRspRspImpl(param, ctx, inputs[0], inputs[1], + inputs[2], req[0], &out); } else if (state_stype == weight_stype && output_stype == weight_stype && weight_stype == kDefaultStorage && - grad_stype == kRowSparseStorage && - counter_stype == kDefaultStorage) { + grad_stype == kRowSparseStorage) { TBlob out_blob = outputs[0].data(); - ProximalGroupAdagradUpdateDnsRspDnsImpl( - param, ctx, inputs[0].data(), inputs[1], inputs[2].data(), - inputs[3].data(), req[0], &out_blob); + GroupAdagradUpdateDnsRspDnsImpl(param, ctx, inputs[0].data(), + inputs[1], inputs[2].data(), req[0], + &out_blob); } else { LogUnimplementedOp(attrs, ctx, inputs, req, outputs); } diff --git a/src/operator/contrib/optimizer_op.cc b/src/operator/contrib/optimizer_op.cc index 278ec62eab63..3abc70d6fdf3 100644 --- a/src/operator/contrib/optimizer_op.cc +++ b/src/operator/contrib/optimizer_op.cc @@ -23,21 +23,21 @@ * \brief Optimizer operators * \author Leonard Lausen */ -#include "./optimizer_op-inl.h" #include "../elemwise_op_common.h" +#include "./optimizer_op-inl.h" namespace 
mxnet { namespace op { -DMLC_REGISTER_PARAMETER(ProximalGroupAdagradParam); +DMLC_REGISTER_PARAMETER(GroupAdagradParam); /*! - * \brief Shape inference function for Proximal Group AdaGrad. + * \brief Shape inference function for Group AdaGrad. */ -inline bool ProximalGroupAdagradShape(const nnvm::NodeAttrs &attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - CHECK_EQ(in_attrs->size(), 4U); +inline bool GroupAdagradShape(const nnvm::NodeAttrs &attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 3U); CHECK_EQ(out_attrs->size(), 1U); SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); @@ -50,8 +50,8 @@ inline bool ProximalGroupAdagradShape(const nnvm::NodeAttrs &attrs, (in_attrs->at(0)[0] == in_attrs->at(2)[0]); } -NNVM_REGISTER_OP(_contrib_proximal_group_adagrad_update) -.describe(R"code(Update function for Proximal Group AdaGrad optimizer. +NNVM_REGISTER_OP(_contrib_group_adagrad_update) +.describe(R"code(Update function for Group AdaGrad optimizer. Referenced from *Adaptive Subgradient Methods for Online Learning and Stochastic Optimization*, and available at http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf but @@ -64,31 +64,29 @@ Updates are applied by:: div = grad / sqrt(history + float_stable_eps) weight -= div * lr -If `l2_regularization_strength > 0` a proximal operator is used to optimize with -group lasso objective. Weights are updated lazily if the gradient is sparse. -In particular, before using a set of weights for a forward pass, you may -want to ensure that the lazily accumulated group lasso regularization is -applied. +Weights are updated lazily if the gradient is sparse. In particular, before +using a set of weights for a forward pass, you may want to ensure that the +lazily accumulated group lasso regularization is applied. Note that non-zero values for the weight decay option are not supported. 
)code" ADD_FILELINE) -.set_num_inputs(4) +.set_num_inputs(3) .set_num_outputs(1) -.set_attr_parser(ParamParser) -.set_attr("FInferShape", ProximalGroupAdagradShape) -.set_attr("FInferType", ElemwiseType<4, 1>) -.set_attr("FInferStorageType", ProximalGroupAdagradStorageType) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", GroupAdagradShape) +.set_attr("FInferType", ElemwiseType<3, 1>) +.set_attr("FInferStorageType", GroupAdagradStorageType) .set_attr("FMutateInputs", [](const nnvm::NodeAttrs& attrs) { - return std::vector{2, 3}; + return std::vector{2}; }) -.set_attr("FComputeEx", ProximalGroupAdagradUpdateEx) +.set_attr("FComputeEx", GroupAdagradUpdateEx) .add_argument("weight", "NDArray-or-Symbol", "Weight") .add_argument("grad", "NDArray-or-Symbol", "Gradient") .add_argument("history", "NDArray-or-Symbol", "History") .add_argument("last_update", "NDArray-or-Symbol", "Array storing last update counter for each row.") -.add_arguments(ProximalGroupAdagradParam::__FIELDS__()); +.add_arguments(GroupAdagradParam::__FIELDS__()); } // namespace op } // namespace mxnet diff --git a/src/operator/contrib/optimizer_op.cu b/src/operator/contrib/optimizer_op.cu index 49221e17c42c..40d99c5f0071 100644 --- a/src/operator/contrib/optimizer_op.cu +++ b/src/operator/contrib/optimizer_op.cu @@ -29,8 +29,8 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_contrib_proximal_group_adagrad_update) -.set_attr("FComputeEx", ProximalGroupAdagradUpdateEx); +NNVM_REGISTER_OP(_contrib_group_adagrad_update) +.set_attr("FComputeEx", GroupAdagradUpdateEx); } // namespace op } // namespace mxnet diff --git a/tests/python/unittest/test_contrib_optimizer.py b/tests/python/unittest/test_contrib_optimizer.py index 71a50d8dc065..8ff8a7e1436b 100644 --- a/tests/python/unittest/test_contrib_optimizer.py +++ b/tests/python/unittest/test_contrib_optimizer.py @@ -23,22 +23,19 @@ from mxnet.test_utils import * -# ProximalGroupAdaGrad -class PyProximalGroupAdaGrad(mx.optimizer.Optimizer): - """The python reference of Proximal Group AdaGrad optimizer. +# * GroupAdaGrad +class PyGroupAdaGrad(mx.optimizer.Optimizer): + """The python reference of Group AdaGrad optimizer. Parameters ---------- - l2_regularization_strength : float - Strength of group lasso L2 regularization. eps: float, optional Small value to avoid division by 0. 
""" - def __init__(self, l2_regularization_strength=0.0, eps=1e-5, **kwargs): - super(PyProximalGroupAdaGrad, self).__init__(**kwargs) - self.l2_regularization_strength = l2_regularization_strength + def __init__(self, eps=1e-5, **kwargs): + super(PyGroupAdaGrad, self).__init__(**kwargs) self.float_stable_eps = eps def create_state(self, index, weight): @@ -59,34 +56,19 @@ def update(self, index, weight, grad, state): grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) history[:] += mx.nd.mean(mx.nd.square(grad), axis=1, keepdims=True) div = lr * grad / mx.nd.sqrt(history + self.float_stable_eps) + weight[:] -= div - if self.l2_regularization_strength > 0: - scaled_l2 = lr / mx.nd.sqrt(history + self.float_stable_eps) \ - * self.l2_regularization_strength - norm = mx.nd.norm(weight - div, ord=2, axis=1, keepdims=True) - weight[:] = (weight - div) * \ - (1 - scaled_l2 / norm) - weight[:] *= norm > scaled_l2 - else: - weight[:] -= div - -def test_proximal_group_adagrad(): +def test_group_adagrad(): mx.random.seed(0) - opt1 = PyProximalGroupAdaGrad - opt2 = mx.optimizer.contrib.ProximalGroupAdaGrad + opt1 = PyGroupAdaGrad + opt2 = mx.optimizer.contrib.GroupAdaGrad shape = (3, 4) eps_options = [{}, {'eps': 1e-8}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] - l2_options = [{ - 'l2_regularization_strength': 0.0 - }, { - 'l2_regularization_strength': 0.05 - }] for dtype in [np.float32]: - for options in itertools.product(eps_options, cg_options, rg_options, - l2_options): + for options in itertools.product(eps_options, cg_options, rg_options): kwarg = dict(wd=0.0) for option in options: kwarg.update(option) @@ -96,25 +78,21 @@ def test_proximal_group_adagrad(): shape, dtype, compare_states=False) - if kwarg.get('l2_regularization_strength', 0.0) == 0.0: - # By design results for PyOp which always performs - # dense update will differ if - # l2_regularization_strength > 0 - compare_optimizer( - opt1(**kwarg), - opt2(**kwarg), - shape, - dtype, - w_stype='row_sparse', - g_stype='row_sparse', - compare_states=False) - compare_optimizer( - opt1(**kwarg), - opt2(**kwarg), - shape, - dtype, - g_stype='row_sparse', - compare_states=False) + compare_optimizer( + opt1(**kwarg), + opt2(**kwarg), + shape, + dtype, + w_stype='row_sparse', + g_stype='row_sparse', + compare_states=False) + compare_optimizer( + opt1(**kwarg), + opt2(**kwarg), + shape, + dtype, + g_stype='row_sparse', + compare_states=False) if __name__ == '__main__': From f26549cfbeae8dd656a993de72fb2e3da3377924 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Sat, 6 Oct 2018 08:57:13 +0000 Subject: [PATCH 3/4] Remove superfluous doc --- python/mxnet/optimizer/contrib.py | 14 +------------- src/operator/contrib/optimizer_op.cc | 4 +--- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/python/mxnet/optimizer/contrib.py b/python/mxnet/optimizer/contrib.py index 1baf2ff1020a..d269aa1bd069 100644 --- a/python/mxnet/optimizer/contrib.py +++ b/python/mxnet/optimizer/contrib.py @@ -43,19 +43,7 @@ class GroupAdaGrad(Optimizer): div = grad / sqrt(history + float_stable_eps) weight -= div * lr - Weights are updated lazily if the gradient is sparse. In particular, before - using a set of weights for a forward pass, you may want to ensure that the - lazily accumulated group lasso regularization is applied. 
This can be - achieved by creating a sparse gradient array that contains explicit 0 data - for the indices to be updated: - - fake_grad = mx.nd.sparse.row_sparse_array( - (mx.nd.zeros((len(indices), dim)), indices)) - weight.grad()[:] = fake_grad - weight.data()._fresh_grad = True - trainer._optimizer._index_update_count[0] -= 1 - trainer._optimizer.num_update -= 1 - trainer.step(batch_size=1) + Weights are updated lazily if the gradient is sparse. For details of the update algorithm see :class:`~mxnet.ndarray.contrib.group_adagrad_update`. diff --git a/src/operator/contrib/optimizer_op.cc b/src/operator/contrib/optimizer_op.cc index 3abc70d6fdf3..31c07d7e3ee3 100644 --- a/src/operator/contrib/optimizer_op.cc +++ b/src/operator/contrib/optimizer_op.cc @@ -64,9 +64,7 @@ Updates are applied by:: div = grad / sqrt(history + float_stable_eps) weight -= div * lr -Weights are updated lazily if the gradient is sparse. In particular, before -using a set of weights for a forward pass, you may want to ensure that the -lazily accumulated group lasso regularization is applied. +Weights are updated lazily if the gradient is sparse. Note that non-zero values for the weight decay option are not supported. From df177c3fc98fc30d4daa967ef73a1095e1ca13e2 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Thu, 11 Oct 2018 11:04:35 +0000 Subject: [PATCH 4/4] Remove superfluous argument --- src/operator/contrib/optimizer_op.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/operator/contrib/optimizer_op.cc b/src/operator/contrib/optimizer_op.cc index 31c07d7e3ee3..96f431bc569d 100644 --- a/src/operator/contrib/optimizer_op.cc +++ b/src/operator/contrib/optimizer_op.cc @@ -83,7 +83,6 @@ Note that non-zero values for the weight decay option are not supported. .add_argument("weight", "NDArray-or-Symbol", "Weight") .add_argument("grad", "NDArray-or-Symbol", "Gradient") .add_argument("history", "NDArray-or-Symbol", "History") -.add_argument("last_update", "NDArray-or-Symbol", "Array storing last update counter for each row.") .add_arguments(GroupAdagradParam::__FIELDS__()); } // namespace op
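
For reference, a minimal sketch of driving the new `GroupAdaGrad` optimizer
directly on NDArrays, mirroring the unit test above. The class, its
`create_state`/`update` methods and the `learning_rate`/`eps` arguments come
from the patches; the shape and the numeric values below are illustrative
only::

    import mxnet as mx

    opt = mx.optimizer.contrib.GroupAdaGrad(learning_rate=0.1, eps=1e-5)

    weight = mx.random.uniform(shape=(3, 4))
    grad = mx.random.uniform(shape=(3, 4))
    state = opt.create_state(0, weight)  # row-wise history, shape (3, 1)

    # Dense gradient: exercises the pure NDArray branch of GroupAdaGrad.update.
    opt.update(0, weight, grad, state)

    # Row-sparse gradient: dispatches to the contrib.group_adagrad_update
    # operator and only updates the rows present in the gradient.
    opt.update(0, weight, grad.tostype('row_sparse'), state)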
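
The registered operator can also be called directly. A sketch of the sparse,
in-place path after the rename in the second patch (the `out=weight`
requirement and the `lr`/`epsilon`/`rescale_grad` arguments come from the
patches; the row indices and numeric values are illustrative assumptions)::

    import mxnet as mx

    weight = mx.random.uniform(shape=(3, 4))
    history = mx.nd.zeros((3, 1))
    # Gradient for rows 0 and 2 only; row 1 is left untouched (lazy update).
    grad = mx.nd.sparse.row_sparse_array(
        (mx.random.uniform(shape=(2, 4)), [0, 2]), shape=(3, 4))

    mx.nd.contrib.group_adagrad_update(
        weight, grad, history, out=weight,
        lr=0.1, epsilon=1e-5, rescale_grad=1.0)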