From 47da99f38a8101a797ccdf37fd3872bdd3dc202f Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Fri, 25 May 2018 18:26:46 +0000 Subject: [PATCH 1/4] Proximal Group Adagrad optimizer --- docs/api/python/index.md | 1 + docs/api/python/optimization/contrib.md | 52 +++ python/mxnet/optimizer/__init__.py | 23 ++ python/mxnet/optimizer/contrib.py | 145 ++++++++ python/mxnet/{ => optimizer}/optimizer.py | 20 +- python/mxnet/test_utils.py | 41 +++ src/operator/contrib/optimizer_op-inl.h | 323 ++++++++++++++++++ src/operator/contrib/optimizer_op.cc | 94 +++++ src/operator/contrib/optimizer_op.cu | 36 ++ .../python/unittest/test_contrib_optimizer.py | 122 +++++++ tests/python/unittest/test_optimizer.py | 37 -- 11 files changed, 850 insertions(+), 44 deletions(-) create mode 100644 docs/api/python/optimization/contrib.md create mode 100644 python/mxnet/optimizer/__init__.py create mode 100644 python/mxnet/optimizer/contrib.py rename python/mxnet/{ => optimizer}/optimizer.py (98%) create mode 100644 src/operator/contrib/optimizer_op-inl.h create mode 100644 src/operator/contrib/optimizer_op.cc create mode 100644 src/operator/contrib/optimizer_op.cu create mode 100644 tests/python/unittest/test_contrib_optimizer.py diff --git a/docs/api/python/index.md b/docs/api/python/index.md index 8f60bcd0f13c..de86aedff691 100644 --- a/docs/api/python/index.md +++ b/docs/api/python/index.md @@ -136,6 +136,7 @@ Code examples are placed throughout the API documentation and these can be run a :maxdepth: 1 optimization/optimization.md + optimization/contrib.md ``` ## Profiler API diff --git a/docs/api/python/optimization/contrib.md b/docs/api/python/optimization/contrib.md new file mode 100644 index 000000000000..9d3f3483113e --- /dev/null +++ b/docs/api/python/optimization/contrib.md @@ -0,0 +1,52 @@ +# Contrib Optimization API + +```eval_rst + .. currentmodule:: mxnet.optimizer.contrib +``` + +## Overview + +This document summaries the contrib APIs used to initialize and update the model +weights during training + +```eval_rst +.. autosummary:: + :nosignatures: + + mxnet.optimizer.contrib +``` + +The `Contrib Optimization` API, defined in the `optimizer.contrib` package, provides +many useful experimental APIs for new features. +This is a place for the community to try out the new features, +so that feature contributors can receive feedback. + +```eval_rst +.. warning:: This package contains experimental APIs and may change in the near future. +``` + +In the rest of this document, we list routines provided by the `optimizer.contrib` package. + +## Contrib + +```eval_rst +.. currentmodule:: mxnet.optimizer.contrib + +.. autosummary:: + :nosignatures: + + ProximalGroupAdaGrad +``` + +## API Reference + + + +```eval_rst + +.. automodule:: mxnet.optimizer.contrib + :members: + +``` + + diff --git a/python/mxnet/optimizer/__init__.py b/python/mxnet/optimizer/__init__.py new file mode 100644 index 000000000000..4840413ccaa6 --- /dev/null +++ b/python/mxnet/optimizer/__init__.py @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Optimizer API of MXNet.""" + +from . import optimizer, contrib +from .optimizer import * +# pylint: enable=wildcard-import + +__all__ = optimizer.__all__ + ['contrib'] diff --git a/python/mxnet/optimizer/contrib.py b/python/mxnet/optimizer/contrib.py new file mode 100644 index 000000000000..8cf48261036e --- /dev/null +++ b/python/mxnet/optimizer/contrib.py @@ -0,0 +1,145 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""Contrib optimizers.""" +from ..ndarray import (NDArray, clip, contrib, full, mean, norm, sparse, sqrt, + square, zeros) +from .optimizer import Optimizer + +# convenience wrapper for Optimizer.Register +register = Optimizer.register # pylint: disable=invalid-name + +__all__ = ['ProximalGroupAdaGrad'] + + +@register +class ProximalGroupAdaGrad(Optimizer): + """Proximal Adagrad optimizer with row-wise learning rates. + + This class implements the AdaGrad optimizer described in *Adaptive + Subgradient Methods for Online Learning and Stochastic Optimization*, and + available at http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf but + uses only a single learning rate for every row of the parameter array. + + This optimizer updates each weight by:: + + grad = clip(grad * rescale_grad, clip_gradient) + history += mean(square(grad), axis=1, keepdims=True) + div = grad / sqrt(history + float_stable_eps) + weight -= div * lr + + If `l2_regularization_strength > 0` a proximal operator is used to optimize + with group lasso objective. Weights are updated lazily if the gradient is + sparse. In particular, before using a set of weights for a forward pass, + you may want to ensure that the lazily accumulated group lasso + regularization is applied. This can be achieved by creating a sparse + gradient array that contains explicit 0 data for the indices to be updated: + + fake_grad = mx.nd.sparse.row_sparse_array( + (mx.nd.zeros((len(indices), dim)), indices)) + weight.grad()[:] = fake_grad + weight.data()._fresh_grad = True + trainer._optimizer._index_update_count[0] -= 1 + trainer._optimizer.num_update -= 1 + trainer.step(batch_size=1) + + For details of the update algorithm see + :class:`~mxnet.ndarray.contrib.proximal_group_adagrad_update`. + + This optimizer accepts the following parameters in addition to those + accepted by :class:`.Optimizer`. Weight decay is not supported. 
+ + Parameters + ---------- + l2_regularization_strength : float + Strength of group lasso L2 regularization. + eps: float, optional + Initial value of the history accumulator. Avoids division by 0. + + """ + + def __init__(self, l2_regularization_strength=0.0, eps=1e-5, **kwargs): + super(ProximalGroupAdaGrad, self).__init__(**kwargs) + self.l2_regularization_strength = l2_regularization_strength + self.float_stable_eps = eps + + def create_state(self, index, weight): + assert len(weight.shape) == 2 + history = zeros( + (weight.shape[0], 1), weight.context, stype=weight.stype) + last_update = None + if self.l2_regularization_strength > 0: + last_update = full( + shape=(weight.shape[0], ), + val=self.num_update, + ctx=weight.context) + else: + last_update = zeros(1, ctx=weight.context) + return (history, last_update) + + def update(self, index, weight, grad, state): + assert (isinstance(weight, NDArray)) + assert (isinstance(grad, NDArray)) + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + assert wd == 0, 'Weight decay is not supported for ProximalGroupAdaGrad' + + is_sparse = grad.stype == 'row_sparse' + history = state[0] + last_update = state[1] + if is_sparse: + kwargs = { + 'epsilon': self.float_stable_eps, + 'rescale_grad': self.rescale_grad + } + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + if self.l2_regularization_strength: + kwargs['l2_regularization_strength'] = \ + self.l2_regularization_strength + contrib.proximal_group_adagrad_update( + weight, + grad, + history, + out=weight, + last_update=last_update, + lr=lr, + current_update=self.num_update, + **kwargs) + elif self.l2_regularization_strength > 0: + grad = grad * self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + history[:] += mean(square(grad), axis=1, keepdims=True) + div = lr * grad / sqrt(history + self.float_stable_eps) + num_skipped = (self.num_update - last_update).expand_dims(1) + scaled_l2 = lr / sqrt(history + self.float_stable_eps) \ + * self.l2_regularization_strength * num_skipped + nrm = norm(weight - div, ord=2, axis=1, keepdims=True) + weight[:] = (weight - div) * (1 - scaled_l2 / nrm) + weight[:] *= nrm > scaled_l2 + last_update[:] = self.num_update + else: + grad = grad * self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + history[:] += mean(square(grad), axis=1, keepdims=True) + div = lr * grad / sqrt(history + self.float_stable_eps) + weight[:] -= div diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer/optimizer.py similarity index 98% rename from python/mxnet/optimizer.py rename to python/mxnet/optimizer/optimizer.py index b69d0c9af0dc..8f9cf366f09b 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer/optimizer.py @@ -23,13 +23,19 @@ import pickle import warnings import numpy -from .base import py_str -from .ndarray import (NDArray, zeros, clip, sqrt, cast, maximum, abs as NDabs, array, multiply) -from .ndarray import (sgd_update, sgd_mom_update, adam_update, rmsprop_update, rmspropalex_update, - mp_sgd_update, mp_sgd_mom_update, square, ftrl_update, ftml_update, - signsgd_update, signum_update) -from .ndarray import sparse -from .random import normal +from ..base import py_str +from ..ndarray import (NDArray, zeros, clip, sqrt, cast, maximum, abs as NDabs, array, multiply) +from ..ndarray import (sgd_update, sgd_mom_update, adam_update, rmsprop_update, rmspropalex_update, + 
mp_sgd_update, mp_sgd_mom_update, square, ftrl_update, ftml_update, + signsgd_update, signum_update) +from ..ndarray import sparse +from ..random import normal + +__all__ = [ + 'AdaDelta', 'AdaGrad', 'Adam', 'Adamax', 'DCASGD', 'FTML', 'Ftrl', 'LBSGD', + 'NAG', 'NDArray', 'NDabs', 'Nadam', 'Optimizer', 'RMSProp', 'SGD', 'SGLD', + 'Signum', 'Test', 'Updater', 'ccSGD', 'create', 'get_updater', 'register' +] class Optimizer(object): diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index c555b2fdfaf8..0bb28a0ef13a 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -1957,3 +1957,44 @@ def verify_generator(generator, buckets, probs, nsamples=1000000, nrepeat=5, suc % (str(cs_ret_l), str(obs_freq_l), str(expected_freq_l), str(buckets), str(probs))) return cs_ret_l + +def compare_ndarray_tuple(t1, t2, rtol=None, atol=None): + """Compare ndarray tuple.""" + if t1 is not None and t2 is not None: + if isinstance(t1, tuple): + for s1, s2 in zip(t1, t2): + compare_ndarray_tuple(s1, s2, rtol, atol) + else: + assert_almost_equal(t1.asnumpy(), t2.asnumpy(), rtol=rtol, atol=atol) + + +def compare_optimizer(opt1, opt2, shape, dtype, w_stype='default', g_stype='default', + rtol=1e-4, atol=1e-5, compare_states=True): + """Compare opt1 and opt2.""" + if w_stype == 'default': + w2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) + w1 = w2.copyto(default_context()) + elif w_stype == 'row_sparse' or w_stype == 'csr': + w2 = rand_ndarray(shape, w_stype, density=1, dtype=dtype) + w1 = w2.copyto(default_context()).tostype('default') + else: + raise Exception("type not supported yet") + if g_stype == 'default': + g2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) + g1 = g2.copyto(default_context()) + elif g_stype == 'row_sparse' or g_stype == 'csr': + g2 = rand_ndarray(shape, g_stype, dtype=dtype) + g1 = g2.copyto(default_context()).tostype('default') + else: + raise Exception("type not supported yet") + + state1 = opt1.create_state_multi_precision(0, w1) + state2 = opt2.create_state_multi_precision(0, w2) + if compare_states: + compare_ndarray_tuple(state1, state2) + + opt1.update_multi_precision(0, w1, g1, state1) + opt2.update_multi_precision(0, w2, g2, state2) + if compare_states: + compare_ndarray_tuple(state1, state2, rtol=rtol, atol=atol) + assert_almost_equal(w1.asnumpy(), w2.asnumpy(), rtol=rtol, atol=atol) diff --git a/src/operator/contrib/optimizer_op-inl.h b/src/operator/contrib/optimizer_op-inl.h new file mode 100644 index 000000000000..0bbe9cf7d1f4 --- /dev/null +++ b/src/operator/contrib/optimizer_op-inl.h @@ -0,0 +1,323 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2018 by Contributors + * \file optimizer_op-inl.h + * \brief Optimizer operators + * \author Leonard Lausen + */ +#ifndef MXNET_OPERATOR_CONTRIB_OPTIMIZER_OP_INL_H_ +#define MXNET_OPERATOR_CONTRIB_OPTIMIZER_OP_INL_H_ +#include +#include +#include +#include +#include +#include +#include +#include +#include "../elemwise_op_common.h" +#include "../mshadow_op.h" +#include "../mxnet_op.h" +#include "../operator_common.h" +#include "../tensor/init_op.h" +#include "../tensor/util/tensor_util-inl.h" + +namespace mxnet { +namespace op { + +struct ProximalGroupAdagradParam + : public dmlc::Parameter { + float lr; + float epsilon; + float rescale_grad; + float clip_gradient; + float l2_regularization_strength; + float current_update; + DMLC_DECLARE_PARAMETER(ProximalGroupAdagradParam) { + DMLC_DECLARE_FIELD(lr).describe("Learning rate"); + DMLC_DECLARE_FIELD(rescale_grad) + .set_default(1.0f) + .describe("Rescale gradient to grad = rescale_grad*grad."); + DMLC_DECLARE_FIELD(clip_gradient) + .set_default(-1.0f) + .describe( + "Clip gradient to the range of [-clip_gradient, clip_gradient] " + "If clip_gradient <= 0, gradient clipping is turned off. " + "grad = max(min(grad, clip_gradient), -clip_gradient)."); + DMLC_DECLARE_FIELD(l2_regularization_strength) + .set_default(0.0f) + .describe("Lambda term for group lasso objective."); + DMLC_DECLARE_FIELD(epsilon).set_default(1.0e-5).describe( + "Epsilon for numerical stability"); + DMLC_DECLARE_FIELD(current_update) + .set_default(0.0f) + .describe("Current update iteration for lazy update with group lasso " + "objective."); + } +}; + +inline bool ProximalGroupAdagradStorageType(const nnvm::NodeAttrs &attrs, + const int dev_mask, + DispatchMode *dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 4U); + CHECK_EQ(out_attrs->size(), 1U); + const int weight_stype = in_attrs->at(0); + const int grad_stype = in_attrs->at(1); + const int state_stype = in_attrs->at(2); + const int counter_stype = in_attrs->at(3); + bool dispatched = false; + if (!dispatched && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) { + // dns, ... -> dns + dispatched = storage_type_assign(out_attrs, kDefaultStorage, dispatch_mode, + DispatchMode::kFCompute); + } + if (!dispatched && grad_stype == kRowSparseStorage && + counter_stype == kDefaultStorage && + (weight_stype == kRowSparseStorage || weight_stype == kDefaultStorage) && + state_stype == weight_stype) { + // weight and state share stype, grad's stype = rsp + dispatched = storage_type_assign( + out_attrs, static_cast(weight_stype), dispatch_mode, + DispatchMode::kFComputeEx); + } + return dispatched; +} + +/*! 
\brief kernel for sparse adagrad update with group sparsity regularization + */ +template struct ProximalGroupAdagradDnsRspKernel { + template + MSHADOW_XINLINE static void + Map(int i, const index_t row_length, DType *out_data, DType *state_data, + DType *weight_data, const IType *grad_idx, const DType *grad_data, + DType *last_update_data, const DType current_update, + const DType clip_gradient, const DType rescale_grad, + const DType l2_regularization_strength, const DType lr, const DType eps) { + using namespace mshadow_op; + + // Helper to obtain index into weight / state arrays + auto get_data_j = [&i, &grad_idx, &row_length](index_t j) -> index_t { + return grad_idx[i] * row_length + j; + }; + // Helper to obtain explicit rescaled and clipped grad + auto get_grad_rescaled = [&i, &row_length, &grad_data, &rescale_grad, + &clip_gradient](index_t j) -> DType { + index_t grad_j = i * row_length + j; + DType grad_rescaled = grad_data[grad_j] * rescale_grad; + if (clip_gradient >= 0.0f) { + grad_rescaled = clip::Map(grad_rescaled, clip_gradient); + } + return grad_rescaled; + }; + + // Update history states + DType grad_ssq = 0; + for (index_t j = 0; j < row_length; j++) { + const DType grad_rescaled = get_grad_rescaled(j); + grad_ssq += grad_rescaled * grad_rescaled; + } + state_data[grad_idx[i]] += grad_ssq / row_length; + + // Number of weight updates skipped due to lazy_update + DType delay{0}; + if (l2_regularization_strength > 0) { + // last_update_data[grad_idx[i]] is only valid if + // l2_regularization_strength > 0. Otherwise may be out of bounds read. + delay = current_update - last_update_data[grad_idx[i]]; + last_update_data[grad_idx[i]] = current_update; + } + + if (l2_regularization_strength <= 0 || delay < 0) { + if (delay < 0) { + std::printf("Got invalid last_update in proximal_adagrad_update. 
" + "Using standard Adagrad update.\n"); + } + + // Standard Adagrad Update + for (index_t j = 0; j < row_length; j++) { + // clang-format off + const DType grad_rescaled = get_grad_rescaled(j); + index_t data_j = get_data_j(j); + const DType div = lr * grad_rescaled / square_root::Map(state_data[grad_idx[i]] + eps); + out_data[data_j] = weight_data[data_j] - div; + // clang-format on + } + } else { + // Compute L2 norm of updated parameter using scaled sum of squares + DType norm, scale; + mshadow_op::nrm2::SetInitValue(norm, scale); + for (index_t j = 0; j < row_length; j++) { + const DType grad_rescaled = get_grad_rescaled(j); + index_t data_j = get_data_j(j); + const DType val = + (weight_data[data_j] - + lr / std::sqrt(state_data[grad_idx[i]] + eps) * grad_rescaled); + mshadow_op::nrm2::Reduce(norm, val, scale); + } + mshadow_op::nrm2::Finalize(norm, scale); + + // Compute regularization lambda + DType lambda = l2_regularization_strength * lr / + square_root::Map(state_data[grad_idx[i]] + eps); + DType l2_scale = 1 - lambda / norm; + if (l2_scale < 0) { + l2_scale = 0; + } else if (l2_scale > 0) { + scale = math::pow(scale, delay); + } + + if (l2_scale == 0) { + // Soft threshold weights (proximal map for group lasso) + for (index_t j = 0; j < row_length; j++) { + index_t data_j = get_data_j(j); + out_data[data_j] = 0; + } + } else { + for (index_t j = 0; j < row_length; j++) { + // clang-format off + const DType grad_rescaled = get_grad_rescaled(j); + index_t data_j = get_data_j(j); + const DType div = lr * grad_rescaled / square_root::Map(state_data[grad_idx[i]] + eps); + out_data[data_j] = (weight_data[data_j] - div) * l2_scale; + // clang-format on + } + } + } + } +}; + +/* + * \brief Proximal Group Adagrad update implementation for dense weight and row_sparse grad. 
+ */ +template +inline void ProximalGroupAdagradUpdateDnsRspDnsImpl( + const ProximalGroupAdagradParam ¶m, const OpContext &ctx, + const TBlob &weight, const NDArray &grad, const TBlob &state, + const TBlob &last_update, const OpReqType &req, TBlob *out) { + using namespace mshadow; + using namespace mshadow::expr; + using namespace mshadow_op; + using namespace mxnet_op; + Stream *s = ctx.get_stream(); + CHECK_EQ(grad.storage_type(), kRowSparseStorage); + // if gradients are zeros, no weights are updated + if (req == kNullOp) { + return; + } + CHECK_EQ(req, kWriteInplace) + << "kWriteInplace is expected for sparse proximal_adagrad_update"; + CHECK_GT(weight.shape_.Size(), 0); + CHECK_GT(state.shape_.Size(), 0); + + MSHADOW_REAL_TYPE_SWITCH(weight.type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(grad.aux_type(rowsparse::kIdx), IType, { + DType *weight_data = weight.dptr(); + DType *out_data = out->dptr(); + const IType *grad_idx = grad.aux_data(rowsparse::kIdx).dptr(); + const DType *grad_val = grad.data().dptr(); + DType *state_data = state.dptr(); + DType *last_update_data = last_update.dptr(); + const nnvm::dim_t num_grad = grad.aux_shape(rowsparse::kIdx)[0]; + const auto row_length = weight.shape_.ProdShape(1, weight.ndim()); + + if (!grad.storage_initialized()) { + // Lazy update with 0 gradient + return; + } + + Kernel, xpu>::Launch( + s, num_grad, row_length, out_data, state_data, weight_data, grad_idx, + grad_val, last_update_data, static_cast(param.current_update), + static_cast(param.clip_gradient), + static_cast(param.rescale_grad), + static_cast(param.l2_regularization_strength), + static_cast(param.lr), static_cast(param.epsilon)); + }); + }); +} + +/* + * \brief Proximal adagrad update implementation for row_sparse grad. + * Both standard update and lazy update are supported. 
+ */ +template +inline void ProximalGroupAdagradUpdateRspRspRspImpl( + const ProximalGroupAdagradParam ¶m, const OpContext &ctx, + const NDArray &weight, const NDArray &grad, const NDArray &state, + const NDArray &last_update_buffer, const OpReqType &req, NDArray *out) { + using namespace mshadow; + using namespace mxnet_op; + using namespace rowsparse; + CheckAllRowsPresent(weight, "ProximalGroupAdagradUpdate", "weights"); + Stream *s = ctx.get_stream(); + // fill history with zero values + if (!state.storage_initialized()) { + NDArray state_zeros = state; + FillDnsZerosRspImpl(s, &state_zeros); + } else { + CheckAllRowsPresent(state, "ProximalGroupAdagradUpdate", "states"); + } + // reuse dns rsp implementation when storage_shape == shape + TBlob out_blob = out->data(); + ProximalGroupAdagradUpdateDnsRspDnsImpl( + param, ctx, weight.data(), grad, state.data(), last_update_buffer.data(), + req, &out_blob); +} + +template +inline void ProximalGroupAdagradUpdateEx(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + const ProximalGroupAdagradParam ¶m = + nnvm::get(attrs.parsed); + const auto weight_stype = inputs[0].storage_type(); + const auto grad_stype = inputs[1].storage_type(); + const auto state_stype = inputs[2].storage_type(); + const auto counter_stype = inputs[3].storage_type(); + const auto output_stype = outputs[0].storage_type(); + + if (state_stype == weight_stype && output_stype == weight_stype && + weight_stype == kRowSparseStorage && grad_stype == kRowSparseStorage && + counter_stype == kDefaultStorage) { + NDArray out = outputs[0]; + ProximalGroupAdagradUpdateRspRspRspImpl( + param, ctx, inputs[0], inputs[1], inputs[2], inputs[3], req[0], &out); + } else if (state_stype == weight_stype && output_stype == weight_stype && + weight_stype == kDefaultStorage && + grad_stype == kRowSparseStorage && + counter_stype == kDefaultStorage) { + TBlob out_blob = outputs[0].data(); + ProximalGroupAdagradUpdateDnsRspDnsImpl( + param, ctx, inputs[0].data(), inputs[1], inputs[2].data(), + inputs[3].data(), req[0], &out_blob); + } else { + LogUnimplementedOp(attrs, ctx, inputs, req, outputs); + } +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_CONTRIB_OPTIMIZER_OP_INL_H_ diff --git a/src/operator/contrib/optimizer_op.cc b/src/operator/contrib/optimizer_op.cc new file mode 100644 index 000000000000..278ec62eab63 --- /dev/null +++ b/src/operator/contrib/optimizer_op.cc @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2018 by Contributors + * \file optimizer_op.cc + * \brief Optimizer operators + * \author Leonard Lausen + */ +#include "./optimizer_op-inl.h" +#include "../elemwise_op_common.h" + +namespace mxnet { +namespace op { + +DMLC_REGISTER_PARAMETER(ProximalGroupAdagradParam); + +/*! + * \brief Shape inference function for Proximal Group AdaGrad. + */ +inline bool ProximalGroupAdagradShape(const nnvm::NodeAttrs &attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 4U); + CHECK_EQ(out_attrs->size(), 1U); + + SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(1)); + SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); + SHAPE_ASSIGN_CHECK(*in_attrs, 1, out_attrs->at(0)); + + return out_attrs->at(0).ndim() != 0U && out_attrs->at(0).Size() != 0U && + (in_attrs->at(0)[0] == in_attrs->at(1)[0]) && + (in_attrs->at(0)[0] == in_attrs->at(2)[0]); +} + +NNVM_REGISTER_OP(_contrib_proximal_group_adagrad_update) +.describe(R"code(Update function for Proximal Group AdaGrad optimizer. + +Referenced from *Adaptive Subgradient Methods for Online Learning and Stochastic Optimization*, +and available at http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf but +uses only a single learning rate for every row of the parameter array. + +Updates are applied by:: + + grad = clip(grad * rescale_grad, clip_gradient) + history += mean(square(grad), axis=1, keepdims=True) + div = grad / sqrt(history + float_stable_eps) + weight -= div * lr + +If `l2_regularization_strength > 0` a proximal operator is used to optimize with +group lasso objective. Weights are updated lazily if the gradient is sparse. +In particular, before using a set of weights for a forward pass, you may +want to ensure that the lazily accumulated group lasso regularization is +applied. + +Note that non-zero values for the weight decay option are not supported. + +)code" ADD_FILELINE) +.set_num_inputs(4) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", ProximalGroupAdagradShape) +.set_attr("FInferType", ElemwiseType<4, 1>) +.set_attr("FInferStorageType", ProximalGroupAdagradStorageType) +.set_attr("FMutateInputs", + [](const nnvm::NodeAttrs& attrs) { + return std::vector{2, 3}; + }) +.set_attr("FComputeEx", ProximalGroupAdagradUpdateEx) +.add_argument("weight", "NDArray-or-Symbol", "Weight") +.add_argument("grad", "NDArray-or-Symbol", "Gradient") +.add_argument("history", "NDArray-or-Symbol", "History") +.add_argument("last_update", "NDArray-or-Symbol", "Array storing last update counter for each row.") +.add_arguments(ProximalGroupAdagradParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/optimizer_op.cu b/src/operator/contrib/optimizer_op.cu new file mode 100644 index 000000000000..49221e17c42c --- /dev/null +++ b/src/operator/contrib/optimizer_op.cu @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2018 by Contributors + * \file optimizer_op.cu + * \brief Optimizer operators + * \author Leonard Lausen + */ +#include "./optimizer_op-inl.h" +#include + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_contrib_proximal_group_adagrad_update) +.set_attr("FComputeEx", ProximalGroupAdagradUpdateEx); + +} // namespace op +} // namespace mxnet diff --git a/tests/python/unittest/test_contrib_optimizer.py b/tests/python/unittest/test_contrib_optimizer.py new file mode 100644 index 000000000000..71a50d8dc065 --- /dev/null +++ b/tests/python/unittest/test_contrib_optimizer.py @@ -0,0 +1,122 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import itertools + +import numpy as np + +import mxnet as mx +from mxnet.test_utils import * + + +# ProximalGroupAdaGrad +class PyProximalGroupAdaGrad(mx.optimizer.Optimizer): + """The python reference of Proximal Group AdaGrad optimizer. + + Parameters + ---------- + l2_regularization_strength : float + Strength of group lasso L2 regularization. + eps: float, optional + Small value to avoid division by 0. 
+ + """ + + def __init__(self, l2_regularization_strength=0.0, eps=1e-5, **kwargs): + super(PyProximalGroupAdaGrad, self).__init__(**kwargs) + self.l2_regularization_strength = l2_regularization_strength + self.float_stable_eps = eps + + def create_state(self, index, weight): + assert len(weight.shape) == 2 + history = mx.nd.zeros( + (weight.shape[0], 1), weight.context, stype=weight.stype) + return history + + def update(self, index, weight, grad, state): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + assert wd == 0 + + history = state + grad = grad * self.rescale_grad + if self.clip_gradient is not None: + grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) + history[:] += mx.nd.mean(mx.nd.square(grad), axis=1, keepdims=True) + div = lr * grad / mx.nd.sqrt(history + self.float_stable_eps) + + if self.l2_regularization_strength > 0: + scaled_l2 = lr / mx.nd.sqrt(history + self.float_stable_eps) \ + * self.l2_regularization_strength + norm = mx.nd.norm(weight - div, ord=2, axis=1, keepdims=True) + weight[:] = (weight - div) * \ + (1 - scaled_l2 / norm) + weight[:] *= norm > scaled_l2 + else: + weight[:] -= div + + +def test_proximal_group_adagrad(): + mx.random.seed(0) + opt1 = PyProximalGroupAdaGrad + opt2 = mx.optimizer.contrib.ProximalGroupAdaGrad + shape = (3, 4) + eps_options = [{}, {'eps': 1e-8}] + cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + l2_options = [{ + 'l2_regularization_strength': 0.0 + }, { + 'l2_regularization_strength': 0.05 + }] + for dtype in [np.float32]: + for options in itertools.product(eps_options, cg_options, rg_options, + l2_options): + kwarg = dict(wd=0.0) + for option in options: + kwarg.update(option) + compare_optimizer( + opt1(**kwarg), + opt2(**kwarg), + shape, + dtype, + compare_states=False) + if kwarg.get('l2_regularization_strength', 0.0) == 0.0: + # By design results for PyOp which always performs + # dense update will differ if + # l2_regularization_strength > 0 + compare_optimizer( + opt1(**kwarg), + opt2(**kwarg), + shape, + dtype, + w_stype='row_sparse', + g_stype='row_sparse', + compare_states=False) + compare_optimizer( + opt1(**kwarg), + opt2(**kwarg), + shape, + dtype, + g_stype='row_sparse', + compare_states=False) + + +if __name__ == '__main__': + import nose + nose.runmodule() diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py index 496a61f356b3..334b7d4c0fdb 100644 --- a/tests/python/unittest/test_optimizer.py +++ b/tests/python/unittest/test_optimizer.py @@ -71,43 +71,6 @@ def test_lr_wd_mult(): assert not mx.test_utils.almost_equal(args1['fc1_bias'], args2['fc1_bias'], 1e-1) assert not mx.test_utils.almost_equal(args1['fc2_weight'], args2['fc2_weight'], 1e-1) -def compare_ndarray_tuple(t1, t2, rtol=None, atol=None): - if t1 is not None and t2 is not None: - if isinstance(t1, tuple): - for s1, s2 in zip(t1, t2): - compare_ndarray_tuple(s1, s2, rtol, atol) - else: - assert_almost_equal(t1.asnumpy(), t2.asnumpy(), rtol=rtol, atol=atol) - - -def compare_optimizer(opt1, opt2, shape, dtype, w_stype='default', g_stype='default', - rtol=1e-4, atol=1e-5): - if w_stype == 'default': - w2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) - w1 = w2.copyto(default_context()) - elif w_stype == 'row_sparse' or w_stype == 'csr': - w2 = rand_ndarray(shape, w_stype, density=1, dtype=dtype) - w1 = w2.copyto(default_context()).tostype('default') - else: - 
raise Exception("type not supported yet") - if g_stype == 'default': - g2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) - g1 = g2.copyto(default_context()) - elif g_stype == 'row_sparse' or g_stype == 'csr': - g2 = rand_ndarray(shape, g_stype, dtype=dtype) - g1 = g2.copyto(default_context()).tostype('default') - else: - raise Exception("type not supported yet") - - state1 = opt1.create_state_multi_precision(0, w1) - state2 = opt2.create_state_multi_precision(0, w2) - compare_ndarray_tuple(state1, state2) - - opt1.update_multi_precision(0, w1, g1, state1) - opt2.update_multi_precision(0, w2, g2, state2) - compare_ndarray_tuple(state1, state2, rtol=rtol, atol=atol) - assert_almost_equal(w1.asnumpy(), w2.asnumpy(), rtol=rtol, atol=atol) - # SGD class PySGD(mx.optimizer.Optimizer): From be20af222228037c4fd3e4627694fe4389a42707 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Mon, 1 Oct 2018 11:30:05 +0000 Subject: [PATCH 2/4] Remove proximal implementation and rename to GroupAdagrad --- docs/api/python/optimization/contrib.md | 2 +- python/mxnet/optimizer/__init__.py | 1 + python/mxnet/optimizer/contrib.py | 69 ++----- src/operator/contrib/optimizer_op-inl.h | 182 +++++------------- src/operator/contrib/optimizer_op.cc | 42 ++-- src/operator/contrib/optimizer_op.cu | 4 +- .../python/unittest/test_contrib_optimizer.py | 72 +++---- 7 files changed, 120 insertions(+), 252 deletions(-) diff --git a/docs/api/python/optimization/contrib.md b/docs/api/python/optimization/contrib.md index 9d3f3483113e..8fc261f4f052 100644 --- a/docs/api/python/optimization/contrib.md +++ b/docs/api/python/optimization/contrib.md @@ -35,7 +35,7 @@ In the rest of this document, we list routines provided by the `optimizer.contri .. autosummary:: :nosignatures: - ProximalGroupAdaGrad + GroupAdaGrad ``` ## API Reference diff --git a/python/mxnet/optimizer/__init__.py b/python/mxnet/optimizer/__init__.py index 4840413ccaa6..72eb5a741520 100644 --- a/python/mxnet/optimizer/__init__.py +++ b/python/mxnet/optimizer/__init__.py @@ -17,6 +17,7 @@ """Optimizer API of MXNet.""" from . import optimizer, contrib +# pylint: disable=wildcard-import from .optimizer import * # pylint: enable=wildcard-import diff --git a/python/mxnet/optimizer/contrib.py b/python/mxnet/optimizer/contrib.py index 8cf48261036e..1baf2ff1020a 100644 --- a/python/mxnet/optimizer/contrib.py +++ b/python/mxnet/optimizer/contrib.py @@ -18,19 +18,18 @@ # pylint: disable=too-many-lines """Contrib optimizers.""" -from ..ndarray import (NDArray, clip, contrib, full, mean, norm, sparse, sqrt, - square, zeros) +from ..ndarray import (NDArray, clip, contrib, mean, sqrt, square, zeros) from .optimizer import Optimizer # convenience wrapper for Optimizer.Register register = Optimizer.register # pylint: disable=invalid-name -__all__ = ['ProximalGroupAdaGrad'] +__all__ = ['GroupAdaGrad'] @register -class ProximalGroupAdaGrad(Optimizer): - """Proximal Adagrad optimizer with row-wise learning rates. +class GroupAdaGrad(Optimizer): + """Adagrad optimizer with row-wise learning rates. This class implements the AdaGrad optimizer described in *Adaptive Subgradient Methods for Online Learning and Stochastic Optimization*, and @@ -44,12 +43,11 @@ class ProximalGroupAdaGrad(Optimizer): div = grad / sqrt(history + float_stable_eps) weight -= div * lr - If `l2_regularization_strength > 0` a proximal operator is used to optimize - with group lasso objective. Weights are updated lazily if the gradient is - sparse. 
In particular, before using a set of weights for a forward pass, - you may want to ensure that the lazily accumulated group lasso - regularization is applied. This can be achieved by creating a sparse - gradient array that contains explicit 0 data for the indices to be updated: + Weights are updated lazily if the gradient is sparse. In particular, before + using a set of weights for a forward pass, you may want to ensure that the + lazily accumulated group lasso regularization is applied. This can be + achieved by creating a sparse gradient array that contains explicit 0 data + for the indices to be updated: fake_grad = mx.nd.sparse.row_sparse_array( (mx.nd.zeros((len(indices), dim)), indices)) @@ -60,38 +58,27 @@ class ProximalGroupAdaGrad(Optimizer): trainer.step(batch_size=1) For details of the update algorithm see - :class:`~mxnet.ndarray.contrib.proximal_group_adagrad_update`. + :class:`~mxnet.ndarray.contrib.group_adagrad_update`. This optimizer accepts the following parameters in addition to those accepted by :class:`.Optimizer`. Weight decay is not supported. Parameters ---------- - l2_regularization_strength : float - Strength of group lasso L2 regularization. eps: float, optional Initial value of the history accumulator. Avoids division by 0. """ - def __init__(self, l2_regularization_strength=0.0, eps=1e-5, **kwargs): - super(ProximalGroupAdaGrad, self).__init__(**kwargs) - self.l2_regularization_strength = l2_regularization_strength + def __init__(self, eps=1e-5, **kwargs): + super(GroupAdaGrad, self).__init__(**kwargs) self.float_stable_eps = eps def create_state(self, index, weight): assert len(weight.shape) == 2 history = zeros( (weight.shape[0], 1), weight.context, stype=weight.stype) - last_update = None - if self.l2_regularization_strength > 0: - last_update = full( - shape=(weight.shape[0], ), - val=self.num_update, - ctx=weight.context) - else: - last_update = zeros(1, ctx=weight.context) - return (history, last_update) + return history def update(self, index, weight, grad, state): assert (isinstance(weight, NDArray)) @@ -99,11 +86,9 @@ def update(self, index, weight, grad, state): self._update_count(index) lr = self._get_lr(index) wd = self._get_wd(index) - assert wd == 0, 'Weight decay is not supported for ProximalGroupAdaGrad' + assert wd == 0, 'Weight decay is not supported for GroupAdaGrad' is_sparse = grad.stype == 'row_sparse' - history = state[0] - last_update = state[1] if is_sparse: kwargs = { 'epsilon': self.float_stable_eps, @@ -111,35 +96,17 @@ def update(self, index, weight, grad, state): } if self.clip_gradient: kwargs['clip_gradient'] = self.clip_gradient - if self.l2_regularization_strength: - kwargs['l2_regularization_strength'] = \ - self.l2_regularization_strength - contrib.proximal_group_adagrad_update( + contrib.group_adagrad_update( weight, grad, - history, + state, out=weight, - last_update=last_update, lr=lr, - current_update=self.num_update, **kwargs) - elif self.l2_regularization_strength > 0: - grad = grad * self.rescale_grad - if self.clip_gradient is not None: - grad = clip(grad, -self.clip_gradient, self.clip_gradient) - history[:] += mean(square(grad), axis=1, keepdims=True) - div = lr * grad / sqrt(history + self.float_stable_eps) - num_skipped = (self.num_update - last_update).expand_dims(1) - scaled_l2 = lr / sqrt(history + self.float_stable_eps) \ - * self.l2_regularization_strength * num_skipped - nrm = norm(weight - div, ord=2, axis=1, keepdims=True) - weight[:] = (weight - div) * (1 - scaled_l2 / nrm) - weight[:] *= nrm > scaled_l2 
- last_update[:] = self.num_update else: grad = grad * self.rescale_grad if self.clip_gradient is not None: grad = clip(grad, -self.clip_gradient, self.clip_gradient) - history[:] += mean(square(grad), axis=1, keepdims=True) - div = lr * grad / sqrt(history + self.float_stable_eps) + state[:] += mean(square(grad), axis=1, keepdims=True) + div = lr * grad / sqrt(state + self.float_stable_eps) weight[:] -= div diff --git a/src/operator/contrib/optimizer_op-inl.h b/src/operator/contrib/optimizer_op-inl.h index 0bbe9cf7d1f4..fd556a4231cb 100644 --- a/src/operator/contrib/optimizer_op-inl.h +++ b/src/operator/contrib/optimizer_op-inl.h @@ -43,15 +43,12 @@ namespace mxnet { namespace op { -struct ProximalGroupAdagradParam - : public dmlc::Parameter { +struct GroupAdagradParam : public dmlc::Parameter { float lr; float epsilon; float rescale_grad; float clip_gradient; - float l2_regularization_strength; - float current_update; - DMLC_DECLARE_PARAMETER(ProximalGroupAdagradParam) { + DMLC_DECLARE_PARAMETER(GroupAdagradParam) { DMLC_DECLARE_FIELD(lr).describe("Learning rate"); DMLC_DECLARE_FIELD(rescale_grad) .set_default(1.0f) @@ -62,29 +59,21 @@ struct ProximalGroupAdagradParam "Clip gradient to the range of [-clip_gradient, clip_gradient] " "If clip_gradient <= 0, gradient clipping is turned off. " "grad = max(min(grad, clip_gradient), -clip_gradient)."); - DMLC_DECLARE_FIELD(l2_regularization_strength) - .set_default(0.0f) - .describe("Lambda term for group lasso objective."); DMLC_DECLARE_FIELD(epsilon).set_default(1.0e-5).describe( "Epsilon for numerical stability"); - DMLC_DECLARE_FIELD(current_update) - .set_default(0.0f) - .describe("Current update iteration for lazy update with group lasso " - "objective."); } }; -inline bool ProximalGroupAdagradStorageType(const nnvm::NodeAttrs &attrs, - const int dev_mask, - DispatchMode *dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - CHECK_EQ(in_attrs->size(), 4U); +inline bool GroupAdagradStorageType(const nnvm::NodeAttrs &attrs, + const int dev_mask, + DispatchMode *dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 3U); CHECK_EQ(out_attrs->size(), 1U); const int weight_stype = in_attrs->at(0); const int grad_stype = in_attrs->at(1); const int state_stype = in_attrs->at(2); - const int counter_stype = in_attrs->at(3); bool dispatched = false; if (!dispatched && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) { // dns, ... -> dns @@ -92,7 +81,6 @@ inline bool ProximalGroupAdagradStorageType(const nnvm::NodeAttrs &attrs, DispatchMode::kFCompute); } if (!dispatched && grad_stype == kRowSparseStorage && - counter_stype == kDefaultStorage && (weight_stype == kRowSparseStorage || weight_stype == kDefaultStorage) && state_stype == weight_stype) { // weight and state share stype, grad's stype = rsp @@ -105,14 +93,13 @@ inline bool ProximalGroupAdagradStorageType(const nnvm::NodeAttrs &attrs, /*! 
\brief kernel for sparse adagrad update with group sparsity regularization */ -template struct ProximalGroupAdagradDnsRspKernel { +template struct GroupAdagradDnsRspKernel { template MSHADOW_XINLINE static void Map(int i, const index_t row_length, DType *out_data, DType *state_data, DType *weight_data, const IType *grad_idx, const DType *grad_data, - DType *last_update_data, const DType current_update, - const DType clip_gradient, const DType rescale_grad, - const DType l2_regularization_strength, const DType lr, const DType eps) { + const DType clip_gradient, const DType rescale_grad, const DType lr, + const DType eps) { using namespace mshadow_op; // Helper to obtain index into weight / state arrays @@ -138,82 +125,26 @@ template struct ProximalGroupAdagradDnsRspKernel { } state_data[grad_idx[i]] += grad_ssq / row_length; - // Number of weight updates skipped due to lazy_update - DType delay{0}; - if (l2_regularization_strength > 0) { - // last_update_data[grad_idx[i]] is only valid if - // l2_regularization_strength > 0. Otherwise may be out of bounds read. - delay = current_update - last_update_data[grad_idx[i]]; - last_update_data[grad_idx[i]] = current_update; - } - - if (l2_regularization_strength <= 0 || delay < 0) { - if (delay < 0) { - std::printf("Got invalid last_update in proximal_adagrad_update. " - "Using standard Adagrad update.\n"); - } - - // Standard Adagrad Update - for (index_t j = 0; j < row_length; j++) { - // clang-format off - const DType grad_rescaled = get_grad_rescaled(j); - index_t data_j = get_data_j(j); - const DType div = lr * grad_rescaled / square_root::Map(state_data[grad_idx[i]] + eps); - out_data[data_j] = weight_data[data_j] - div; - // clang-format on - } - } else { - // Compute L2 norm of updated parameter using scaled sum of squares - DType norm, scale; - mshadow_op::nrm2::SetInitValue(norm, scale); - for (index_t j = 0; j < row_length; j++) { - const DType grad_rescaled = get_grad_rescaled(j); - index_t data_j = get_data_j(j); - const DType val = - (weight_data[data_j] - - lr / std::sqrt(state_data[grad_idx[i]] + eps) * grad_rescaled); - mshadow_op::nrm2::Reduce(norm, val, scale); - } - mshadow_op::nrm2::Finalize(norm, scale); - - // Compute regularization lambda - DType lambda = l2_regularization_strength * lr / - square_root::Map(state_data[grad_idx[i]] + eps); - DType l2_scale = 1 - lambda / norm; - if (l2_scale < 0) { - l2_scale = 0; - } else if (l2_scale > 0) { - scale = math::pow(scale, delay); - } - - if (l2_scale == 0) { - // Soft threshold weights (proximal map for group lasso) - for (index_t j = 0; j < row_length; j++) { - index_t data_j = get_data_j(j); - out_data[data_j] = 0; - } - } else { - for (index_t j = 0; j < row_length; j++) { - // clang-format off - const DType grad_rescaled = get_grad_rescaled(j); - index_t data_j = get_data_j(j); - const DType div = lr * grad_rescaled / square_root::Map(state_data[grad_idx[i]] + eps); - out_data[data_j] = (weight_data[data_j] - div) * l2_scale; - // clang-format on - } - } + // Standard Adagrad Update + for (index_t j = 0; j < row_length; j++) { + // clang-format off + const DType grad_rescaled = get_grad_rescaled(j); + index_t data_j = get_data_j(j); + const DType div = lr * grad_rescaled / square_root::Map(state_data[grad_idx[i]] + eps); + out_data[data_j] = weight_data[data_j] - div; + // clang-format on } } }; /* - * \brief Proximal Group Adagrad update implementation for dense weight and row_sparse grad. 
+ * \brief Group Adagrad update implementation for dense weight and row_sparse + * grad. */ template -inline void ProximalGroupAdagradUpdateDnsRspDnsImpl( - const ProximalGroupAdagradParam ¶m, const OpContext &ctx, - const TBlob &weight, const NDArray &grad, const TBlob &state, - const TBlob &last_update, const OpReqType &req, TBlob *out) { +inline void GroupAdagradUpdateDnsRspDnsImpl( + const GroupAdagradParam ¶m, const OpContext &ctx, const TBlob &weight, + const NDArray &grad, const TBlob &state, const OpReqType &req, TBlob *out) { using namespace mshadow; using namespace mshadow::expr; using namespace mshadow_op; @@ -225,7 +156,7 @@ inline void ProximalGroupAdagradUpdateDnsRspDnsImpl( return; } CHECK_EQ(req, kWriteInplace) - << "kWriteInplace is expected for sparse proximal_adagrad_update"; + << "kWriteInplace is expected for sparse group_adagrad_update"; CHECK_GT(weight.shape_.Size(), 0); CHECK_GT(state.shape_.Size(), 0); @@ -236,7 +167,6 @@ inline void ProximalGroupAdagradUpdateDnsRspDnsImpl( const IType *grad_idx = grad.aux_data(rowsparse::kIdx).dptr(); const DType *grad_val = grad.data().dptr(); DType *state_data = state.dptr(); - DType *last_update_data = last_update.dptr(); const nnvm::dim_t num_grad = grad.aux_shape(rowsparse::kIdx)[0]; const auto row_length = weight.shape_.ProdShape(1, weight.ndim()); @@ -245,73 +175,67 @@ inline void ProximalGroupAdagradUpdateDnsRspDnsImpl( return; } - Kernel, xpu>::Launch( + Kernel, xpu>::Launch( s, num_grad, row_length, out_data, state_data, weight_data, grad_idx, - grad_val, last_update_data, static_cast(param.current_update), - static_cast(param.clip_gradient), - static_cast(param.rescale_grad), - static_cast(param.l2_regularization_strength), - static_cast(param.lr), static_cast(param.epsilon)); + grad_val, static_cast(param.clip_gradient), + static_cast(param.rescale_grad), static_cast(param.lr), + static_cast(param.epsilon)); }); }); } /* - * \brief Proximal adagrad update implementation for row_sparse grad. - * Both standard update and lazy update are supported. + * \brief AdaGrad update implementation for row_sparse grad. Both standard + * update and lazy update are supported. 
*/ template -inline void ProximalGroupAdagradUpdateRspRspRspImpl( - const ProximalGroupAdagradParam ¶m, const OpContext &ctx, - const NDArray &weight, const NDArray &grad, const NDArray &state, - const NDArray &last_update_buffer, const OpReqType &req, NDArray *out) { +inline void +GroupAdagradUpdateRspRspRspImpl(const GroupAdagradParam ¶m, + const OpContext &ctx, const NDArray &weight, + const NDArray &grad, const NDArray &state, + const OpReqType &req, NDArray *out) { using namespace mshadow; using namespace mxnet_op; using namespace rowsparse; - CheckAllRowsPresent(weight, "ProximalGroupAdagradUpdate", "weights"); + CheckAllRowsPresent(weight, "GroupAdagradUpdate", "weights"); Stream *s = ctx.get_stream(); // fill history with zero values if (!state.storage_initialized()) { NDArray state_zeros = state; FillDnsZerosRspImpl(s, &state_zeros); } else { - CheckAllRowsPresent(state, "ProximalGroupAdagradUpdate", "states"); + CheckAllRowsPresent(state, "GroupAdagradUpdate", "states"); } // reuse dns rsp implementation when storage_shape == shape TBlob out_blob = out->data(); - ProximalGroupAdagradUpdateDnsRspDnsImpl( - param, ctx, weight.data(), grad, state.data(), last_update_buffer.data(), - req, &out_blob); + GroupAdagradUpdateDnsRspDnsImpl(param, ctx, weight.data(), grad, + state.data(), req, &out_blob); } template -inline void ProximalGroupAdagradUpdateEx(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - const ProximalGroupAdagradParam ¶m = - nnvm::get(attrs.parsed); +inline void GroupAdagradUpdateEx(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + const GroupAdagradParam ¶m = nnvm::get(attrs.parsed); const auto weight_stype = inputs[0].storage_type(); const auto grad_stype = inputs[1].storage_type(); const auto state_stype = inputs[2].storage_type(); - const auto counter_stype = inputs[3].storage_type(); const auto output_stype = outputs[0].storage_type(); if (state_stype == weight_stype && output_stype == weight_stype && - weight_stype == kRowSparseStorage && grad_stype == kRowSparseStorage && - counter_stype == kDefaultStorage) { + weight_stype == kRowSparseStorage && grad_stype == kRowSparseStorage) { NDArray out = outputs[0]; - ProximalGroupAdagradUpdateRspRspRspImpl( - param, ctx, inputs[0], inputs[1], inputs[2], inputs[3], req[0], &out); + GroupAdagradUpdateRspRspRspImpl(param, ctx, inputs[0], inputs[1], + inputs[2], req[0], &out); } else if (state_stype == weight_stype && output_stype == weight_stype && weight_stype == kDefaultStorage && - grad_stype == kRowSparseStorage && - counter_stype == kDefaultStorage) { + grad_stype == kRowSparseStorage) { TBlob out_blob = outputs[0].data(); - ProximalGroupAdagradUpdateDnsRspDnsImpl( - param, ctx, inputs[0].data(), inputs[1], inputs[2].data(), - inputs[3].data(), req[0], &out_blob); + GroupAdagradUpdateDnsRspDnsImpl(param, ctx, inputs[0].data(), + inputs[1], inputs[2].data(), req[0], + &out_blob); } else { LogUnimplementedOp(attrs, ctx, inputs, req, outputs); } diff --git a/src/operator/contrib/optimizer_op.cc b/src/operator/contrib/optimizer_op.cc index 278ec62eab63..3abc70d6fdf3 100644 --- a/src/operator/contrib/optimizer_op.cc +++ b/src/operator/contrib/optimizer_op.cc @@ -23,21 +23,21 @@ * \brief Optimizer operators * \author Leonard Lausen */ -#include "./optimizer_op-inl.h" #include "../elemwise_op_common.h" +#include "./optimizer_op-inl.h" namespace 
mxnet { namespace op { -DMLC_REGISTER_PARAMETER(ProximalGroupAdagradParam); +DMLC_REGISTER_PARAMETER(GroupAdagradParam); /*! - * \brief Shape inference function for Proximal Group AdaGrad. + * \brief Shape inference function for Group AdaGrad. */ -inline bool ProximalGroupAdagradShape(const nnvm::NodeAttrs &attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - CHECK_EQ(in_attrs->size(), 4U); +inline bool GroupAdagradShape(const nnvm::NodeAttrs &attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 3U); CHECK_EQ(out_attrs->size(), 1U); SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); @@ -50,8 +50,8 @@ inline bool ProximalGroupAdagradShape(const nnvm::NodeAttrs &attrs, (in_attrs->at(0)[0] == in_attrs->at(2)[0]); } -NNVM_REGISTER_OP(_contrib_proximal_group_adagrad_update) -.describe(R"code(Update function for Proximal Group AdaGrad optimizer. +NNVM_REGISTER_OP(_contrib_group_adagrad_update) +.describe(R"code(Update function for Group AdaGrad optimizer. Referenced from *Adaptive Subgradient Methods for Online Learning and Stochastic Optimization*, and available at http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf but @@ -64,31 +64,29 @@ Updates are applied by:: div = grad / sqrt(history + float_stable_eps) weight -= div * lr -If `l2_regularization_strength > 0` a proximal operator is used to optimize with -group lasso objective. Weights are updated lazily if the gradient is sparse. -In particular, before using a set of weights for a forward pass, you may -want to ensure that the lazily accumulated group lasso regularization is -applied. +Weights are updated lazily if the gradient is sparse. In particular, before +using a set of weights for a forward pass, you may want to ensure that the +lazily accumulated group lasso regularization is applied. Note that non-zero values for the weight decay option are not supported. 
)code" ADD_FILELINE) -.set_num_inputs(4) +.set_num_inputs(3) .set_num_outputs(1) -.set_attr_parser(ParamParser) -.set_attr("FInferShape", ProximalGroupAdagradShape) -.set_attr("FInferType", ElemwiseType<4, 1>) -.set_attr("FInferStorageType", ProximalGroupAdagradStorageType) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", GroupAdagradShape) +.set_attr("FInferType", ElemwiseType<3, 1>) +.set_attr("FInferStorageType", GroupAdagradStorageType) .set_attr("FMutateInputs", [](const nnvm::NodeAttrs& attrs) { - return std::vector{2, 3}; + return std::vector{2}; }) -.set_attr("FComputeEx", ProximalGroupAdagradUpdateEx) +.set_attr("FComputeEx", GroupAdagradUpdateEx) .add_argument("weight", "NDArray-or-Symbol", "Weight") .add_argument("grad", "NDArray-or-Symbol", "Gradient") .add_argument("history", "NDArray-or-Symbol", "History") .add_argument("last_update", "NDArray-or-Symbol", "Array storing last update counter for each row.") -.add_arguments(ProximalGroupAdagradParam::__FIELDS__()); +.add_arguments(GroupAdagradParam::__FIELDS__()); } // namespace op } // namespace mxnet diff --git a/src/operator/contrib/optimizer_op.cu b/src/operator/contrib/optimizer_op.cu index 49221e17c42c..40d99c5f0071 100644 --- a/src/operator/contrib/optimizer_op.cu +++ b/src/operator/contrib/optimizer_op.cu @@ -29,8 +29,8 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_contrib_proximal_group_adagrad_update) -.set_attr("FComputeEx", ProximalGroupAdagradUpdateEx); +NNVM_REGISTER_OP(_contrib_group_adagrad_update) +.set_attr("FComputeEx", GroupAdagradUpdateEx); } // namespace op } // namespace mxnet diff --git a/tests/python/unittest/test_contrib_optimizer.py b/tests/python/unittest/test_contrib_optimizer.py index 71a50d8dc065..8ff8a7e1436b 100644 --- a/tests/python/unittest/test_contrib_optimizer.py +++ b/tests/python/unittest/test_contrib_optimizer.py @@ -23,22 +23,19 @@ from mxnet.test_utils import * -# ProximalGroupAdaGrad -class PyProximalGroupAdaGrad(mx.optimizer.Optimizer): - """The python reference of Proximal Group AdaGrad optimizer. +# * GroupAdaGrad +class PyGroupAdaGrad(mx.optimizer.Optimizer): + """The python reference of Group AdaGrad optimizer. Parameters ---------- - l2_regularization_strength : float - Strength of group lasso L2 regularization. eps: float, optional Small value to avoid division by 0. 
""" - def __init__(self, l2_regularization_strength=0.0, eps=1e-5, **kwargs): - super(PyProximalGroupAdaGrad, self).__init__(**kwargs) - self.l2_regularization_strength = l2_regularization_strength + def __init__(self, eps=1e-5, **kwargs): + super(PyGroupAdaGrad, self).__init__(**kwargs) self.float_stable_eps = eps def create_state(self, index, weight): @@ -59,34 +56,19 @@ def update(self, index, weight, grad, state): grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) history[:] += mx.nd.mean(mx.nd.square(grad), axis=1, keepdims=True) div = lr * grad / mx.nd.sqrt(history + self.float_stable_eps) + weight[:] -= div - if self.l2_regularization_strength > 0: - scaled_l2 = lr / mx.nd.sqrt(history + self.float_stable_eps) \ - * self.l2_regularization_strength - norm = mx.nd.norm(weight - div, ord=2, axis=1, keepdims=True) - weight[:] = (weight - div) * \ - (1 - scaled_l2 / norm) - weight[:] *= norm > scaled_l2 - else: - weight[:] -= div - -def test_proximal_group_adagrad(): +def test_group_adagrad(): mx.random.seed(0) - opt1 = PyProximalGroupAdaGrad - opt2 = mx.optimizer.contrib.ProximalGroupAdaGrad + opt1 = PyGroupAdaGrad + opt2 = mx.optimizer.contrib.GroupAdaGrad shape = (3, 4) eps_options = [{}, {'eps': 1e-8}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] - l2_options = [{ - 'l2_regularization_strength': 0.0 - }, { - 'l2_regularization_strength': 0.05 - }] for dtype in [np.float32]: - for options in itertools.product(eps_options, cg_options, rg_options, - l2_options): + for options in itertools.product(eps_options, cg_options, rg_options): kwarg = dict(wd=0.0) for option in options: kwarg.update(option) @@ -96,25 +78,21 @@ def test_proximal_group_adagrad(): shape, dtype, compare_states=False) - if kwarg.get('l2_regularization_strength', 0.0) == 0.0: - # By design results for PyOp which always performs - # dense update will differ if - # l2_regularization_strength > 0 - compare_optimizer( - opt1(**kwarg), - opt2(**kwarg), - shape, - dtype, - w_stype='row_sparse', - g_stype='row_sparse', - compare_states=False) - compare_optimizer( - opt1(**kwarg), - opt2(**kwarg), - shape, - dtype, - g_stype='row_sparse', - compare_states=False) + compare_optimizer( + opt1(**kwarg), + opt2(**kwarg), + shape, + dtype, + w_stype='row_sparse', + g_stype='row_sparse', + compare_states=False) + compare_optimizer( + opt1(**kwarg), + opt2(**kwarg), + shape, + dtype, + g_stype='row_sparse', + compare_states=False) if __name__ == '__main__': From f26549cfbeae8dd656a993de72fb2e3da3377924 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Sat, 6 Oct 2018 08:57:13 +0000 Subject: [PATCH 3/4] Remove superfluous doc --- python/mxnet/optimizer/contrib.py | 14 +------------- src/operator/contrib/optimizer_op.cc | 4 +--- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/python/mxnet/optimizer/contrib.py b/python/mxnet/optimizer/contrib.py index 1baf2ff1020a..d269aa1bd069 100644 --- a/python/mxnet/optimizer/contrib.py +++ b/python/mxnet/optimizer/contrib.py @@ -43,19 +43,7 @@ class GroupAdaGrad(Optimizer): div = grad / sqrt(history + float_stable_eps) weight -= div * lr - Weights are updated lazily if the gradient is sparse. In particular, before - using a set of weights for a forward pass, you may want to ensure that the - lazily accumulated group lasso regularization is applied. 
This can be - achieved by creating a sparse gradient array that contains explicit 0 data - for the indices to be updated: - - fake_grad = mx.nd.sparse.row_sparse_array( - (mx.nd.zeros((len(indices), dim)), indices)) - weight.grad()[:] = fake_grad - weight.data()._fresh_grad = True - trainer._optimizer._index_update_count[0] -= 1 - trainer._optimizer.num_update -= 1 - trainer.step(batch_size=1) + Weights are updated lazily if the gradient is sparse. For details of the update algorithm see :class:`~mxnet.ndarray.contrib.group_adagrad_update`. diff --git a/src/operator/contrib/optimizer_op.cc b/src/operator/contrib/optimizer_op.cc index 3abc70d6fdf3..31c07d7e3ee3 100644 --- a/src/operator/contrib/optimizer_op.cc +++ b/src/operator/contrib/optimizer_op.cc @@ -64,9 +64,7 @@ Updates are applied by:: div = grad / sqrt(history + float_stable_eps) weight -= div * lr -Weights are updated lazily if the gradient is sparse. In particular, before -using a set of weights for a forward pass, you may want to ensure that the -lazily accumulated group lasso regularization is applied. +Weights are updated lazily if the gradient is sparse. Note that non-zero values for the weight decay option are not supported. From df177c3fc98fc30d4daa967ef73a1095e1ca13e2 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Thu, 11 Oct 2018 11:04:35 +0000 Subject: [PATCH 4/4] Remove superfluous argument --- src/operator/contrib/optimizer_op.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/operator/contrib/optimizer_op.cc b/src/operator/contrib/optimizer_op.cc index 31c07d7e3ee3..96f431bc569d 100644 --- a/src/operator/contrib/optimizer_op.cc +++ b/src/operator/contrib/optimizer_op.cc @@ -83,7 +83,6 @@ Note that non-zero values for the weight decay option are not supported. .add_argument("weight", "NDArray-or-Symbol", "Weight") .add_argument("grad", "NDArray-or-Symbol", "Gradient") .add_argument("history", "NDArray-or-Symbol", "History") -.add_argument("last_update", "NDArray-or-Symbol", "Array storing last update counter for each row.") .add_arguments(GroupAdagradParam::__FIELDS__()); } // namespace op
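
For reference, a minimal sketch of driving the new `GroupAdaGrad` optimizer
directly on NDArrays, mirroring the unit test above. The class, its
`create_state`/`update` methods and the `learning_rate`/`eps` arguments come
from the patches; the shape and the numeric values below are illustrative
only::

    import mxnet as mx

    opt = mx.optimizer.contrib.GroupAdaGrad(learning_rate=0.1, eps=1e-5)

    weight = mx.random.uniform(shape=(3, 4))
    grad = mx.random.uniform(shape=(3, 4))
    state = opt.create_state(0, weight)  # row-wise history, shape (3, 1)

    # Dense gradient: exercises the pure NDArray branch of GroupAdaGrad.update.
    opt.update(0, weight, grad, state)

    # Row-sparse gradient: dispatches to the contrib.group_adagrad_update
    # operator and only updates the rows present in the gradient.
    opt.update(0, weight, grad.tostype('row_sparse'), state)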
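
The registered operator can also be called directly. A sketch of the sparse,
in-place path after the rename in the second patch (the `out=weight`
requirement and the `lr`/`epsilon`/`rescale_grad` arguments come from the
patches; the row indices and numeric values are illustrative assumptions)::

    import mxnet as mx

    weight = mx.random.uniform(shape=(3, 4))
    history = mx.nd.zeros((3, 1))
    # Gradient for rows 0 and 2 only; row 1 is left untouched (lazy update).
    grad = mx.nd.sparse.row_sparse_array(
        (mx.random.uniform(shape=(2, 4)), [0, 2]), shape=(3, 4))

    mx.nd.contrib.group_adagrad_update(
        weight, grad, history, out=weight,
        lr=0.1, epsilon=1e-5, rescale_grad=1.0)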