diff --git a/contrib/svrg_optimization_python/README.md b/contrib/svrg_optimization_python/README.md deleted file mode 100644 index cc3b6bc41357..000000000000 --- a/contrib/svrg_optimization_python/README.md +++ /dev/null @@ -1,39 +0,0 @@ -## SVRG Optimization in Python Module - -### Problem Description -SVRG stands for Stochastic Variance Reduced Gradient, which was first introduced in the paper _Accelerating Stochastic -Gradient Descent using Predicative Variance Reduction_ in 2013. It is an optimization technique that complements SGD. -SGD is known for large scale optimization but it suffers from slow convergence asymptotically due to the inherent -variance. SGD approximates the full gradient using a small batch of samples which introduces variance. -In order to converge faster, SGD often needs to start with a smaller learning rate. SVRG remedies the problem by keeping -a version of the estimated weights that is close to the optimal parameters and maintain average of full gradient over -full pass of data. The average of full gradients of all data is calculated w.r.t to parameters of last mth epochs. -It has provable guarantees for strongly convex smooth functions, and a more detailed proof can be found in section 3 of -the paper. SVRG uses a different update rule: gradients w.r.t current parameters minus gradients w.r.t parameters -from the last mth epoch, plus the average of gradients over all data. - -#### Key Characteristics of SVRG -* Explicit Variance Reduction -* Ability to use relatively large learning rate compared to SGD, which leads to faster convergence compared to SGD. - -#### Testing: -Functional Tests: -* test_svrg_train.py: test script that tests both high-level and intermediate-level api for using SVRG -* test_svrg_inferency.py: test script for testing SVRGModule inference - -Unit Tests: -* test_svrg_module.py: unittests for SVRGModule API -* test_svrg_optimizer.py: unittests for SVRGOptimizer API - -#### Benchmarks: -An initial set of benchmarks has been performed on YearPredictionDatasetMSD with linear regression model. - -* benchmark1.py: A lr_scheduler returns a new learning rate based on the number of updates that have been performed. -The training loss of SVRG is less than SGD with lr_scheduler over all of the 100 epochs. - -* benchmark2.py: One drawback for SGD is that in order to converge faster, the learning rate has to decay to zero, -thus SGD needs to start with a small learning rate. The learning rate does not need to decay to zero for SVRG, -therefore we can use a relatively larger learning rate. SGD with learning rate of (0.001, 0.0025) and SVRG with -learning rate of (0.025) are benchmarked. Even though SVRG starts with a relatively large learning rate, it converges -much faster than SGD in both cases. -This particular experiment result aligns with what was stated in the SVRG paper section 5. \ No newline at end of file diff --git a/contrib/svrg_optimization_python/src/__init__.py b/contrib/svrg_optimization_python/src/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/contrib/svrg_optimization_python/tests/__init__.py b/contrib/svrg_optimization_python/tests/__init__.py deleted file mode 100644 index b7a3e645e0d5..000000000000 --- a/contrib/svrg_optimization_python/tests/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from __future__ import absolute_import -from ..src.svrg_module import SVRGModule -from ..src.svrg_optimizer import SVRGOptimizer - diff --git a/contrib/svrg_optimization_python/tests/test_svrg_module.py b/contrib/svrg_optimization_python/tests/test_svrg_module.py deleted file mode 100644 index 5118ae1656fb..000000000000 --- a/contrib/svrg_optimization_python/tests/test_svrg_module.py +++ /dev/null @@ -1,116 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import unittest -from ..src.svrg_module import SVRGModule -import mxnet as mx -import numpy as np - - -class TestSVRGModule(unittest.TestCase): - def setUp(self): - mx.random.seed(42) - train_data = np.random.randint(1, 5, [1000, 2]) - weights = np.array([1.0, 2.0]) - train_label = train_data.dot(weights) - - self.di = mx.io.NDArrayIter(train_data, train_label, batch_size=32, shuffle=True, label_name='lin_reg_label') - X = mx.sym.Variable('data') - Y = mx.symbol.Variable('lin_reg_label') - fully_connected_layer = mx.sym.FullyConnected(data=X, name='fc1', num_hidden=1) - lro = mx.sym.LinearRegressionOutput(data=fully_connected_layer, label=Y, name="lro") - - self.mod = SVRGModule( - symbol=lro, - data_names=['data'], - label_names=['lin_reg_label'], update_freq=2) - self.mod.bind(data_shapes=self.di.provide_data, label_shapes=self.di.provide_label) - self.mod.init_params(initializer=mx.init.Uniform(0.01), allow_missing=False, - force_init=False, allow_extra=False) - - def test_create_module(self): - self.assertTrue(self.mod._mod_aux is not None) - - def test_bind_module(self): - self.assertTrue(self.mod.binded) - self.assertTrue(self.mod._mod_aux.binded) - - def test_module_initializer(self): - def regression_model(m): - x = mx.symbol.var("data", stype='csr') - v = mx.symbol.var("v", shape=(m, 1), init=mx.init.Uniform(scale=.1), - stype='row_sparse') - model = mx.symbol.dot(lhs=x, rhs=v) - y = mx.symbol.Variable("label") - model = mx.symbol.LinearRegressionOutput(data=model, label=y, name="out") - return model - - n, m = 128, 100 - model = regression_model(m) - - data = mx.nd.zeros(shape=(n, m), stype='csr') - label = mx.nd.zeros((n, 1)) - iterator = mx.io.NDArrayIter(data=data, label={'label': label}, - batch_size=n, last_batch_handle='discard') - - # create module - mod = SVRGModule(symbol=model, data_names=['data'], label_names=['label'], update_freq=2) - mod.bind(data_shapes=iterator.provide_data, label_shapes=iterator.provide_label) - mod.init_params() - v = mod._arg_params['v'] - self.assertEqual(v.stype, 'row_sparse') - self.assertTrue(np.sum(v.asnumpy()) != 0) - - @unittest.skip("SVRGModule with Pure SGD will not be a release feature") - def test_svrg_calculations(self): - def calc_svrg_optimization(update_freq): - mx.random.seed(42) - train_data = np.random.randint(1, 5, [1000, 2]) - weights = np.array([1.0, 2.0]) - train_label = train_data.dot(weights) - - di = mx.io.NDArrayIter(train_data, train_label, batch_size=32, shuffle=True, label_name='lin_reg_label') - X = mx.sym.Variable('data') - Y = mx.symbol.Variable('lin_reg_label') - fully_connected_layer = mx.sym.FullyConnected(data=X, name='fc1', num_hidden=1) - lro = mx.sym.LinearRegressionOutput(data=fully_connected_layer, label=Y, name="lro") - - mod = SVRGModule( - symbol=lro, - data_names=['data'], - label_names=['lin_reg_label'], update_freq=update_freq) - mod.bind(data_shapes=self.di.provide_data, label_shapes=self.di.provide_label) - mod.init_params(initializer=mx.init.Uniform(0.01), allow_missing=False, force_init=False, allow_extra=False) - mod.init_optimizer(optimizer='sgd', optimizer_params=(('learning_rate', 0.01),)) - num_epoch = 100 - - metrics = mx.metric.create("mse") - for e in range(1, num_epoch + 1): - if e % (mod.update_freq + 1) == 0: - mod.update_full_grads(di) - di.reset() - metrics.reset() - for batch in di: - mod.forward_backward(data_batch=batch) - mod.update() - mod.update_metric(metrics, batch.label) - return metrics.get()[1] - - svrg_mse = calc_svrg_optimization(update_freq=2) - sgd_mse = 
calc_svrg_optimization(update_freq=101) - - self.assertTrue(svrg_mse - sgd_mse < 0) diff --git a/contrib/svrg_optimization_python/tests/test_svrg_optimizer.py b/contrib/svrg_optimization_python/tests/test_svrg_optimizer.py deleted file mode 100644 index 36d44f64b758..000000000000 --- a/contrib/svrg_optimization_python/tests/test_svrg_optimizer.py +++ /dev/null @@ -1,96 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - - -import unittest -from ..src.svrg_optimizer import SVRGOptimizer -from ..src.svrg_module import SVRGModule -import mxnet as mx -import numpy as np -from numpy.testing import assert_array_equal - - -class TestSVRGOPtimizer(unittest.TestCase): - @staticmethod - def create_network(): - mx.random.seed(42) - train_data = np.random.randint(1, 5, [1000, 2]) - weights = np.array([1.0, 2.0]) - train_label = train_data.dot(weights) - - batch_size = 32 - - di = mx.io.NDArrayIter(train_data, train_label, batch_size=batch_size, shuffle=True, label_name='lin_reg_label') - X = mx.sym.Variable('data') - Y = mx.symbol.Variable('lin_reg_label') - fully_connected_layer = mx.sym.FullyConnected(data=X, name='fc1', num_hidden=1) - lro = mx.sym.LinearRegressionOutput(data=fully_connected_layer, label=Y, name="lro") - - mod = SVRGModule( - symbol=lro, - data_names=['data'], - label_names=['lin_reg_label'], update_freq=2 - ) - - mod.bind(data_shapes=di.provide_data, label_shapes=di.provide_label) - mod.init_params(initializer=mx.init.Uniform(0.01), allow_missing=False, - force_init=False, allow_extra=False) - - return di, mod - - def test_init_svrg_optimizer(self): - di, mod = self.create_network() - - kv = mx.kv.create('local') - mod.init_optimizer(kvstore=kv, optimizer='sgd', optimizer_params=(('learning_rate', 0.01),), - force_init=False) - - self.assertEqual(type(mod._optimizer).__name__, SVRGOptimizer.__name__) - - def test_svrg_optimizer_constructor(self): - _, mod = self.create_network() - - kv = mx.kv.create('local') - svrg_optimizer = SVRGOptimizer(default_optimizer='sgd', learning_rate=1.0) - kv.set_optimizer(svrg_optimizer) - - self.assertEqual(svrg_optimizer.default_opt.lr, 1.0) - - def test_kvstore_init_aux_keys(self): - param_idx2name= {0: "weight", 1: "weight_full"} - - svrg_optimizer = SVRGOptimizer(default_optimizer='sgd', param_idx2name= param_idx2name, learning_rate=1.0) - kv = mx.kv.create('local') - kv.set_optimizer(svrg_optimizer) - - param_weight_init = mx.nd.array([0, 0, 0]) - param_weight_update = mx.nd.array([1, 1, 1]) - - kv.init(0, param_weight_init) - kv.push(0, param_weight_update) - kv.pull(0, param_weight_init) - - param_weight_full_init = mx.nd.array([1, 1, 1]) - param_weight_full_update = mx.nd.array([2, 2, 2]) - - # Use AssignmentOptimizer - kv.init(1, param_weight_full_init) - kv.push(1, param_weight_full_update) - 
kv.pull(1, param_weight_full_init) - - assert_array_equal(param_weight_init.asnumpy(), np.array([-1, -1, -1])) - assert_array_equal(param_weight_full_init.asnumpy(), np.array([2, 2, 2])) diff --git a/docs/api/python/contrib/svrg_optimization.md b/docs/api/python/contrib/svrg_optimization.md new file mode 100644 index 000000000000..2b2c6812b864 --- /dev/null +++ b/docs/api/python/contrib/svrg_optimization.md @@ -0,0 +1,80 @@ +# SVRG Optimization in Python Module API + +## Overview +SVRG which stands for Stochastic Variance Reduced Gradients, is an optimization technique that complements SGD. It +employs explicit variance reduction and converges much faster compared to SGD for smooth and strongly convex functions. + +SVRG optimization is implemented as a SVRGModule in `mxnet.contrib.svrg_optimization`, which is an extension of the +existing `mxnet.module.Module` APIs and encapsulates SVRG optimization logic within several new functions. SVRGModule +API changes compared to Module API to end users are minimal. + +The current `SVRGModule` implements the standard SVRG optimization technique as described in _Accelerating Stochastic +Gradient Descent using Predicative Variance Reduction_ by calculating the gradients of all data +every `update_freq` epochs in the training. The SVRGModule update rule: gradients w.r.t current parameters minus gradients w.r.t parameters +from the last mth epoch, plus the average of gradients over all data. + +`SVRGOptimizer` wraps two optimizers, an AssignmentOptimizer which is used for full gradients accumulation in the KVStore and +a regular optimizer which is specified as a parameter to the `mod.init_optimizer()`. + +```eval_rst +.. warning:: This package contains experimental APIs and may change in the near future. +``` + +This document lists the svrg_optimization APIs in mxnet: + +```eval_rst +.. autosummary:: + :nosignatures: + + mxnet.contrib.svrg_optimization.SVRGModule + mxnet.contrib.svrg_optimization.SVRGOptimizer +``` + +### Intermediate Level API for SVRGModule + +The only extra step to use a SVRGModule compared to use a Module is to check if the current epoch should update the +full gradients over all data. Code snippets below demonstrate the suggested usage of SVRGModule using intermediate +level APIs. + +```python +>>> mod = SVRGModule(symbol=model, update_frequency=2, data_names=['data'], label_names=['lin_reg_label']) +>>> mod.bind(data_shapes=di.provide_data, label_shapes=di.provide_label) +>>> mod.init_params() +>>> mod.init_optimizer(optimizer='sgd', optimizer_params=(('learning_rate', 0.01), )) +>>> for epoch in range(num_epochs): +... if epoch % mod.update_freq == 0: +... mod.update_full_grads(di) +... di.reset() +... for batch in di: +... mod.forward_backward(data_batch=batch) +... mod.update() +``` + +### High Level API for SVRGModule + +The high level API usage of SVRGModule remains exactly the same as Module API. Code snippets below gives an example of +suggested usage of high level API. + +```python +>>> mod = SVRGModule(symbol=model, update_frequency=2, data_names=['data'], label_names=['lin_reg_label']) +>>> mod.fit(di, num_epochs=100, optimizer='sgd', optimizer_params=(('learning_rate', 0.01), ), num_epochs=100) +``` + +## API reference + + + +```eval_rst + +.. automodule:: mxnet.contrib.svrg_optimization.svrg_module + :members: init_optimizer, _create_optimizer, bind, forward, backward, update, update_full_grads, + _accumulate_kvstore, _allocate_gradients, _svrg_grads_update_rule, update_svrg_gradients, fit, prepare + +.. 
automodule:: mxnet.contrib.svrg_optimization.svrg_optimizer.SVRGOptimizer + :members: _check_params, update, create_state, _check_index + +.. automodule:: mxnet.contrib.svrg_optimization.svrg_optimizer.AssignmentOptimizer + :members: update + +``` + \ No newline at end of file diff --git a/docs/api/python/module/module.md b/docs/api/python/module/module.md index 86ed74db6c19..662caa78ff93 100644 --- a/docs/api/python/module/module.md +++ b/docs/api/python/module/module.md @@ -58,6 +58,7 @@ The `module` package provides several modules: BucketingModule PythonModule PythonLossModule + SVRGModule ``` We summarize the interface for each class in the following sections. @@ -188,6 +189,23 @@ additional functionality. We summarize them in this section. SequentialModule.add ``` +### Class `SVRGModule` +SVRGModule is an extension to the Module API that implements SVRG (Stochastic Variance Reduced Gradients) optimization +logic. A few extra functions are defined to assist SVRG optimization update; however these functions are encapsulated in +Module's existing function calls and should not require explicit invocations by end users using high-level API. + +```eval_rst +.. autosummary:: + :nosignatures: + + SVRGModule.update_full_grads + SVRGModule.update_svrg_gradients + SVRGModule._svrg_grads_update_rule + SVRGModule._accumulate_kvstore + SVRGModule._allocate_gradients + SVRGModule._create_optimizer +``` + ## API Reference @@ -205,6 +223,8 @@ additional functionality. We summarize them in this section. :members: .. autoclass:: mxnet.module.PythonLossModule :members: +.. autoclass:: mxnet.contrib.svrg_optimization.SVRGModule + :members: ``` diff --git a/example/svrg_module/README.md b/example/svrg_module/README.md new file mode 100644 index 000000000000..23515c30633e --- /dev/null +++ b/example/svrg_module/README.md @@ -0,0 +1,26 @@ +## SVRG Module Example + +#### API Usage Example +SVRGModule provides both high-level and intermediate-level APIs while minimizing the changes with Module API. +example_api_train.py: provides suggested usage of SVRGModule high-level and intermediate-level API. +example_inference.py: provides example usage of SVRGModule inference. + +#### Linear Regression +This example trains a linear regression model using SVRGModule on a real dataset. Logs of the training results can be +found in experiments.log that will automatically generated when running the training script. + +YearPredictionMSD: dataset contains over 400,000 samples with 90 features. Please uncomment data downloading script from +data_reader.py to download the data. + + +#### Benchmarks: +An initial set of benchmarks has been performed on YearPredictionDatasetMSD with linear regression model. + +* benchmark1.py: A lr_scheduler returns a new learning rate based on the number of updates that have been performed. +The training loss of SVRG is less than SGD with lr_scheduler over all of the 100 epochs. + +* benchmark2.py: One drawback for SGD is that in order to converge faster, the learning rate has to decay to zero, +thus SGD needs to start with a small learning rate. The learning rate does not need to decay to zero for SVRG, +therefore we can use a relatively larger learning rate. SGD with learning rate of (0.001, 0.0025) and SVRG with +learning rate of (0.025) are benchmarked. Even though SVRG starts with a relatively large learning rate, it converges +much faster than SGD in both cases. 
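
For reference, a minimal sketch of the two configurations compared in the benchmarks above (the `create_network` helper is assumed from the API usage examples, and the scheduler settings are illustrative rather than the exact benchmark parameters):

```python
import mxnet as mx

# SGD baseline: small initial learning rate that decays via an lr_scheduler
sgd_params = (('learning_rate', 0.001),
              ('lr_scheduler', mx.lr_scheduler.FactorScheduler(step=1000, factor=0.9)))

# SVRG: a relatively large, constant learning rate
svrg_params = (('learning_rate', 0.025),)

# di, mod = create_network(batch_size=100, update_freq=2)  # hypothetical helper, see example_api_train.py
# mod.fit(di, eval_metric='mse', optimizer='sgd', optimizer_params=svrg_params,
#         num_epoch=100, kvstore='local')
```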
\ No newline at end of file diff --git a/contrib/svrg_optimization_python/test_svrg_train.py b/example/svrg_module/api_usage_example/example_api_train.py similarity index 63% rename from contrib/svrg_optimization_python/test_svrg_train.py rename to example/svrg_module/api_usage_example/example_api_train.py index 36e8ce448731..2dbb8c710a1c 100644 --- a/contrib/svrg_optimization_python/test_svrg_train.py +++ b/example/svrg_module/api_usage_example/example_api_train.py @@ -18,14 +18,19 @@ import mxnet as mx import numpy as np -from src.svrg_module import SVRGModule +from mxnet.contrib.svrg_optimization.svrg_module import SVRGModule -def test_svrg_intermediate_level_api(num_epoch): - """Test intermediate level svrgmodule API where the training process +def test_svrg_intermediate_level_api(args): + """Demonstrates intermediate level SVRGModule API where the training process need to be explicitly defined. KVstore is not explicitly created. """ - di, mod = create_network() + num_epoch = args.epochs + batch_size = args.batch_size + update_freq = args.update_freq + + di, mod = create_network(batch_size, update_freq) + mod.bind(data_shapes=di.provide_data, label_shapes=di.provide_label) mod.init_params(initializer=mx.init.Uniform(0.01), allow_missing=False, force_init=False, allow_extra=False) kv = mx.kv.create("local") @@ -33,35 +38,43 @@ def test_svrg_intermediate_level_api(num_epoch): metrics = mx.metric.create("mse") for e in range(num_epoch): metrics.reset() - if e % (mod.update_freq) == 0: + if e % mod.update_freq == 0: + print("Update") mod.update_full_grads(di) di.reset() for batch in di: mod.forward_backward(data_batch=batch) mod.update() mod.update_metric(metrics, batch.label) - print('Epoch[%d] Time cost=%.3f', e, metrics.get()) + mod.logger.info('Epoch[%d] Train cost=%f', e, metrics.get()[1]) -def test_svrg_high_level_api(num_epoch): - """Test high level svrgmodule API. KVStore is explicitly created. +def test_svrg_high_level_api(args): + """Demonstrates suggested usage of high level SVRGModule API. KVStore is explicitly created. """ - di, mod = create_network() + num_epoch = args.epochs + batch_size = args.batch_size + update_freq = args.update_freq + + di, mod = create_network(batch_size, update_freq) mod.fit(di, eval_metric='mse', optimizer='sgd', optimizer_params=(('learning_rate', 0.025),), num_epoch=num_epoch, kvstore='local') -def create_network(): +def create_network(batch_size, update_freq): """Create a linear regression network for performing SVRG optimization. 
:return: an instance of mx.io.NDArrayIter :return: an instance of mx.mod.svrgmodule for performing SVRG optimization """ - mx.random.seed(42) + import logging + head = '%(asctime)-15s %(message)s' + logging.basicConfig(level=logging.INFO, format=head) + train_data = np.random.randint(1, 5, [1000, 2]) weights = np.array([1.0, 2.0]) train_label = train_data.dot(weights) - di = mx.io.NDArrayIter(train_data, train_label, batch_size=32, shuffle=True, label_name='lin_reg_label') + di = mx.io.NDArrayIter(train_data, train_label, batch_size=batch_size, shuffle=True, label_name='lin_reg_label') X = mx.sym.Variable('data') Y = mx.symbol.Variable('lin_reg_label') fully_connected_layer = mx.sym.FullyConnected(data=X, name='fc1', num_hidden=1) @@ -70,15 +83,23 @@ def create_network(): mod = SVRGModule( symbol=lro, data_names=['data'], - label_names=['lin_reg_label'], update_freq=2 + label_names=['lin_reg_label'], update_freq=update_freq, logger=logging ) return di, mod # run as a script if __name__ == "__main__": - num_epoch = 100 + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('-e', dest='epochs', default=100, type=int) + parser.add_argument('-bs', dest='batch_size', default=32, type=int) + parser.add_argument('-f', dest="update_freq", default=2, type=int) + args = parser.parse_args() + + print("========================== Intermediate Level API ==========================") - test_svrg_intermediate_level_api(num_epoch) + test_svrg_intermediate_level_api(args) print("========================== High Level API ==========================") - test_svrg_high_level_api(num_epoch) + # test_svrg_high_level_api(args) diff --git a/contrib/svrg_optimization_python/test_svrg_inference.py b/example/svrg_module/api_usage_example/example_inference.py similarity index 61% rename from contrib/svrg_optimization_python/test_svrg_inference.py rename to example/svrg_module/api_usage_example/example_inference.py index 0250cdec5899..312f9796074d 100644 --- a/contrib/svrg_optimization_python/test_svrg_inference.py +++ b/example/svrg_module/api_usage_example/example_inference.py @@ -18,23 +18,34 @@ import mxnet as mx import numpy as np -from src.svrg_module import SVRGModule +import logging +from mxnet.contrib.svrg_optimization.svrg_module import SVRGModule -def test_svrg_inference(num_epoch): - train_iter, val_iter, mod = create_network() - mod.fit(train_iter, eval_data=val_iter, eval_metric='mse', optimizer='sgd', optimizer_params=(('learning_rate', 0.025),), - num_epoch=num_epoch) +def test_svrg_inference(args): + epoch = args.epochs + batch_size = args.batch_size + update_freq = args.update_freq -def test_score(num_epoch): - train_iter, val_iter, mod = create_network() + train_iter, val_iter, mod = create_network(batch_size, update_freq) + mod.fit(train_iter, eval_data=val_iter, eval_metric='mse', optimizer='sgd', + optimizer_params=(('learning_rate', 0.025),), + num_epoch=epoch) + + +def get_validation_score(args): + epoch = args.epochs + batch_size = args.batch_size + update_freq = args.update_freq + + train_iter, val_iter, mod = create_network(batch_size, update_freq) mod.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label) mod.init_params(initializer=mx.init.Uniform(0.01), allow_missing=False, force_init=False, allow_extra=False) - mod.init_optimizer(kvstore='local', optimizer='nag', optimizer_params=(('momentum', 0.9),)) + mod.init_optimizer(kvstore='local', optimizer='sgd', optimizer_params=(('learning_rate', 0.025),)) metrics = mx.metric.create("mse") - for e 
in range(num_epoch): + for e in range(epoch): metrics.reset() - if e % (mod.update_freq + 1) == 0: + if e % mod.update_freq == 0: mod.update_full_grads(train_iter) train_iter.reset() for batch in train_iter: @@ -43,25 +54,29 @@ def test_score(num_epoch): mod.update_metric(metrics, batch.label) y = mod.predict(val_iter) + + # test-train data split, 20% test data out of 1000 data samples assert y.shape == (200, 1) score = mod.score(val_iter, ['mse']) - print("Training Loss is %f", score[0][1]) + print("Training Loss on Validation Set is {}".format(score[0][1])) -def create_network(): +def create_network(batch_size, update_freq): """Create a linear regression network for performing SVRG optimization. :return: an instance of mx.io.NDArrayIter :return: an instance of mx.mod.svrgmodule for performing SVRG optimization """ - mx.random.seed(42) + head = '%(asctime)-15s %(message)s' + logging.basicConfig(level=logging.INFO, format=head) data = np.random.randint(1, 5, [1000, 2]) + + #Test_Train data split n_train = int(data.shape[0] * 0.8) weights = np.array([1.0, 2.0]) label = data.dot(weights) - - di = mx.io.NDArrayIter(data[:n_train, :], label[:n_train], batch_size=32, shuffle=True, label_name='lin_reg_label') - val_iter = mx.io.NDArrayIter(data[n_train:, :], label[n_train:], batch_size=32) + di = mx.io.NDArrayIter(data[:n_train, :], label[:n_train], batch_size=batch_size, shuffle=True, label_name='lin_reg_label') + val_iter = mx.io.NDArrayIter(data[n_train:, :], label[n_train:], batch_size=batch_size) X = mx.sym.Variable('data') Y = mx.symbol.Variable('lin_reg_label') @@ -71,16 +86,21 @@ def create_network(): mod = SVRGModule( symbol=lro, data_names=['data'], - label_names=['lin_reg_label'], update_freq=2 - ) + label_names=['lin_reg_label'], update_freq=update_freq, logger=logging) return di, val_iter, mod # run as a script if __name__ == "__main__": - num_epoch = 100 + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('-e', dest='epochs', default=100, type=int) + parser.add_argument('-bs', dest='batch_size', default=32, type=int) + parser.add_argument('-f', dest="update_freq", default=2, type=int) + args = parser.parse_args() + print("========================== SVRG Module Inference ==========================") - test_svrg_inference(num_epoch) + test_svrg_inference(args) print("========================SVRG Module Score ============================") - test_score(num_epoch) + get_validation_score(args) diff --git a/contrib/svrg_optimization_python/benchmarks/benchmark1.png b/example/svrg_module/benchmarks/benchmark1.png similarity index 100% rename from contrib/svrg_optimization_python/benchmarks/benchmark1.png rename to example/svrg_module/benchmarks/benchmark1.png diff --git a/contrib/svrg_optimization_python/benchmarks/benchmark2.png b/example/svrg_module/benchmarks/benchmark2.png similarity index 100% rename from contrib/svrg_optimization_python/benchmarks/benchmark2.png rename to example/svrg_module/benchmarks/benchmark2.png diff --git a/example/svrg_module/linear_regression/common.py b/example/svrg_module/linear_regression/common.py new file mode 100644 index 000000000000..0b3d19b409c9 --- /dev/null +++ b/example/svrg_module/linear_regression/common.py @@ -0,0 +1,118 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +import mxnet as mx +import logging +from mxnet.contrib.svrg_optimization.svrg_module import SVRGModule + + +def create_lin_reg_network(train_features, train_labels, feature_dim, batch_size, update_freq, ctx, logger): + # fit a linear regression model with mxnet SVRGModule + print("Fitting linear regression with mxnet") + train_iter = mx.io.NDArrayIter(train_features, train_labels, batch_size=batch_size, shuffle=True, + data_name='data', label_name='label') + data = mx.sym.Variable("data") + label = mx.sym.Variable("label") + weight = mx.sym.Variable("fc_weight", shape=(1, feature_dim)) + net = mx.sym.dot(data, weight.transpose()) + bias = mx.sym.Variable("fc_bias", shape=(1,), wd_mult=0.0, lr_mult=10.0) + net = mx.sym.broadcast_plus(net, bias) + net = mx.sym.LinearRegressionOutput(data=net, label=label) + + mod = SVRGModule(symbol=net, context=ctx, data_names=['data'], label_names=['label'], logger=logger, + update_freq=update_freq) + return train_iter, mod + + +def create_metrics(metrics): + metric = mx.metric.create(metrics) + return metric + + +def create_logger(): + logger = logging.getLogger('sgd_svrg') + logger.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s - %(message)s') + fh = logging.FileHandler('experiments.log') + fh.setFormatter(formatter) + logger.addHandler(fh) + return logger + + +################################################################################ +# Functions below are for benchmark purpose to calcuate expectation, variance of +# gradients per epoch for each parameter. These calculations will be helpful when +# benchmarking SVRG optimization with other optimization techniques, such as SGD. +# Currently it only calculates the expectation, variance for single context but +# can be extended to multi-context in later iterations. 
+################################################################################ + +def accumulate_grad(grad_dict, mod): + param_names = mod._exec_group.param_names + + for index, name in enumerate(param_names): + if name not in grad_dict: + grad_dict[name] = mod._exec_group.grad_arrays[index][0].copy() + else: + grad_dict[name] = mx.ndarray.concat(grad_dict[name], mod._exec_group.grad_arrays[index][0], dim=0) + + +def calc_expectation(grad_dict, num_batches): + """Calculates the expectation of the gradients per epoch for each parameter w.r.t number of batches + + Parameters + ---------- + grad_dict: dict + dictionary that maps parameter name to gradients in the mod executor group + num_batches: int + number of batches + + Returns + ---------- + grad_dict: dict + dictionary with new keys mapping to gradients expectations + + """ + for key in grad_dict.keys(): + grad_dict[str.format(key+"_expectation")] = mx.ndarray.sum(grad_dict[key], axis=0) / num_batches + + return grad_dict + + +def calc_variance(grad_dict, num_batches, param_names): + """Calculates the variance of the gradients per epoch for each parameter w.r.t number of batches + + Parameters + ---------- + grad_dict: dict + dictionary that maps parameter name to gradients in the mod executor group + num_batches: int + number of batches + param_names: str + parameter name in the module + + Returns + ---------- + grad_dict: dict + dictionary with new keys mapping to gradients variance + + """ + for i in range(len(param_names)): + diff_sqr = mx.ndarray.square(mx.nd.subtract(grad_dict[param_names[i]], + grad_dict[str.format(param_names[i]+"_expectation")])) + grad_dict[str.format(param_names[i] + "_variance")] = mx.ndarray.sum(diff_sqr, axis=0) / num_batches diff --git a/example/svrg_module/linear_regression/data_reader.py b/example/svrg_module/linear_regression/data_reader.py new file mode 100644 index 000000000000..d1578fc153bf --- /dev/null +++ b/example/svrg_module/linear_regression/data_reader.py @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
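
One possible way to drive the benchmarking helpers defined in common.py above (a sketch only; `mod` and `di` stand for an already bound and initialized SVRGModule and its training iterator, both assumed rather than taken from this change):

```python
# from common import accumulate_grad, calc_expectation, calc_variance  # helpers defined above

grad_dict, nbatch = {}, 0
for batch in di:
    mod.forward_backward(data_batch=batch)
    accumulate_grad(grad_dict, mod)  # stack this batch's gradients per parameter
    mod.update()
    nbatch += 1

calc_expectation(grad_dict, nbatch)  # adds "<param>_expectation" entries to grad_dict
calc_variance(grad_dict, nbatch, mod._exec_group.param_names)  # adds "<param>_variance" entries
```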
+ + +import numpy as np + + +def read_year_prediction_data(fileName): + # Download data file + # from subprocess import call + # call(['wget', 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/YearPredictionMSD.bz2']) + # call(['bzip2', '-d', 'YearPredictionMSD.bz2']) + + from sklearn.datasets import load_svmlight_file + + # YearPredictionMSD dataset: https://archive.ics.uci.edu/ml/datasets/yearpredictionmsd + feature_dim = 90 + print("Reading data from disk...") + train_features, train_labels = load_svmlight_file(fileName, n_features=feature_dim, dtype=np.float32) + train_features = train_features.todense() + + # normalize the data: subtract means and divide by standard deviations + label_mean = train_labels.mean() + label_std = np.sqrt(np.square(train_labels - label_mean).mean()) + feature_means = train_features.mean(axis=0) + feature_stds = np.sqrt(np.square(train_features - feature_means).mean(axis=0)) + + train_features = (train_features - feature_means) / feature_stds + train_labels = (train_labels - label_mean) / label_std + + return feature_dim, train_features, train_labels diff --git a/example/svrg_module/linear_regression/train.py b/example/svrg_module/linear_regression/train.py new file mode 100644 index 000000000000..b3d942973f19 --- /dev/null +++ b/example/svrg_module/linear_regression/train.py @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +import argparse +import mxnet as mx +from common import create_lin_reg_network, create_logger +from data_reader import read_year_prediction_data + +parser = argparse.ArgumentParser() +parser.add_argument('-e', dest='epochs', help='number of epochs for training phase', type=int, default=100) +parser.add_argument('-f', dest="updateFreq", help="update frequency for SVRGModule", type=int, default=2) +parser.add_argument('-b', dest="batch_size", help="define the batch size for training", type=int, + default=100, required=False) +parser.add_argument('-m', dest='metrics', help="create eval metric", type=str, default='mse') +parser.add_argument('--gpus', type=str, help='list of gpus to run, e.g. 0 or 0,2,5. 
empty means using cpu') +parser.add_argument('--kv-store', type=str, default='local', help='key-value store type') + +args = parser.parse_args() +# devices for training +ctx = mx.cpu() if args.gpus is None or args.gpus == "" else [mx.gpu(int(i)) for i in args.gpus.split(',')] + +logger = create_logger() +kv = mx.kvstore.create(args.kv_store) + +feature_dim, train_features, train_labels = read_year_prediction_data('YearPredictionMSD') +train_iter, mod = create_lin_reg_network(train_features, train_labels, feature_dim, args.batch_size, args.updateFreq, + ctx, logger) + +mod.fit(train_iter, eval_metric='mse', optimizer='sgd', + optimizer_params=(('learning_rate', 0.025), ), num_epoch=args.epochs, kvstore=kv) diff --git a/contrib/svrg_optimization_python/__init__.py b/python/mxnet/contrib/svrg_optimization/__init__.py similarity index 86% rename from contrib/svrg_optimization_python/__init__.py rename to python/mxnet/contrib/svrg_optimization/__init__.py index 4acf63ef7a13..6e70009983c9 100644 --- a/contrib/svrg_optimization_python/__init__.py +++ b/python/mxnet/contrib/svrg_optimization/__init__.py @@ -14,7 +14,9 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""SVRGModule, SVRGOptimization import. +""" -from __future__ import absolute_import -from .src.svrg_optimizer import SVRGOptimizer -from .src.svrg_module import SVRGModule + +from . import svrg_module +from . import svrg_optimizer diff --git a/contrib/svrg_optimization_python/src/svrg_module.py b/python/mxnet/contrib/svrg_optimization/svrg_module.py similarity index 87% rename from contrib/svrg_optimization_python/src/svrg_module.py rename to python/mxnet/contrib/svrg_optimization/svrg_module.py index e587da00eb25..f78c9f363964 100644 --- a/contrib/svrg_optimization_python/src/svrg_module.py +++ b/python/mxnet/contrib/svrg_optimization/svrg_module.py @@ -18,11 +18,11 @@ SVRG optimization logic. """ -import mxnet as mx import time import logging -from svrg_optimizer import SVRGOptimizer +import mxnet as mx from mxnet.module import Module +from .svrg_optimizer import SVRGOptimizer class SVRGModule(Module): @@ -62,7 +62,7 @@ class SVRGModule(Module): Examples -------- >>> # An example of declaring and using SVRGModule. 
- >>> mod = mod = SVRGModule(symbol=lro, data_names=['data'], label_names=['lin_reg_label'], update_freq=2) + >>> mod = SVRGModule(symbol=lro, data_names=['data'], label_names=['lin_reg_label'], update_freq=2) >>> mod.fit(di, eval_metric='mse', optimizer='sgd', optimizer_params=(('learning_rate', 0.025),), >>> num_epoch=num_epoch, kvstore='local') """ @@ -78,30 +78,35 @@ def __init__(self, symbol, data_names=('data',), label_names=('softmax_label',), # Type check update_frequency if isinstance(update_freq, int): + if update_freq <= 0: + raise ValueError("update_freq in SVRGModule must be a positive integer to represent the frequency for " + "calculating full gradients") self.update_freq = update_freq else: - raise TypeError("update_freq must be an integer") + raise TypeError("update_freq in SVRGModule must be a positive integer to represent the frequency for " + "calculating full gradients") self._mod_aux = mx.mod.Module(symbol, data_names, label_names, logger, context, work_load_list, fixed_param_names, state_names, group2ctxs, compression_params) - self._param_dict = [{} for ctx in self._context] + self._param_dict = None + self._ctx_len = len(self._context) def _reset_bind(self): """Internal function to reset binded state.""" super(SVRGModule, self)._reset_bind() self._mod_aux._reset_bind() - def reshape(self, data_shapes, label_shapes=None): super(SVRGModule, self).reshape(data_shapes, label_shapes=label_shapes) self._mod_aux.reshape(data_shapes, label_shapes=label_shapes) def init_optimizer(self, kvstore='local', optimizer='sgd', optimizer_params=(('learning_rate', 0.01),), force_init=False): - """Installs and initializes SVRG optimizers. The SVRGOptimizer is a wrapper for normal SGD optimizer - and special AssignmentOptimizer to accumulate gradients. If KVStore exists, additional keys will be - pushed to the kvstore for accumulating full grads. + """Installs and initializes SVRGOptimizer. The SVRGOptimizer is a wrapper class for a regular optimizer that is + passed in and a special AssignmentOptimizer to accumulate the full gradients. If KVStore is 'local' or None, + the full gradients will be accumulated locally without pushing to the KVStore. Otherwise, additional keys will + be pushed to accumulate the full gradients in the KVStore. Parameters ---------- @@ -117,15 +122,15 @@ def init_optimizer(self, kvstore='local', optimizer='sgd', optimizer in the case an optimizer is already installed. 
""" # Init dict for storing average of full gradients for each device - for i in range(len(self._context)): - self._param_dict[i] = {key: mx.nd.zeros(shape=value.shape, ctx=self._context[i]) - for key, value in self.get_params()[0].items()} + + self._param_dict = [{key: mx.nd.zeros(shape=value.shape, ctx=self._context[i]) + for key, value in self.get_params()[0].items()} for i in range(self._ctx_len)] svrg_optimizer = self._create_optimizer(SVRGOptimizer.__name__, default_opt=optimizer, kvstore=kvstore, optimizer_params=optimizer_params) super(SVRGModule, self).init_optimizer(kvstore=kvstore, optimizer=svrg_optimizer, - optimizer_params=optimizer_params, force_init=force_init) + optimizer_params=optimizer_params, force_init=force_init) # Init additional keys for accumulating full grads in KVStore if self._kvstore: @@ -153,7 +158,7 @@ def _create_optimizer(self, optimizer, default_opt, kvstore, optimizer_params): # code partially copied from mxnet module.init_optimizer() to accomodate svrg_optimizer batch_size = self._exec_group.batch_size - (kv_store, update_on_kvstore) = mx.model._create_kvstore(kvstore, len(self._context), self._arg_params) + (kv_store, update_on_kvstore) = mx.model._create_kvstore(kvstore, self._ctx_len, self._arg_params) if kv_store and 'dist' in kv_store.type and '_sync' in kv_store.type: batch_size *= kv_store.num_workers rescale_grad = 1.0 / batch_size @@ -162,8 +167,8 @@ def _create_optimizer(self, optimizer, default_opt, kvstore, optimizer_params): if update_on_kvstore: idx2name.update(enumerate(self._exec_group.param_names)) else: - for k in range(len(self._context)): - idx2name.update({i * len(self._context) + k: n + for k in range(self._ctx_len): + idx2name.update({i * self._ctx_len + k: n for i, n in enumerate(self._exec_group.param_names)}) # update idx2name to include new keys @@ -182,8 +187,7 @@ def _create_optimizer(self, optimizer, default_opt, kvstore, optimizer_params): return optimizer def bind(self, data_shapes, label_shapes=None, for_training=True, - inputs_need_grad=False, force_rebind=False, shared_module=None, - grad_req='write'): + inputs_need_grad=False, force_rebind=False, shared_module=None, grad_req='write'): """Binds the symbols to construct executors for both two modules. This is necessary before one can perform computation with the SVRGModule. 
@@ -259,7 +263,7 @@ def backward(self, out_grads=None): super(SVRGModule, self).backward(out_grads) if self._mod_aux.binded: - self._mod_aux.backward() + self._mod_aux.backward(out_grads) def update(self): """Updates parameters according to the installed optimizer and the gradients computed @@ -301,49 +305,45 @@ def update_full_grads(self, train_data): self._mod_aux.backward() nbatch += 1 - for i in range(len(self._context)): - for j in range(len(param_names)): - grads = self._mod_aux._exec_group.grad_arrays[j][i] - self._param_dict[i][param_names[j]] = mx.nd.broadcast_add(self._param_dict[i][param_names[j]], - grads, axis=0) + for ctx in range(self._ctx_len): + for index, name in enumerate(param_names): + grads = self._mod_aux._exec_group.grad_arrays[index][ctx] + self._param_dict[ctx][name] = mx.nd.broadcast_add(self._param_dict[ctx][name], grads, axis=0) padding = batch.pad # Average full gradients over number of batches, accumulate in the kvstore if kvstore is set - for i in range(len(self._context)): - for key in self._param_dict[i].keys(): - self._param_dict[i][key] /= (nbatch - padding / train_data.batch_size) + for i in range(self._ctx_len): + for name in param_names: + self._param_dict[i][name] /= (nbatch - padding / train_data.batch_size) if self._kvstore: # Push a list of gradients from each device in the KVStore - for key in self._param_dict[0].keys(): - grad_list = [] - for i in range(len(self._param_dict)): - grad_list.append(self._param_dict[i][key]) - - self._accumulate_kvstore(key, grad_list) + for name in param_names: + grad_list = list(self._param_dict[i][name] for i in range(self._ctx_len)) + self._accumulate_kvstore(name, grad_list) def _accumulate_kvstore(self, key, value): - """Accumulate gradients over all data in the KVstore. In distributed setting, each worker sees a portion of + """Accumulate gradients over all data in the KVStore. In distributed setting, each worker sees a portion of data. The full gradients will be aggregated from each worker in the KVStore. Parameters ---------- key: int or str - Key in the kvstore. + Key in the KVStore. value: NDArray, RowSparseNDArray Average of the full gradients. """ # Accumulate full gradients for current epochs self._kvstore.push(key + "_full", value) - self._kvstore._barrier() self._kvstore.pull(key + "_full", value) + self._allocate_gradients(key, value) def _allocate_gradients(self, key, value): - """Allocate averge of full gradients accumualted in the KVStore to each device. + """Allocate average of full gradients accumulated in the KVStore to each device. Parameters ---------- @@ -354,11 +354,11 @@ def _allocate_gradients(self, key, value): A list of average of the full gradients in the KVStore. """ - num_device = len(self._context) - for i in range(len(self._param_dict)): - self._param_dict[i][key] = value[i] / num_device + for i in range(self._ctx_len): + self._param_dict[i][key] = value[i] / self._ctx_len - def _svrg_grads_update_rule(self, g_curr_batch_curr_weight, g_curr_batch_special_weight, g_special_weight_all_batch): + def _svrg_grads_update_rule(self, g_curr_batch_curr_weight, g_curr_batch_special_weight, + g_special_weight_all_batch): """Calculates the gradient based on the SVRG update rule. 
Parameters ---------- @@ -374,22 +374,24 @@ def _svrg_grads_update_rule(self, g_curr_batch_curr_weight, g_curr_batch_special Gradients calculated using SVRG update rule: grads = g_curr_batch_curr_weight - g_curr_batch_special_weight + g_special_weight_all_batch """ - for i in range(len(g_curr_batch_curr_weight)): - g_curr_batch_curr_weight[i] -= g_curr_batch_special_weight[i] - g_curr_batch_curr_weight[i] += g_special_weight_all_batch[i] + + for index, grad in enumerate(g_curr_batch_curr_weight): + grad -= g_curr_batch_special_weight[index] + grad += g_special_weight_all_batch[index] return g_curr_batch_curr_weight def update_svrg_gradients(self): """Calculates gradients based on the SVRG update rule. """ param_names = self._exec_group.param_names - for i in range(len(self._context)): - for j in range(len(param_names)): - g_curr_batch_reg = self._exec_group.grad_arrays[j][i] - g_curr_batch_special = self._mod_aux._exec_group.grad_arrays[j][i] - g_special_weight_all_batch = self._param_dict[i][param_names[j]] - g_svrg = self._svrg_grads_update_rule(g_curr_batch_reg, g_curr_batch_special, g_special_weight_all_batch) - self._exec_group.grad_arrays[j][i] = g_svrg + for ctx in range(self._ctx_len): + for index, name in enumerate(param_names): + g_curr_batch_reg = self._exec_group.grad_arrays[index][ctx] + g_curr_batch_special = self._mod_aux._exec_group.grad_arrays[index][ctx] + g_special_weight_all_batch = self._param_dict[ctx][name] + g_svrg = self._svrg_grads_update_rule(g_curr_batch_reg, g_curr_batch_special, + g_special_weight_all_batch) + self._exec_group.grad_arrays[index][ctx] = g_svrg def fit(self, train_data, eval_data=None, eval_metric='acc', epoch_end_callback=None, batch_end_callback=None, kvstore='local', @@ -461,6 +463,7 @@ def fit(self, train_data, eval_data=None, eval_metric='acc', str -> NDArray. The resulting dict is used for pulling row_sparse parameters from the kvstore, where the str key is the name of the param, and the value is the row id of the param to pull. 
+ validation_metric: """ assert num_epoch is not None, 'please specify number of epochs' @@ -471,8 +474,7 @@ def fit(self, train_data, eval_data=None, eval_metric='acc', self.install_monitor(monitor) self.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params, allow_missing=allow_missing, force_init=force_init) - self.init_optimizer(kvstore=kvstore, optimizer=optimizer, - optimizer_params=optimizer_params) + self.init_optimizer(kvstore=kvstore, optimizer=optimizer, optimizer_params=optimizer_params) if validation_metric is None: validation_metric = eval_metric @@ -503,9 +505,7 @@ def fit(self, train_data, eval_data=None, eval_metric='acc', self.update() if isinstance(data_batch, list): - self.update_metric(eval_metric, - [db.label for db in data_batch], - pre_sliced=True) + self.update_metric(eval_metric, [db.label for db in data_batch], pre_sliced=True) else: self.update_metric(eval_metric, data_batch.label) @@ -533,7 +533,6 @@ def fit(self, train_data, eval_data=None, eval_metric='acc', self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val) toc = time.time() self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic)) - print('Epoch[%d] Time cost=%.3f', epoch, eval_metric.get()) # sync aux params across devices arg_params, aux_params = self.get_params() @@ -551,7 +550,6 @@ def fit(self, train_data, eval_data=None, eval_metric='acc', batch_end_callback=eval_batch_end_callback, epoch=epoch) for name, val in res: self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name, val) - print('Epoch[%d] Validation-%s=%f', epoch, name, val) def prepare(self, data_batch, sparse_row_id_fn=None): """Prepares two modules for processing a data batch. @@ -577,5 +575,5 @@ def prepare(self, data_batch, sparse_row_id_fn=None): parameters from the kvstore, where the str key is the name of the param, and the value is the row id of the param to pull. """ - super(SVRGModule, self).prepare(data_batch, sparse_row_id_fn=sparse_row_id_fn) - self._mod_aux.prepare(data_batch=sparse_row_id_fn) + super(SVRGModule, self).prepare(data_batch, sparse_row_id_fn) + self._mod_aux.prepare(data_batch, sparse_row_id_fn) diff --git a/contrib/svrg_optimization_python/src/svrg_optimizer.py b/python/mxnet/contrib/svrg_optimization/svrg_optimizer.py similarity index 63% rename from contrib/svrg_optimization_python/src/svrg_optimizer.py rename to python/mxnet/contrib/svrg_optimization/svrg_optimizer.py index bf9cca975cce..dd4b363c38d7 100644 --- a/contrib/svrg_optimization_python/src/svrg_optimizer.py +++ b/python/mxnet/contrib/svrg_optimization/svrg_optimizer.py @@ -23,46 +23,82 @@ @mx.optimizer.register class AssignmentOptimizer(mx.optimizer.Optimizer): + """AssignmentOptimizer assigns gradients to weights for SVRGModule's full gradients + accumulation in the KVStore + """ def update(self, index, weight, grad, state): + """Assign the gradients to weight for accumulating full gradients in the KVStore across all devices and workers. + + Parameters + ---------- + index : int + The unique index of the parameter into the individual learning + rates and weight decays. Learning rates and weight decay + may be set via `set_lr_mult()` and `set_wd_mult()`, respectively. + weight : NDArray + The parameter to be updated. + grad : NDArray + The gradient of the objective with respect to this parameter. + state: any obj + AssignmentOptimizer will not need to be associated with state. 
+ """ + weight[:] = grad + @mx.optimizer.register class SVRGOptimizer(mx.optimizer.Optimizer): - """SVRGOptimizer is a wrapper class for two optimizers: one for accumulating full gradients and the other - one is the passed-in optimizer. + """SVRGOptimizer is a wrapper class for two optimizers: AssignmentOptimizer for accumulating full gradients in the + KVStore and a default optimizer that is passed in as a parameter in `mod.init_optimizer()` + + This optimizer accepts the following parameters in addition to those accepted by :class:`.Optimizer`. Parameters ---------- - default_optimizer: optimizer passed-in when invoke on mx.mod.init_optimizer + default_optimizer: str or Optimizer + Optimizer passed-in when invoke on mx.mod.init_optimizer in SVRGModule """ - def __init__(self, default_optimizer, **kwargs): + def __init__(self, default_optimizer, **kwargs): # Reconstruct kwargs to identify additional params for default optimizer - extra_param, default_param = self._check_params(**kwargs) - super(SVRGOptimizer, self).__init__(**default_param) + base_param = self._check_params(**kwargs) + super(SVRGOptimizer, self).__init__(**base_param) if isinstance(default_optimizer, str): self.default_opt = mx.optimizer.create(default_optimizer, **kwargs) else: self.default_opt = default_optimizer self.aux_opt = mx.optimizer.create(AssignmentOptimizer.__name__) + @staticmethod + def _check_params(**kwargs): + """ Reassemble kwargs to identify additional optimizer params for default optimizers. base_params contains + all the param names in base class Optimizer. + + Parameters + ---------- + kwargs: dict + Parameters for the default optimizer + + Returns + ---------- + default_params: dict + Optimizer parameters that are defined in base class Optimizer + """ - def _check_params(self, **kwargs): optimizer_param = dict(kwargs) base_params = ['rescale_grad', 'param_idx2name', 'wd', 'clip_gradient', 'learning_rate', 'lr_scheduler', 'sym', 'begin_num_update', 'multi_precision', 'param_dict'] - extra_param = {} + default_params = {} - for key in optimizer_param.keys(): + for key, _ in optimizer_param.items(): if key in base_params: default_params[key] = optimizer_param[key] - else: - extra_param[key] = optimizer_param[key] - return extra_param, default_params + + return default_params def update(self, index, weight, grad, state): """Updates the given parameter using the corresponding gradient and state. If key contains 'full', update with - lr = -1 otherwise will use default optimizer. + AssignmentOptimizer otherwise will use default optimizer. Parameters ---------- @@ -80,7 +116,7 @@ def update(self, index, weight, grad, state): name = self._check_index(index) - if "full".lower() in name: + if "full" in name: self.aux_opt.update(index, weight, grad, state) else: # use the default optimizer @@ -106,26 +142,28 @@ def create_state(self, index, weight): """ name = self._check_index(index) - if "full".lower() in name: - return + if "full" in name: + return self.aux_opt.create_state(index, weight) else: - # use the default optimizer + # return self.default_opt.create_state(index, weight) def _check_index(self, index): """Check index in idx2name to get corresponding param_name Parameters ---------- - index : int + index : int or str An unique index to identify the weight. 
Returns ------- name : str - Name of the Module parameter + Name of the Module parameter """ if index in self.idx2name.values(): + # index is a str name = index else: + # index is an int name = self.idx2name[index] return name diff --git a/tests/python/unittest/legacy_ndarray.v0 b/tests/python/unittest/legacy_ndarray.v0 deleted file mode 100644 index f4306d837202..000000000000 Binary files a/tests/python/unittest/legacy_ndarray.v0 and /dev/null differ diff --git a/tests/python/unittest/save_000800.json b/tests/python/unittest/save_000800.json deleted file mode 100644 index 7b385e2983d8..000000000000 --- a/tests/python/unittest/save_000800.json +++ /dev/null @@ -1,188 +0,0 @@ -{ - "nodes": [ - { - "op": "null", - "param": {}, - "name": "data", - "inputs": [], - "backward_source_id": -1, - "attr": { - "ctx_group": "stage1", - "lr_mult": "0.2" - } - }, - { - "op": "null", - "param": {}, - "name": "fc1_weight", - "inputs": [], - "backward_source_id": -1, - "attr": { - "ctx_group": "stage1", - "wd_mult": "0.3", - "weight_lr_mult": "1.2" - } - }, - { - "op": "null", - "param": {}, - "name": "fc1_bias", - "inputs": [], - "backward_source_id": -1, - "attr": { - "ctx_group": "stage1", - "wd_mult": "0.3", - "weight_lr_mult": "1.2" - } - }, - { - "op": "FullyConnected", - "param": { - "no_bias": "False", - "num_hidden": "128" - }, - "name": "fc1", - "inputs": [[0, 0], [1, 0], [2, 0]], - "backward_source_id": -1, - "attr": { - "ctx_group": "stage1", - "wd_mult": "0.3", - "weight_lr_mult": "1.2" - } - }, - { - "op": "Activation", - "param": {"act_type": "relu"}, - "name": "relu1", - "inputs": [[3, 0]], - "backward_source_id": -1, - "attr": {"ctx_group": "stage1"} - }, - { - "op": "null", - "param": {}, - "name": "fc2_weight", - "inputs": [], - "backward_source_id": -1, - "attr": { - "ctx_group": "stage2", - "lr_mult": "0.01" - } - }, - { - "op": "null", - "param": {}, - "name": "fc2_bias", - "inputs": [], - "backward_source_id": -1, - "attr": { - "ctx_group": "stage2", - "lr_mult": "0.01" - } - }, - { - "op": "FullyConnected", - "param": { - "no_bias": "False", - "num_hidden": "64" - }, - "name": "fc2", - "inputs": [[4, 0], [5, 0], [6, 0]], - "backward_source_id": -1, - "attr": { - "ctx_group": "stage2", - "lr_mult": "0.01" - } - }, - { - "op": "Activation", - "param": {"act_type": "relu"}, - "name": "relu2", - "inputs": [[7, 0]], - "backward_source_id": -1, - "attr": {"ctx_group": "stage2"} - }, - { - "op": "null", - "param": {}, - "name": "fc3_weight", - "inputs": [], - "backward_source_id": -1, - "attr": {"ctx_group": "stage2"} - }, - { - "op": "null", - "param": {}, - "name": "fc3_bias", - "inputs": [], - "backward_source_id": -1, - "attr": {"ctx_group": "stage2"} - }, - { - "op": "FullyConnected", - "param": { - "no_bias": "False", - "num_hidden": "10" - }, - "name": "fc3", - "inputs": [[8, 0], [9, 0], [10, 0]], - "backward_source_id": -1, - "attr": {"ctx_group": "stage2"} - }, - { - "op": "null", - "param": {}, - "name": "batchnorm0_gamma", - "inputs": [], - "backward_source_id": -1, - "attr": {"ctx_group": "stage2"} - }, - { - "op": "null", - "param": {}, - "name": "batchnorm0_beta", - "inputs": [], - "backward_source_id": -1, - "attr": {"ctx_group": "stage2"} - }, - { - "op": "BatchNorm", - "param": { - "eps": "0.001", - "fix_gamma": "True", - "momentum": "0.9", - "use_global_stats": "False" - }, - "name": "batchnorm0", - "inputs": [[11, 0], [12, 0], [13, 0]], - "backward_source_id": -1, - "attr": {"ctx_group": "stage2"} - }, - { - "op": "null", - "param": {}, - "name": "softmax_label", - 
"inputs": [], - "backward_source_id": -1, - "attr": {"ctx_group": "stage2"} - }, - { - "op": "SoftmaxOutput", - "param": { - "grad_scale": "1", - "ignore_label": "-1", - "multi_output": "False", - "normalization": "null", - "out_grad": "False", - "preserve_shape": "False", - "use_ignore": "False" - }, - "name": "softmax", - "inputs": [[14, 0], [15, 0]], - "backward_source_id": -1, - "attr": {"ctx_group": "stage2"} - } - ], - "arg_nodes": [0, 1, 2, 5, 6, 9, 10, 12, 13, 15], - "heads": [[16, 0]] -} \ No newline at end of file diff --git a/tests/python/unittest/test_contrib_svrg_module.py b/tests/python/unittest/test_contrib_svrg_module.py new file mode 100644 index 000000000000..70c718824aee --- /dev/null +++ b/tests/python/unittest/test_contrib_svrg_module.py @@ -0,0 +1,158 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx +import numpy as np +from common import with_seed, assertRaises +from mxnet.contrib.svrg_optimization.svrg_module import SVRGModule + + +def setup(): + train_data = np.random.randint(1, 5, [1000, 2]) + weights = np.array([1.0, 2.0]) + train_label = train_data.dot(weights) + + di = mx.io.NDArrayIter(train_data, train_label, batch_size=32, shuffle=True, label_name='lin_reg_label') + X = mx.sym.Variable('data') + Y = mx.symbol.Variable('lin_reg_label') + fully_connected_layer = mx.sym.FullyConnected(data=X, name='fc1', num_hidden=1) + lro = mx.sym.LinearRegressionOutput(data=fully_connected_layer, label=Y, name="lro") + + mod = SVRGModule( + symbol=lro, + data_names=['data'], + label_names=['lin_reg_label'], update_freq=2) + mod.bind(data_shapes=di.provide_data, label_shapes=di.provide_label) + mod.init_params(initializer=mx.init.Uniform(0.01), allow_missing=False, force_init=False, allow_extra=False) + + return mod + + +def test_bind_module(): + mod = setup() + assert mod.binded == True + assert mod._mod_aux.binded == True + + +def test_module_init(): + mod = setup() + assert mod._mod_aux != None + + +def test_module_initializer(): + def regression_model(m): + x = mx.symbol.var("data", stype='csr') + v = mx.symbol.var("v", shape=(m, 1), init=mx.init.Uniform(scale=.1), + stype='row_sparse') + model = mx.symbol.dot(lhs=x, rhs=v) + y = mx.symbol.Variable("label") + model = mx.symbol.LinearRegressionOutput(data=model, label=y, name="out") + return model + + #shape of the data + n, m = 128, 100 + model = regression_model(m) + + data = mx.nd.zeros(shape=(n, m), stype='csr') + label = mx.nd.zeros((n, 1)) + iterator = mx.io.NDArrayIter(data=data, label={'label': label}, + batch_size=n, last_batch_handle='discard') + + # create module + mod = SVRGModule(symbol=model, data_names=['data'], label_names=['label'], update_freq=2) + mod.bind(data_shapes=iterator.provide_data, label_shapes=iterator.provide_label) + mod.init_params() + 
v = mod._arg_params['v'] + assert v.stype == 'row_sparse' + assert np.sum(v.asnumpy()) != 0 + + +def test_module_bind(): + x = mx.sym.Variable("data") + net = mx.sym.FullyConnected(x, num_hidden=1) + + mod = SVRGModule(symbol=net, data_names=['data'], label_names=None, update_freq=2) + assertRaises(TypeError, mod.bind, data_shapes=['data', mx.nd.zeros(shape=(2, 1))]) + + mod.bind(data_shapes=[('data', (2, 1))]) + assert mod.binded == True + assert mod._mod_aux.binded == True + +@with_seed() +def test_module_save_load(): + x = mx.sym.Variable("data") + y = mx.sym.Variable("softmax_label") + net = mx.sym.FullyConnected(x, y, num_hidden=1) + + mod = SVRGModule(symbol=net, data_names=['data'], label_names=['softmax_label'], update_freq=2) + mod.bind(data_shapes=[('data', (1, 1))]) + mod.init_params() + mod.init_optimizer(optimizer='sgd', optimizer_params={'learning_rate': 0.1}) + mod.update() + mod.save_checkpoint('test', 0, save_optimizer_states=True) + + mod2 = SVRGModule.load('test', 0, load_optimizer_states=True, data_names=('data', )) + mod2.bind(data_shapes=[('data', (1, 1))]) + mod2.init_optimizer(optimizer_params={'learning_rate': 0.1}) + assert mod._symbol.tojson() == mod2._symbol.tojson() + + + # Multi-device + mod3 = SVRGModule(symbol=net, data_names=['data'], label_names=['softmax_label'], update_freq=3, + context=[mx.cpu(0), mx.cpu(1)]) + mod3.bind(data_shapes=[('data', (10, 10))]) + mod3.init_params() + mod3.init_optimizer(optimizer_params={'learning_rate': 1.0}) + mod3.update() + mod3.save_checkpoint('test', 0, save_optimizer_states=True) + + mod4 = SVRGModule.load('test', 0, load_optimizer_states=True, data_names=('data', )) + mod4.bind(data_shapes=[('data', (10, 10))]) + mod4.init_optimizer(optimizer_params={'learning_rate': 1.0}) + assert mod3._symbol.tojson() == mod4._symbol.tojson() + +@with_seed() +def test_svrgmodule_reshape(): + data = mx.sym.Variable("data") + sym = mx.sym.FullyConnected(data=data, num_hidden=4, name='fc') + + dshape=(3, 4) + mod = SVRGModule(sym, data_names=["data"], label_names=None, context=[mx.cpu(0), mx.cpu(1)], update_freq=1) + mod.bind(data_shapes=[('data', dshape)]) + mod.init_params() + mod._mod_aux.init_params() + mod.init_optimizer(optimizer_params={"learning_rate": 1.0}) + + data_batch = mx.io.DataBatch(data=[mx.nd.ones(dshape)], label=None) + mod.forward(data_batch) + mod.backward([mx.nd.ones(dshape)]) + mod.update() + assert mod.get_outputs()[0].shape == dshape + + + dshape = (2, 4) + mod.reshape(data_shapes=[('data', dshape)]) + mod.forward(mx.io.DataBatch(data=[mx.nd.ones(dshape)], + label=None)) + mod.backward([mx.nd.ones(dshape)]) + mod.update() + assert mod.get_outputs()[0].shape == dshape + + +if __name__ == "__main__": + import nose + nose.runmodule() diff --git a/tests/python/unittest/test_contrib_svrg_optimizer.py b/tests/python/unittest/test_contrib_svrg_optimizer.py new file mode 100644 index 000000000000..fc89b2933a29 --- /dev/null +++ b/tests/python/unittest/test_contrib_svrg_optimizer.py @@ -0,0 +1,101 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import numpy as np +import mxnet as mx +from mxnet.test_utils import same +from mxnet.contrib.svrg_optimization.svrg_module import SVRGModule +from mxnet.contrib.svrg_optimization.svrg_optimizer import SVRGOptimizer + + +def create_network(): + + train_data = np.random.randint(1, 5, [1000, 2]) + weights = np.array([1.0, 2.0]) + train_label = train_data.dot(weights) + + batch_size = 32 + + di = mx.io.NDArrayIter(train_data, train_label, batch_size=batch_size, shuffle=True, label_name='lin_reg_label') + X = mx.sym.Variable('data') + Y = mx.symbol.Variable('lin_reg_label') + fully_connected_layer = mx.sym.FullyConnected(data=X, name='fc1', num_hidden=1) + lro = mx.sym.LinearRegressionOutput(data=fully_connected_layer, label=Y, name="lro") + + mod = SVRGModule( + symbol=lro, + data_names=['data'], + label_names=['lin_reg_label'], update_freq=2 + ) + + mod.bind(data_shapes=di.provide_data, label_shapes=di.provide_label) + mod.init_params(initializer=mx.init.Uniform(0.01), allow_missing=False, + force_init=False, allow_extra=False) + + return di, mod + + +def test_init_svrg_optimizer(): + _, mod = create_network() + + kv = mx.kv.create('local') + mod.init_optimizer(kvstore=kv, optimizer='sgd', optimizer_params=(('learning_rate', 0.01),), + force_init=False) + + assert type(mod._optimizer).__name__ == SVRGOptimizer.__name__ + + +def test_svrg_optimizer_constructor(): + kv = mx.kv.create('local') + svrg_optimizer = SVRGOptimizer(default_optimizer='sgd', learning_rate=-1.0) + kv.set_optimizer(svrg_optimizer) + + assert svrg_optimizer.default_opt.lr == -1.0 + + +def test_kvstore_init_aux_keys(): + param_idx2name = {0: "weight", 1: "weight_full"} + + svrg_optimizer = SVRGOptimizer(default_optimizer='sgd', param_idx2name= param_idx2name, learning_rate=1.0) + kv = mx.kv.create('local') + kv.set_optimizer(svrg_optimizer) + + # Use default sgd optimizer + param_weight_init = mx.nd.array([0, 0, 0]) + param_weight_update = mx.nd.array([1, 1, 1]) + + kv.init(0, param_weight_init) + kv.push(0, param_weight_update) + kv.pull(0, param_weight_init) + + param_weight_full_init = mx.nd.array([1, 1, 1]) + param_weight_full_update = mx.nd.array([2, 2, 2]) + + # Use AssignmentOptimizer + kv.init(1, param_weight_full_init) + kv.push(1, param_weight_full_update) + kv.pull(1, param_weight_full_init) + + # updated weights using default sgd optimizer + same(param_weight_init.asnumpy(), np.array([-1, -1, -1])) + # updated with AssignmentOptimizer + same(param_weight_full_init.asnumpy(), np.array([2, 2, 2])) + + +if __name__ == "__main__": + import nose + nose.runmodule() \ No newline at end of file diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py index 39fcd81642d3..ef9beebe4b80 100644 --- a/tests/python/unittest/test_module.py +++ b/tests/python/unittest/test_module.py @@ -168,6 +168,7 @@ def test_module_layout(): assert mod.get_outputs()[0].shape == dshape hdshape = (3, 4, 7) + for x in mod.get_outputs(merge_multi_context=False)[0]: assert x.shape == hdshape
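
Returning to `test_kvstore_init_aux_keys` above: the expected values follow directly from the two update rules, given the `learning_rate=1.0` and zero weight-decay defaults that test sets up. A plain-NumPy restatement of the arithmetic, independent of MXNet and for illustration only:

```python
import numpy as np

lr = 1.0
# key 0 ("weight") is routed to the default sgd update: w <- w - lr * grad
w = np.array([0., 0., 0.])
grad = np.array([1., 1., 1.])
w -= lr * grad
print(w)        # [-1. -1. -1.]

# key 1 ("weight_full") is routed to AssignmentOptimizer: w[:] = grad
w_full = np.array([1., 1., 1.])
full_grad = np.array([2., 2., 2.])
w_full[:] = full_grad
print(w_full)   # [2. 2. 2.]
```

Key 1 takes the assignment path only because its name, "weight_full", contains the substring "full"; key 0 falls through to the wrapped SGD optimizer.
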