From 9ecc67325d16d6235357b03d67e2b9ceb98ba131 Mon Sep 17 00:00:00 2001
From: Stephanie Jingyi Yuan
Date: Tue, 28 Aug 2018 21:47:25 -0400
Subject: [PATCH] SVRG optimization in python/contrib package; this version
 supports a single machine with a single CPU, a single GPU, or multiple GPUs

---
 .../svrg_optimization_python/src/__init__.py | 0
 .../tests/__init__.py | 21 ----
 .../tests/test_svrg_module.py | 116 ------------------
 .../tests/test_svrg_optimizer.py | 96 ---------------
 .../svrg_module}/benchmarks/benchmark1.png | Bin
 .../svrg_module}/benchmarks/benchmark2.png | Bin
 example/svrg_module/common.py | 78 ++++++++++++
 example/svrg_module/data_reader.py | 44 +++++++
 .../svrg_module/example_api_train.py | 10 +-
 .../svrg_module/example_inference.py | 9 +-
 example/svrg_module/train.py | 44 +++++++
 .../contrib/svrg_optimization}/README.md | 2 +-
 .../contrib/svrg_optimization}/__init__.py | 5 +-
 .../contrib/svrg_optimization}/svrg_module.py | 22 ++--
 .../svrg_optimization}/svrg_optimizer.py | 1 +
 .../unittest/test_contrib_svrg_module.py | 86 +++++++++++++
 .../unittest/test_contrib_svrg_optimizer.py | 101 +++++++++++++++
 17 files changed, 379 insertions(+), 256 deletions(-)
 delete mode 100644 contrib/svrg_optimization_python/src/__init__.py
 delete mode 100644 contrib/svrg_optimization_python/tests/__init__.py
 delete mode 100644 contrib/svrg_optimization_python/tests/test_svrg_module.py
 delete mode 100644 contrib/svrg_optimization_python/tests/test_svrg_optimizer.py
 rename {contrib/svrg_optimization_python => example/svrg_module}/benchmarks/benchmark1.png (100%)
 rename {contrib/svrg_optimization_python => example/svrg_module}/benchmarks/benchmark2.png (100%)
 create mode 100644 example/svrg_module/common.py
 create mode 100644 example/svrg_module/data_reader.py
 rename contrib/svrg_optimization_python/test_svrg_train.py => example/svrg_module/example_api_train.py (90%)
 rename contrib/svrg_optimization_python/test_svrg_inference.py => example/svrg_module/example_inference.py (92%)
 create mode 100644 example/svrg_module/train.py
 rename {contrib/svrg_optimization_python => python/mxnet/contrib/svrg_optimization}/README.md (98%)
 rename {contrib/svrg_optimization_python => python/mxnet/contrib/svrg_optimization}/__init__.py (86%)
 rename {contrib/svrg_optimization_python/src => python/mxnet/contrib/svrg_optimization}/svrg_module.py (97%)
 rename {contrib/svrg_optimization_python/src => python/mxnet/contrib/svrg_optimization}/svrg_optimizer.py (99%)
 create mode 100644 tests/python/unittest/test_contrib_svrg_module.py
 create mode 100644 tests/python/unittest/test_contrib_svrg_optimizer.py

diff --git a/contrib/svrg_optimization_python/src/__init__.py b/contrib/svrg_optimization_python/src/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/contrib/svrg_optimization_python/tests/__init__.py b/contrib/svrg_optimization_python/tests/__init__.py
deleted file mode 100644
index b7a3e645e0d5..000000000000
--- a/contrib/svrg_optimization_python/tests/__init__.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from __future__ import absolute_import
-from ..src.svrg_module import SVRGModule
-from ..src.svrg_optimizer import SVRGOptimizer
-
diff --git a/contrib/svrg_optimization_python/tests/test_svrg_module.py b/contrib/svrg_optimization_python/tests/test_svrg_module.py
deleted file mode 100644
index 5118ae1656fb..000000000000
--- a/contrib/svrg_optimization_python/tests/test_svrg_module.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import unittest
-from ..src.svrg_module import SVRGModule
-import mxnet as mx
-import numpy as np
-
-
-class TestSVRGModule(unittest.TestCase):
-    def setUp(self):
-        mx.random.seed(42)
-        train_data = np.random.randint(1, 5, [1000, 2])
-        weights = np.array([1.0, 2.0])
-        train_label = train_data.dot(weights)
-
-        self.di = mx.io.NDArrayIter(train_data, train_label, batch_size=32, shuffle=True, label_name='lin_reg_label')
-        X = mx.sym.Variable('data')
-        Y = mx.symbol.Variable('lin_reg_label')
-        fully_connected_layer = mx.sym.FullyConnected(data=X, name='fc1', num_hidden=1)
-        lro = mx.sym.LinearRegressionOutput(data=fully_connected_layer, label=Y, name="lro")
-
-        self.mod = SVRGModule(
-            symbol=lro,
-            data_names=['data'],
-            label_names=['lin_reg_label'], update_freq=2)
-        self.mod.bind(data_shapes=self.di.provide_data, label_shapes=self.di.provide_label)
-        self.mod.init_params(initializer=mx.init.Uniform(0.01), allow_missing=False,
-                             force_init=False, allow_extra=False)
-
-    def test_create_module(self):
-        self.assertTrue(self.mod._mod_aux is not None)
-
-    def test_bind_module(self):
-        self.assertTrue(self.mod.binded)
-        self.assertTrue(self.mod._mod_aux.binded)
-
-    def test_module_initializer(self):
-        def regression_model(m):
-            x = mx.symbol.var("data", stype='csr')
-            v = mx.symbol.var("v", shape=(m, 1), init=mx.init.Uniform(scale=.1),
-                              stype='row_sparse')
-            model = mx.symbol.dot(lhs=x, rhs=v)
-            y = mx.symbol.Variable("label")
-            model = mx.symbol.LinearRegressionOutput(data=model, label=y, name="out")
-            return model
-
-        n, m = 128, 100
-        model = regression_model(m)
-
-        data = mx.nd.zeros(shape=(n, m), stype='csr')
-        label = mx.nd.zeros((n, 1))
-        iterator = mx.io.NDArrayIter(data=data, label={'label': label},
-                                     batch_size=n, last_batch_handle='discard')
-
-        # create module
-        mod = SVRGModule(symbol=model, data_names=['data'], label_names=['label'], update_freq=2)
-        mod.bind(data_shapes=iterator.provide_data, label_shapes=iterator.provide_label)
-        mod.init_params()
-        v = mod._arg_params['v']
-        self.assertEqual(v.stype, 'row_sparse')
-        self.assertTrue(np.sum(v.asnumpy()) != 0)
-
-    @unittest.skip("SVRGModule with Pure SGD will not be a release feature")
-    def test_svrg_calculations(self):
-        def calc_svrg_optimization(update_freq):
-            mx.random.seed(42)
-            train_data = np.random.randint(1, 5, [1000, 2])
-            weights = np.array([1.0, 2.0])
-            train_label = train_data.dot(weights)
-
-            di = mx.io.NDArrayIter(train_data, train_label, batch_size=32, shuffle=True, label_name='lin_reg_label')
-            X = mx.sym.Variable('data')
-            Y = mx.symbol.Variable('lin_reg_label')
-            fully_connected_layer = mx.sym.FullyConnected(data=X, name='fc1', num_hidden=1)
-            lro = mx.sym.LinearRegressionOutput(data=fully_connected_layer, label=Y, name="lro")
-
-            mod = SVRGModule(
-                symbol=lro,
-                data_names=['data'],
-                label_names=['lin_reg_label'], update_freq=update_freq)
-            mod.bind(data_shapes=self.di.provide_data, label_shapes=self.di.provide_label)
-            mod.init_params(initializer=mx.init.Uniform(0.01), allow_missing=False, force_init=False, allow_extra=False)
-            mod.init_optimizer(optimizer='sgd', optimizer_params=(('learning_rate', 0.01),))
-            num_epoch = 100
-
-            metrics = mx.metric.create("mse")
-            for e in range(1, num_epoch + 1):
-                if e % (mod.update_freq + 1) == 0:
-                    mod.update_full_grads(di)
-                di.reset()
-                metrics.reset()
-                for batch in di:
-                    mod.forward_backward(data_batch=batch)
-                    mod.update()
-                    mod.update_metric(metrics, batch.label)
-            return metrics.get()[1]
-
-        svrg_mse = calc_svrg_optimization(update_freq=2)
-        sgd_mse = calc_svrg_optimization(update_freq=101)
-
-        self.assertTrue(svrg_mse - sgd_mse < 0)
diff --git a/contrib/svrg_optimization_python/tests/test_svrg_optimizer.py b/contrib/svrg_optimization_python/tests/test_svrg_optimizer.py
deleted file mode 100644
index 36d44f64b758..000000000000
--- a/contrib/svrg_optimization_python/tests/test_svrg_optimizer.py
+++ /dev/null
@@ -1,96 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-import unittest
-from ..src.svrg_optimizer import SVRGOptimizer
-from ..src.svrg_module import SVRGModule
-import mxnet as mx
-import numpy as np
-from numpy.testing import assert_array_equal
-
-
-class TestSVRGOPtimizer(unittest.TestCase):
-    @staticmethod
-    def create_network():
-        mx.random.seed(42)
-        train_data = np.random.randint(1, 5, [1000, 2])
-        weights = np.array([1.0, 2.0])
-        train_label = train_data.dot(weights)
-
-        batch_size = 32
-
-        di = mx.io.NDArrayIter(train_data, train_label, batch_size=batch_size, shuffle=True, label_name='lin_reg_label')
-        X = mx.sym.Variable('data')
-        Y = mx.symbol.Variable('lin_reg_label')
-        fully_connected_layer = mx.sym.FullyConnected(data=X, name='fc1', num_hidden=1)
-        lro = mx.sym.LinearRegressionOutput(data=fully_connected_layer, label=Y, name="lro")
-
-        mod = SVRGModule(
-            symbol=lro,
-            data_names=['data'],
-            label_names=['lin_reg_label'], update_freq=2
-        )
-
-        mod.bind(data_shapes=di.provide_data, label_shapes=di.provide_label)
-        mod.init_params(initializer=mx.init.Uniform(0.01), allow_missing=False,
-                        force_init=False, allow_extra=False)
-
-        return di, mod
-
-    def test_init_svrg_optimizer(self):
-        di, mod = self.create_network()
-
-        kv = mx.kv.create('local')
-        mod.init_optimizer(kvstore=kv, optimizer='sgd', optimizer_params=(('learning_rate', 0.01),),
-                           force_init=False)
-
-        self.assertEqual(type(mod._optimizer).__name__, SVRGOptimizer.__name__)
-
-    def test_svrg_optimizer_constructor(self):
-        _, mod = self.create_network()
-
-        kv = mx.kv.create('local')
-        svrg_optimizer = SVRGOptimizer(default_optimizer='sgd', learning_rate=1.0)
-        kv.set_optimizer(svrg_optimizer)
-
-        self.assertEqual(svrg_optimizer.default_opt.lr, 1.0)
-
-    def test_kvstore_init_aux_keys(self):
-        param_idx2name= {0: "weight", 1: "weight_full"}
-
-        svrg_optimizer = SVRGOptimizer(default_optimizer='sgd', param_idx2name= param_idx2name, learning_rate=1.0)
-        kv = mx.kv.create('local')
-        kv.set_optimizer(svrg_optimizer)
-
-        param_weight_init = mx.nd.array([0, 0, 0])
-        param_weight_update = mx.nd.array([1, 1, 1])
-
-        kv.init(0, param_weight_init)
-        kv.push(0, param_weight_update)
-        kv.pull(0, param_weight_init)
-
-        param_weight_full_init = mx.nd.array([1, 1, 1])
-        param_weight_full_update = mx.nd.array([2, 2, 2])
-
-        # Use AssignmentOptimizer
-        kv.init(1, param_weight_full_init)
-        kv.push(1, param_weight_full_update)
-        kv.pull(1, param_weight_full_init)
-
-        assert_array_equal(param_weight_init.asnumpy(), np.array([-1, -1, -1]))
-        assert_array_equal(param_weight_full_init.asnumpy(), np.array([2, 2, 2]))
diff --git a/contrib/svrg_optimization_python/benchmarks/benchmark1.png b/example/svrg_module/benchmarks/benchmark1.png
similarity index 100%
rename from contrib/svrg_optimization_python/benchmarks/benchmark1.png
rename to example/svrg_module/benchmarks/benchmark1.png
diff --git a/contrib/svrg_optimization_python/benchmarks/benchmark2.png b/example/svrg_module/benchmarks/benchmark2.png
similarity index 100%
rename from contrib/svrg_optimization_python/benchmarks/benchmark2.png
rename to example/svrg_module/benchmarks/benchmark2.png
diff --git a/example/svrg_module/common.py b/example/svrg_module/common.py
new file mode 100644
index 000000000000..ac630fe6a684
--- /dev/null
+++ b/example/svrg_module/common.py
@@ -0,0 +1,78 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.
+# The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+import mxnet as mx
+import logging
+from mxnet.contrib.svrg_optimization.svrg_module import SVRGModule
+
+
+def create_lin_reg_network(train_features, train_labels, feature_dim, batch_size, update_freq, ctx, logger):
+    # fit a linear regression model with mxnet SVRG
+    print("Fitting linear regression with mxnet")
+    train_iter = mx.io.NDArrayIter(train_features, train_labels, batch_size=batch_size, shuffle=True,
+                                   data_name='data', label_name='label')
+    data = mx.sym.Variable("data")
+    label = mx.sym.Variable("label")
+    weight = mx.sym.Variable("fc_weight", shape=(1, feature_dim))
+    net = mx.sym.dot(data, weight.transpose())
+    bias = mx.sym.Variable("fc_bias", shape=(1,), wd_mult=0.0, lr_mult=10.0)
+    net = mx.sym.broadcast_plus(net, bias)
+    net = mx.sym.LinearRegressionOutput(data=net, label=label)
+
+    mod = SVRGModule(symbol=net, context=ctx, data_names=['data'], label_names=['label'], logger=logger,
+                     update_freq=update_freq)
+    return train_iter, mod
+
+
+def create_metrics(metrics):
+    metric = mx.metric.create(metrics)
+    return metric
+
+
+def create_logger():
+    logger = logging.getLogger('sgd_svrg')
+    logger.setLevel(logging.INFO)
+    formatter = logging.Formatter('%(asctime)s - %(message)s')
+    fh = logging.FileHandler('experiments_lr.log')
+    fh.setFormatter(formatter)
+    logger.addHandler(fh)
+    return logger
+
+
+def accumulate_grad(grad_dict, mod):
+    param_names = mod._exec_group.param_names
+    for i in range(len(param_names)):
+        if param_names[i] not in grad_dict:
+            grad_dict[param_names[i]] = mod._exec_group.grad_arrays[i][0].copy()
+        else:
+            grad_dict[param_names[i]] = mx.ndarray.concat(grad_dict[param_names[i]], mod._exec_group.grad_arrays[i][0],
+                                                          dim=0)
+
+
+def calc_expectation(grad_dict, count):
+    # iterate over a snapshot of the keys: new "_expectation" entries are
+    # inserted inside the loop, which would otherwise raise a RuntimeError
+    # (dictionary changed size during iteration) on Python 3
+    for key in list(grad_dict.keys()):
+        grad_dict[key + "_expectation"] = mx.ndarray.sum(grad_dict[key], axis=0) / count
+
+    return grad_dict
+
+
+def calc_variance(grad_dict, count, param_names):
+    for i in range(len(param_names)):
+        diff_sqr = mx.ndarray.square(mx.nd.subtract(grad_dict[param_names[i]],
+                                                    grad_dict[param_names[i] + "_expectation"]))
+        grad_dict[param_names[i] + "_variance"] = mx.ndarray.sum(diff_sqr, axis=0) / count
diff --git a/example/svrg_module/data_reader.py b/example/svrg_module/data_reader.py
new file mode 100644
index 000000000000..c4edca9b10ad
--- /dev/null
+++ b/example/svrg_module/data_reader.py
@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+import numpy as np
+
+
+def read_year_prediction_data(file_name):
+    # Download the data file first, e.g.:
+    # from subprocess import call
+    # call(['wget', 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/YearPredictionMSD.bz2'])
+    # call(['bzip2', '-d', 'YearPredictionMSD.bz2'])
+
+    from sklearn.datasets import load_svmlight_file
+
+    feature_dim = 90
+    print("Reading data from disk...")
+    train_features, train_labels = load_svmlight_file(file_name, n_features=feature_dim, dtype=np.float32)
+    train_features = train_features.todense()
+
+    # normalize the data: subtract the mean and divide by the standard deviation
+    label_mean = train_labels.mean()
+    label_std = np.sqrt(np.square(train_labels - label_mean).mean())
+    feature_means = train_features.mean(axis=0)
+    feature_stds = np.sqrt(np.square(train_features - feature_means).mean(axis=0))
+
+    train_features = (train_features - feature_means) / feature_stds
+    train_labels = (train_labels - label_mean) / label_std
+
+    return feature_dim, train_features, train_labels
diff --git a/contrib/svrg_optimization_python/test_svrg_train.py b/example/svrg_module/example_api_train.py
similarity index 90%
rename from contrib/svrg_optimization_python/test_svrg_train.py
rename to example/svrg_module/example_api_train.py
index 36e8ce448731..d5095dfd6013 100644
--- a/contrib/svrg_optimization_python/test_svrg_train.py
+++ b/example/svrg_module/example_api_train.py
@@ -18,7 +18,7 @@
 
 import mxnet as mx
 import numpy as np
-from src.svrg_module import SVRGModule
+from mxnet.contrib.svrg_optimization.svrg_module import SVRGModule
 
 
 def test_svrg_intermediate_level_api(num_epoch):
@@ -40,7 +40,7 @@ def test_svrg_intermediate_level_api(num_epoch):
             mod.forward_backward(data_batch=batch)
             mod.update()
             mod.update_metric(metrics, batch.label)
-        print('Epoch[%d] Time cost=%.3f', e, metrics.get())
+        mod.logger.info('Epoch[%d] Train cost=%.3f', e, metrics.get()[1])
 
 
 def test_svrg_high_level_api(num_epoch):
@@ -52,11 +52,13 @@ def test_svrg_high_level_api(num_epoch):
 
 
 def create_network():
     """Create a linear regression network for performing SVRG optimization.
     :return: an instance of mx.io.NDArrayIter
     :return: an instance of mx.mod.svrgmodule for performing SVRG optimization
     """
-    mx.random.seed(42)
+    import logging
+
+    head = '%(asctime)-15s %(message)s'
+    logging.basicConfig(level=logging.INFO, format=head)
     train_data = np.random.randint(1, 5, [1000, 2])
     weights = np.array([1.0, 2.0])
     train_label = train_data.dot(weights)
@@ -70,7 +72,7 @@ def create_network():
     mod = SVRGModule(
         symbol=lro,
         data_names=['data'],
-        label_names=['lin_reg_label'], update_freq=2
+        label_names=['lin_reg_label'], update_freq=2, logger=logging
     )
 
     return di, mod
diff --git a/contrib/svrg_optimization_python/test_svrg_inference.py b/example/svrg_module/example_inference.py
similarity index 92%
rename from contrib/svrg_optimization_python/test_svrg_inference.py
rename to example/svrg_module/example_inference.py
index 0250cdec5899..994b95fd3f86 100644
--- a/contrib/svrg_optimization_python/test_svrg_inference.py
+++ b/example/svrg_module/example_inference.py
@@ -18,12 +18,14 @@
 
 import mxnet as mx
 import numpy as np
-from src.svrg_module import SVRGModule
+import logging
+from mxnet.contrib.svrg_optimization.svrg_module import SVRGModule
 
 
 def test_svrg_inference(num_epoch):
     train_iter, val_iter, mod = create_network()
-    mod.fit(train_iter, eval_data=val_iter, eval_metric='mse', optimizer='sgd', optimizer_params=(('learning_rate', 0.025),),
+    mod.fit(train_iter, eval_data=val_iter, eval_metric='mse', optimizer='sgd',
+            optimizer_params=(('learning_rate', 0.025),),
             num_epoch=num_epoch)
 
 
 def test_score(num_epoch):
@@ -53,7 +55,8 @@ def create_network():
     :return: an instance of mx.io.NDArrayIter
     :return: an instance of mx.mod.svrgmodule for performing SVRG optimization
     """
-    mx.random.seed(42)
+    head = '%(asctime)-15s %(message)s'
+    logging.basicConfig(level=logging.INFO, format=head)
     data = np.random.randint(1, 5, [1000, 2])
     n_train = int(data.shape[0] * 0.8)
     weights = np.array([1.0, 2.0])
diff --git a/example/svrg_module/train.py b/example/svrg_module/train.py
new file mode 100644
index 000000000000..6d5a6b71a16a
--- /dev/null
+++ b/example/svrg_module/train.py
@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
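+
+# Example invocation (a hypothetical command line, using only the flags
+# defined below; it assumes the YearPredictionMSD dataset has already been
+# downloaded and decompressed into the working directory, as described in
+# data_reader.py):
+#
+#   python train.py -e 100 -f 2 -b 100 -m mse --gpus 0,1 --kv-store local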
+
+
+import argparse
+import mxnet as mx
+from common import create_lin_reg_network, create_logger
+from data_reader import read_year_prediction_data
+
+parser = argparse.ArgumentParser()
+parser.add_argument('-e', dest='epochs', help='number of epochs for the training phase', type=int, required=True)
+parser.add_argument('-f', dest="updateFreq", help="update frequency for SVRGModule", type=int,
+                    default=2, required=False)
+parser.add_argument('-b', dest="batch_size", help="define the batch size for training", type=int,
+                    default=100, required=False)
+parser.add_argument('-m', dest='metrics', help="create eval metric", type=str, required=False)
+parser.add_argument('--gpus', type=str, help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu')
+parser.add_argument('--kv-store', type=str, default='local', help='key-value store type')
+
+args = parser.parse_args()
+# devices for training
+ctx = mx.cpu() if args.gpus is None or args.gpus == "" else [mx.gpu(int(i)) for i in args.gpus.split(',')]
+logger = create_logger()
+kv = mx.kvstore.create(args.kv_store)
+
+feature_dim, train_features, train_labels = read_year_prediction_data('YearPredictionMSD')
+# pass the user-supplied update frequency through instead of a hard-coded value
+train_iter, mod = create_lin_reg_network(train_features, train_labels, feature_dim, args.batch_size,
+                                         args.updateFreq, ctx, logger)
+
+mod.fit(train_iter, eval_metric='mse', optimizer='sgd',
+        optimizer_params=(('learning_rate', 0.025), ), num_epoch=args.epochs, kvstore=kv)
diff --git a/contrib/svrg_optimization_python/README.md b/python/mxnet/contrib/svrg_optimization/README.md
similarity index 98%
rename from contrib/svrg_optimization_python/README.md
rename to python/mxnet/contrib/svrg_optimization/README.md
index cc3b6bc41357..9bcbc1494d38 100644
--- a/contrib/svrg_optimization_python/README.md
+++ b/python/mxnet/contrib/svrg_optimization/README.md
@@ -36,4 +36,4 @@ thus SGD needs to start with a small learning rate. The learning rate does not n
 therefore we can use a relatively larger learning rate. SGD with learning rate of (0.001, 0.0025) and SVRG with
 learning rate of (0.025) are benchmarked. Even though SVRG starts with a relatively large learning rate, it
 converges much faster than SGD in both cases.
-This particular experiment result aligns with what was stated in the SVRG paper section 5.
\ No newline at end of file
+This particular experiment result aligns with what was stated in the SVRG paper section 5.
\ No newline at end of file
diff --git a/contrib/svrg_optimization_python/__init__.py b/python/mxnet/contrib/svrg_optimization/__init__.py
similarity index 86%
rename from contrib/svrg_optimization_python/__init__.py
rename to python/mxnet/contrib/svrg_optimization/__init__.py
index 4acf63ef7a13..35dfbb545f2b 100644
--- a/contrib/svrg_optimization_python/__init__.py
+++ b/python/mxnet/contrib/svrg_optimization/__init__.py
@@ -15,6 +15,5 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from __future__ import absolute_import
-from .src.svrg_optimizer import SVRGOptimizer
-from .src.svrg_module import SVRGModule
+from . import svrg_module
+from . import svrg_optimizer
\ No newline at end of file
diff --git a/contrib/svrg_optimization_python/src/svrg_module.py b/python/mxnet/contrib/svrg_optimization/svrg_module.py
similarity index 97%
rename from contrib/svrg_optimization_python/src/svrg_module.py
rename to python/mxnet/contrib/svrg_optimization/svrg_module.py
index e587da00eb25..c732440e14b0 100644
--- a/contrib/svrg_optimization_python/src/svrg_module.py
+++ b/python/mxnet/contrib/svrg_optimization/svrg_module.py
@@ -21,7 +21,7 @@
 import mxnet as mx
 import time
 import logging
-from svrg_optimizer import SVRGOptimizer
+from .svrg_optimizer import SVRGOptimizer
 from mxnet.module import Module
 
 
@@ -62,7 +62,7 @@ class SVRGModule(Module):
     Examples
     --------
     >>> # An example of declaring and using SVRGModule.
-    >>> mod = mod = SVRGModule(symbol=lro, data_names=['data'], label_names=['lin_reg_label'], update_freq=2)
+    >>> mod = SVRGModule(symbol=lro, data_names=['data'], label_names=['lin_reg_label'], update_freq=2)
     >>> mod.fit(di, eval_metric='mse', optimizer='sgd', optimizer_params=(('learning_rate', 0.025),),
     >>>         num_epoch=num_epoch, kvstore='local')
     """
@@ -80,7 +80,8 @@ def __init__(self, symbol, data_names=('data',), label_names=('softmax_label',),
         if isinstance(update_freq, int):
             self.update_freq = update_freq
         else:
-            raise TypeError("update_freq must be an integer")
+            raise TypeError("update_freq in an SVRGModule must be an integer, representing "
+                            "the frequency at which full gradients are recalculated")
 
         self._mod_aux = mx.mod.Module(symbol, data_names, label_names, logger, context, work_load_list,
                                       fixed_param_names, state_names, group2ctxs, compression_params)
@@ -92,7 +93,6 @@ def _reset_bind(self):
         super(SVRGModule, self)._reset_bind()
         self._mod_aux._reset_bind()
 
-
     def reshape(self, data_shapes, label_shapes=None):
         super(SVRGModule, self).reshape(data_shapes, label_shapes=label_shapes)
         self._mod_aux.reshape(data_shapes, label_shapes=label_shapes)
@@ -125,7 +125,7 @@ def init_optimizer(self, kvstore='local', optimizer='sgd',
                                                  kvstore=kvstore, optimizer_params=optimizer_params)
 
         super(SVRGModule, self).init_optimizer(kvstore=kvstore, optimizer=svrg_optimizer,
-                                              optimizer_params=optimizer_params, force_init=force_init)
+                                               optimizer_params=optimizer_params, force_init=force_init)
 
         # Init additional keys for accumulating full grads in KVStore
         if self._kvstore:
@@ -182,8 +182,7 @@ def _create_optimizer(self, optimizer, default_opt, kvstore, optimizer_params):
         return optimizer
 
     def bind(self, data_shapes, label_shapes=None, for_training=True,
-             inputs_need_grad=False, force_rebind=False, shared_module=None,
-             grad_req='write'):
+             inputs_need_grad=False, force_rebind=False, shared_module=None, grad_req='write'):
         """Binds the symbols to construct executors for both modules. This is necessary before one
         can perform computation with the SVRGModule.
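For reference, the parameter update that SVRGModule and SVRGOptimizer implement together is the
standard SVRG rule from the paper cited in the README. Below is a minimal sketch in plain NDArray
arithmetic, with hypothetical names: w for the current weights, grad_w for the minibatch gradient
at w, grad_w_tilde for the same minibatch's gradient at the snapshot weights, and g_tilde for the
full gradient recomputed every update_freq epochs.

    def svrg_update(w, grad_w, grad_w_tilde, g_tilde, lr):
        # variance-reduced gradient: the minibatch gradient corrected by the
        # snapshot minibatch gradient and the full-dataset gradient
        vr_grad = grad_w - grad_w_tilde + g_tilde
        # in-place SGD step on the NDArray
        w[:] = w - lr * vr_grad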
@@ -337,9 +336,9 @@ def _accumulate_kvstore(self, key, value):
         # Accumulate full gradients for current epochs
         self._kvstore.push(key + "_full", value)
-        self._kvstore._barrier()
         self._kvstore.pull(key + "_full", value)
+
         self._allocate_gradients(key, value)
 
     def _allocate_gradients(self, key, value):
@@ -533,7 +532,7 @@ def fit(self, train_data, eval_data=None, eval_metric='acc',
                 self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
             toc = time.time()
             self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))
-            print('Epoch[%d] Time cost=%.3f', epoch, eval_metric.get())
+            self.logger.info('Epoch[%d] Train cost=%.3f', epoch, eval_metric.get()[1])
 
             # sync aux params across devices
             arg_params, aux_params = self.get_params()
@@ -551,7 +550,6 @@ def fit(self, train_data, eval_data=None, eval_metric='acc',
                                            batch_end_callback=eval_batch_end_callback, epoch=epoch)
                 for name, val in res:
                     self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name, val)
-                    print('Epoch[%d] Validation-%s=%f', epoch, name, val)
 
     def prepare(self, data_batch, sparse_row_id_fn=None):
         """Prepares two modules for processing a data batch.
@@ -577,5 +575,5 @@ def prepare(self, data_batch, sparse_row_id_fn=None):
             parameters from the kvstore, where the str key is the name of the param, and the value is the row id of
             the param to pull.
         """
-        super(SVRGModule, self).prepare(data_batch, sparse_row_id_fn=sparse_row_id_fn)
-        self._mod_aux.prepare(data_batch=sparse_row_id_fn)
+        super(SVRGModule, self).prepare(data_batch, sparse_row_id_fn)
+        self._mod_aux.prepare(data_batch, sparse_row_id_fn)
diff --git a/contrib/svrg_optimization_python/src/svrg_optimizer.py b/python/mxnet/contrib/svrg_optimization/svrg_optimizer.py
similarity index 99%
rename from contrib/svrg_optimization_python/src/svrg_optimizer.py
rename to python/mxnet/contrib/svrg_optimization/svrg_optimizer.py
index bf9cca975cce..bc049faff952 100644
--- a/contrib/svrg_optimization_python/src/svrg_optimizer.py
+++ b/python/mxnet/contrib/svrg_optimization/svrg_optimizer.py
@@ -26,6 +26,7 @@ class AssignmentOptimizer(mx.optimizer.Optimizer):
     def update(self, index, weight, grad, state):
         weight[:] = grad
 
+
 @mx.optimizer.register
 class SVRGOptimizer(mx.optimizer.Optimizer):
     """SVRGOptimizer is a wrapper class for two optimizers: one for accumulating full gradients and the other
diff --git a/tests/python/unittest/test_contrib_svrg_module.py b/tests/python/unittest/test_contrib_svrg_module.py
new file mode 100644
index 000000000000..2a71f8a590a3
--- /dev/null
+++ b/tests/python/unittest/test_contrib_svrg_module.py
@@ -0,0 +1,86 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
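+
+# SVRGModule maintains an auxiliary module (_mod_aux) that holds the snapshot
+# parameters used when accumulating full gradients; the tests below check that
+# this auxiliary module is created and bound together with the primary module.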
+
+
+from mxnet.contrib.svrg_optimization.svrg_module import SVRGModule
+import mxnet as mx
+import numpy as np
+
+
+def set_up():
+    train_data = np.random.randint(1, 5, [1000, 2])
+    weights = np.array([1.0, 2.0])
+    train_label = train_data.dot(weights)
+
+    di = mx.io.NDArrayIter(train_data, train_label, batch_size=32, shuffle=True, label_name='lin_reg_label')
+    X = mx.sym.Variable('data')
+    Y = mx.symbol.Variable('lin_reg_label')
+    fully_connected_layer = mx.sym.FullyConnected(data=X, name='fc1', num_hidden=1)
+    lro = mx.sym.LinearRegressionOutput(data=fully_connected_layer, label=Y, name="lro")
+
+    mod = SVRGModule(
+        symbol=lro,
+        data_names=['data'],
+        label_names=['lin_reg_label'], update_freq=2)
+    mod.bind(data_shapes=di.provide_data, label_shapes=di.provide_label)
+    mod.init_params(initializer=mx.init.Uniform(0.01), allow_missing=False,
+                    force_init=False, allow_extra=False)
+
+    return mod
+
+
+def test_bind_module():
+    mod = set_up()
+    assert mod.binded
+    assert mod._mod_aux.binded
+
+
+def test_module_init():
+    mod = set_up()
+    assert mod._mod_aux is not None
+
+
+def test_module_initializer():
+    def regression_model(m):
+        x = mx.symbol.var("data", stype='csr')
+        v = mx.symbol.var("v", shape=(m, 1), init=mx.init.Uniform(scale=.1),
+                          stype='row_sparse')
+        model = mx.symbol.dot(lhs=x, rhs=v)
+        y = mx.symbol.Variable("label")
+        model = mx.symbol.LinearRegressionOutput(data=model, label=y, name="out")
+        return model
+
+    n, m = 128, 100
+    model = regression_model(m)
+
+    data = mx.nd.zeros(shape=(n, m), stype='csr')
+    label = mx.nd.zeros((n, 1))
+    iterator = mx.io.NDArrayIter(data=data, label={'label': label},
+                                 batch_size=n, last_batch_handle='discard')
+
+    # create module
+    mod = SVRGModule(symbol=model, data_names=['data'], label_names=['label'], update_freq=2)
+    mod.bind(data_shapes=iterator.provide_data, label_shapes=iterator.provide_label)
+    mod.init_params()
+    v = mod._arg_params['v']
+    assert v.stype == 'row_sparse'
+    assert np.sum(v.asnumpy()) != 0
+
+
+if __name__ == "__main__":
+    import nose
+    nose.runmodule()
diff --git a/tests/python/unittest/test_contrib_svrg_optimizer.py b/tests/python/unittest/test_contrib_svrg_optimizer.py
new file mode 100644
index 000000000000..bc42ed5991a8
--- /dev/null
+++ b/tests/python/unittest/test_contrib_svrg_optimizer.py
@@ -0,0 +1,101 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
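+
+# Expected values in test_kvstore_init_aux_keys below: key 0 ("weight") is
+# updated by the default sgd optimizer, so with learning_rate=1.0 the pull
+# returns 0 - 1.0 * 1 = -1 per element; key 1 ("weight_full") is routed to
+# AssignmentOptimizer, which overwrites the weight with the pushed full
+# gradient, i.e. [2, 2, 2].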
+
+
+from mxnet.contrib.svrg_optimization.svrg_module import SVRGModule
+from mxnet.contrib.svrg_optimization.svrg_optimizer import SVRGOptimizer
+import mxnet as mx
+import numpy as np
+from mxnet.test_utils import same
+
+
+def create_network():
+    mx.random.seed(42)
+    train_data = np.random.randint(1, 5, [1000, 2])
+    weights = np.array([1.0, 2.0])
+    train_label = train_data.dot(weights)
+
+    batch_size = 32
+
+    di = mx.io.NDArrayIter(train_data, train_label, batch_size=batch_size, shuffle=True, label_name='lin_reg_label')
+    X = mx.sym.Variable('data')
+    Y = mx.symbol.Variable('lin_reg_label')
+    fully_connected_layer = mx.sym.FullyConnected(data=X, name='fc1', num_hidden=1)
+    lro = mx.sym.LinearRegressionOutput(data=fully_connected_layer, label=Y, name="lro")
+
+    mod = SVRGModule(
+        symbol=lro,
+        data_names=['data'],
+        label_names=['lin_reg_label'], update_freq=2
+    )
+
+    mod.bind(data_shapes=di.provide_data, label_shapes=di.provide_label)
+    mod.init_params(initializer=mx.init.Uniform(0.01), allow_missing=False,
+                    force_init=False, allow_extra=False)
+
+    return di, mod
+
+
+def test_init_svrg_optimizer():
+    di, mod = create_network()
+
+    kv = mx.kv.create('local')
+    mod.init_optimizer(kvstore=kv, optimizer='sgd', optimizer_params=(('learning_rate', 0.01),),
+                       force_init=False)
+
+    assert type(mod._optimizer).__name__ == SVRGOptimizer.__name__
+
+
+def test_svrg_optimizer_constructor():
+    _, mod = create_network()
+
+    kv = mx.kv.create('local')
+    svrg_optimizer = SVRGOptimizer(default_optimizer='sgd', learning_rate=-1.0)
+    kv.set_optimizer(svrg_optimizer)
+
+    assert svrg_optimizer.default_opt.lr == -1.0
+
+
+def test_kvstore_init_aux_keys():
+    param_idx2name = {0: "weight", 1: "weight_full"}
+
+    svrg_optimizer = SVRGOptimizer(default_optimizer='sgd', param_idx2name=param_idx2name, learning_rate=1.0)
+    kv = mx.kv.create('local')
+    kv.set_optimizer(svrg_optimizer)
+
+    param_weight_init = mx.nd.array([0, 0, 0])
+    param_weight_update = mx.nd.array([1, 1, 1])
+
+    kv.init(0, param_weight_init)
+    kv.push(0, param_weight_update)
+    kv.pull(0, param_weight_init)
+
+    param_weight_full_init = mx.nd.array([1, 1, 1])
+    param_weight_full_update = mx.nd.array([2, 2, 2])
+
+    # Use AssignmentOptimizer
+    kv.init(1, param_weight_full_init)
+    kv.push(1, param_weight_full_update)
+    kv.pull(1, param_weight_full_init)
+
+    # same() only returns a bool, so the results must be asserted explicitly
+    assert same(param_weight_init.asnumpy(), np.array([-1, -1, -1]))
+    assert same(param_weight_full_init.asnumpy(), np.array([2, 2, 2]))
+
+
+if __name__ == "__main__":
+    import nose
+    nose.runmodule()
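
A minimal end-to-end sketch of the new high-level API, condensed from
example/svrg_module/example_api_train.py in this patch (the data is synthetic,
as in that example, and num_epoch is arbitrary):

    import mxnet as mx
    import numpy as np
    from mxnet.contrib.svrg_optimization.svrg_module import SVRGModule

    train_data = np.random.randint(1, 5, [1000, 2])
    train_label = train_data.dot(np.array([1.0, 2.0]))
    di = mx.io.NDArrayIter(train_data, train_label, batch_size=32,
                           shuffle=True, label_name='lin_reg_label')

    X = mx.sym.Variable('data')
    Y = mx.sym.Variable('lin_reg_label')
    fc = mx.sym.FullyConnected(data=X, name='fc1', num_hidden=1)
    lro = mx.sym.LinearRegressionOutput(data=fc, label=Y, name='lro')

    # update_freq=2: a full gradient pass is taken every two epochs
    mod = SVRGModule(symbol=lro, data_names=['data'],
                     label_names=['lin_reg_label'], update_freq=2)
    mod.fit(di, eval_metric='mse', optimizer='sgd',
            optimizer_params=(('learning_rate', 0.025),), num_epoch=100)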