diff --git a/example/bayesian-methods/algos.py b/example/bayesian-methods/algos.py index f7b362070791..29ba3ec97d0b 100644 --- a/example/bayesian-methods/algos.py +++ b/example/bayesian-methods/algos.py @@ -14,13 +14,13 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +"""Create implementation of algorithms of HMC, stepHMC, SGD, SGLD and DistilledSGLD""" from __future__ import print_function +import time +import numpy import mxnet as mx import mxnet.ndarray as nd -import time -import logging -from utils import * +from utils import copy_param, get_executor, sample_test_regression, sample_test_acc def calc_potential(exe, params, label_name, noise_precision, prior_precision): @@ -35,6 +35,7 @@ def calc_potential(exe, params, label_name, noise_precision, prior_precision): def calc_grad(exe, exe_grads, params, X, Y, label_name=None, outgrad_f=None): + """Calculate gradient""" exe.copy_params_from(params) exe.arg_dict['data'][:] = X if outgrad_f is None: @@ -48,8 +49,8 @@ def calc_grad(exe, exe_grads, params, X, Y, label_name=None, outgrad_f=None): v.wait_to_read() -def step_HMC(exe, exe_params, exe_grads, label_key, noise_precision, prior_precision, L=10, - eps=1E-6): +def step_HMC(exe, exe_params, exe_grads, label_key, noise_precision, prior_precision, L=10, eps=1E-6): + """Generate the implementation of step HMC""" init_params = {k: v.copyto(v.context) for k, v in exe_params.items()} end_params = {k: v.copyto(v.context) for k, v in exe_params.items()} init_momentums = {k: mx.random.normal(0, 1, v.shape) for k, v in init_params.items()} @@ -102,6 +103,7 @@ def step_HMC(exe, exe_params, exe_grads, label_key, noise_precision, prior_preci def HMC(sym, data_inputs, X, Y, X_test, Y_test, sample_num, initializer=None, noise_precision=1 / 9.0, prior_precision=0.1, learning_rate=1E-6, L=10, dev=mx.gpu()): + """Generate the implementation of HMC""" label_key = list(set(data_inputs.keys()) - set(['data']))[0] exe, exe_params, exe_grads, _ = get_executor(sym, dev, data_inputs, initializer) exe.arg_dict['data'][:] = X @@ -134,6 +136,7 @@ def SGD(sym, data_inputs, X, Y, X_test, Y_test, total_iter_num, out_grad_f=None, initializer=None, minibatch_size=100, dev=mx.gpu()): + """Generate the implementation of SGD""" if out_grad_f is None: label_key = list(set(data_inputs.keys()) - set(['data']))[0] exe, params, params_grad, _ = get_executor(sym, dev, data_inputs, initializer) @@ -173,6 +176,7 @@ def SGLD(sym, X, Y, X_test, Y_test, total_iter_num, initializer=None, minibatch_size=100, thin_interval=100, burn_in_iter_num=1000, task='classification', dev=mx.gpu()): + """Generate the implementation of SGLD""" if out_grad_f is None: label_key = list(set(data_inputs.keys()) - set(['data']))[0] exe, params, params_grad, _ = get_executor(sym, dev, data_inputs, initializer) @@ -200,7 +204,7 @@ def SGLD(sym, X, Y, X_test, Y_test, total_iter_num, if i < burn_in_iter_num: continue else: - if 0 == (i - burn_in_iter_num) % thin_interval: + if (i - burn_in_iter_num) % thin_interval == 0: if optimizer.lr_scheduler is not None: lr = optimizer.lr_scheduler(optimizer.num_update) else: @@ -238,6 +242,7 @@ def DistilledSGLD(teacher_sym, student_sym, minibatch_size=100, task='classification', dev=mx.gpu()): + """Generate the implementation of DistilledSGLD""" teacher_exe, teacher_params, teacher_params_grad, _ = \ get_executor(teacher_sym, dev, teacher_data_inputs, teacher_initializer) student_exe, student_params, student_params_grad, _ = 
\ @@ -323,13 +328,14 @@ def DistilledSGLD(teacher_sym, student_sym, sample_test_acc(teacher_exe, X=X, Y=Y, label_num=10, minibatch_size=minibatch_size) print("Student: Test ACC %d/%d=%f, Train ACC %d/%d=%f" % (test_correct, test_total, - test_acc, train_correct, train_total, train_acc)) + test_acc, train_correct, + train_total, train_acc)) print("Teacher: Test ACC %d/%d=%f, Train ACC %d/%d=%f" \ % (teacher_test_correct, teacher_test_total, teacher_test_acc, teacher_train_correct, teacher_train_total, teacher_train_acc)) else: print("Current Iter Num: %d" % (i + 1), "Time Spent: %f" % (end - start), "MSE:", - sample_test_regression(exe=student_exe, X=X_test, Y=Y_test, + sample_test_regression(exe=student_exe, X=X_test, Y=Y_test, minibatch_size=minibatch_size, save_path='regression_DSGLD.txt')) start = time.time() diff --git a/example/bayesian-methods/bdk_demo.py b/example/bayesian-methods/bdk_demo.py index cd39bfd2a7c9..83a43192b1ee 100644 --- a/example/bayesian-methods/bdk_demo.py +++ b/example/bayesian-methods/bdk_demo.py @@ -14,21 +14,21 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +"""Run Stochastic Gradient Langevin Dynamics (SGLD) and Bayesian Dark Knowledge (BDK)""" from __future__ import print_function -import mxnet as mx -import mxnet.ndarray as nd +import argparse +import time import numpy -import logging import matplotlib.pyplot as plt -from scipy.stats import gaussian_kde -import argparse -from algos import * -from data_loader import * -from utils import * +import mxnet as mx +import mxnet.ndarray as nd +from algos import HMC, SGD, SGLD, DistilledSGLD +from data_loader import load_mnist, load_toy, load_synthetic +from utils import BiasXavier, SGLDScheduler class CrossEntropySoftmax(mx.operator.NumpyOp): + """Calculate CrossEntropy softmax function""" def __init__(self): super(CrossEntropySoftmax, self).__init__(False) @@ -58,6 +58,7 @@ def backward(self, out_grad, in_data, out_data, in_grad): class LogSoftmax(mx.operator.NumpyOp): + """Generate helper functions to evaluate softmax loss function""" def __init__(self): super(LogSoftmax, self).__init__(False) @@ -103,6 +104,7 @@ def regression_student_grad(student_outputs, teacher_pred, teacher_noise_precisi def get_mnist_sym(output_op=None, num_hidden=400): + """Get symbol of mnist""" net = mx.symbol.Variable('data') net = mx.symbol.FullyConnected(data=net, name='mnist_fc1', num_hidden=num_hidden) net = mx.symbol.Activation(data=net, name='mnist_relu1', act_type="relu") @@ -117,6 +119,7 @@ def get_mnist_sym(output_op=None, num_hidden=400): def synthetic_grad(X, theta, sigma1, sigma2, sigmax, rescale_grad=1.0, grad=None): + """Get synthetic gradient value""" if grad is None: grad = nd.empty(theta.shape, theta.context) theta1 = theta.asnumpy()[0] @@ -128,17 +131,16 @@ def synthetic_grad(X, theta, sigma1, sigma2, sigmax, rescale_grad=1.0, grad=None -(X - theta1 - theta2) ** 2 / (2 * vx)) grad_npy = numpy.zeros(theta.shape) grad_npy[0] = -rescale_grad * ((numpy.exp(-(X - theta1) ** 2 / (2 * vx)) * (X - theta1) / vx - + numpy.exp(-(X - theta1 - theta2) ** 2 / (2 * vx)) * ( - X - theta1 - theta2) / vx) / denominator).sum() \ - + theta1 / v1 - grad_npy[1] = -rescale_grad * ((numpy.exp(-(X - theta1 - theta2) ** 2 / (2 * vx)) * ( - X - theta1 - theta2) / vx) / denominator).sum() \ - + theta2 / v2 + + numpy.exp(-(X - theta1 - theta2) ** 2 / (2 * vx)) * + (X - theta1 - theta2) / vx) / denominator).sum() + theta1 / v1 + grad_npy[1] = 
-rescale_grad * ((numpy.exp(-(X - theta1 - theta2) ** 2 / (2 * vx)) * + (X - theta1 - theta2) / vx) / denominator).sum() + theta2 / v2 grad[:] = grad_npy return grad def get_toy_sym(teacher=True, teacher_noise_precision=None): + """Get toy symbol""" if teacher: net = mx.symbol.Variable('data') net = mx.symbol.FullyConnected(data=net, name='teacher_fc1', num_hidden=100) @@ -160,8 +162,9 @@ def dev(gpu_id=None): return mx.gpu(gpu_id) if gpu_id else mx.cpu() -def run_mnist_SGD(training_num=50000, gpu_id=None): - X, Y, X_test, Y_test = load_mnist(training_num) + +def run_mnist_SGD(num_training=50000, gpu_id=None): + X, Y, X_test, Y_test = load_mnist(num_training) minibatch_size = 100 net = get_mnist_sym() data_shape = (minibatch_size,) + X.shape[1::] @@ -175,8 +178,8 @@ def run_mnist_SGD(training_num=50000, gpu_id=None): lr=5E-6, prior_precision=1.0, minibatch_size=100) -def run_mnist_SGLD(training_num=50000, gpu_id=None): - X, Y, X_test, Y_test = load_mnist(training_num) +def run_mnist_SGLD(num_training=50000, gpu_id=None): + X, Y, X_test, Y_test = load_mnist(num_training) minibatch_size = 100 net = get_mnist_sym() data_shape = (minibatch_size,) + X.shape[1::] @@ -191,10 +194,11 @@ def run_mnist_SGLD(training_num=50000, gpu_id=None): thin_interval=100, burn_in_iter_num=1000) -def run_mnist_DistilledSGLD(training_num=50000, gpu_id=None): - X, Y, X_test, Y_test = load_mnist(training_num) +def run_mnist_DistilledSGLD(num_training=50000, gpu_id=None): + """Run DistilledSGLD on mnist dataset""" + X, Y, X_test, Y_test = load_mnist(num_training) minibatch_size = 100 - if training_num >= 10000: + if num_training >= 10000: num_hidden = 800 total_iter_num = 1000000 teacher_learning_rate = 1E-6 @@ -235,6 +239,7 @@ def run_mnist_DistilledSGLD(training_num=50000, gpu_id=None): def run_toy_SGLD(gpu_id=None): + """Run SGLD on toy dataset""" X, Y, X_test, Y_test = load_toy() minibatch_size = 1 teacher_noise_precision = 1.0 / 9.0 @@ -243,20 +248,26 @@ def run_toy_SGLD(gpu_id=None): data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id)), 'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev(gpu_id))} initializer = mx.init.Uniform(0.07) - exe, params, _ = \ - SGLD(sym=net, data_inputs=data_inputs, - X=X, Y=Y, X_test=X_test, Y_test=Y_test, total_iter_num=50000, - initializer=initializer, - learning_rate=1E-4, - # lr_scheduler=mx.lr_scheduler.FactorScheduler(100000, 0.5), - prior_precision=0.1, - burn_in_iter_num=1000, - thin_interval=10, - task='regression', - minibatch_size=minibatch_size, dev=dev(gpu_id)) - - -def run_toy_DistilledSGLD(gpu_id=None): + exe, params, _ = SGLD(sym=net, + data_inputs=data_inputs, + X=X, + Y=Y, + X_test=X_test, + Y_test=Y_test, + total_iter_num=50000, + initializer=initializer, + learning_rate=1E-4, + # lr_scheduler=mx.lr_scheduler.FactorScheduler(100000, 0.5), + prior_precision=0.1, + burn_in_iter_num=1000, + thin_interval=10, + task='regression', + minibatch_size=minibatch_size, + dev=dev(gpu_id)) # disable=unbalanced-tuple-unpacking + + +def run_toy_DistilledSGLD(gpu_id): + """Run DistilledSGLD on toy dataset""" X, Y, X_test, Y_test = load_toy() minibatch_size = 1 teacher_noise_precision = 1.0 @@ -288,6 +299,7 @@ def run_toy_DistilledSGLD(gpu_id=None): def run_toy_HMC(gpu_id=None): + """Run HMC on toy dataset""" X, Y, X_test, Y_test = load_toy() minibatch_size = Y.shape[0] noise_precision = 1 / 9.0 @@ -302,6 +314,7 @@ def run_toy_HMC(gpu_id=None): def run_synthetic_SGLD(): + """Run synthetic SGLD""" theta1 = 0 theta2 = 1 sigma1 = numpy.sqrt(10) @@ -322,14 +335,14 @@ 
def run_synthetic_SGLD(): grad = nd.empty((2,), mx.cpu()) samples = numpy.zeros((2, total_iter_num)) start = time.time() - for i in xrange(total_iter_num): + for i in range(total_iter_num): if (i + 1) % 100000 == 0: end = time.time() print("Iter:%d, Time spent: %f" % (i + 1, end - start)) start = time.time() ind = numpy.random.randint(0, X.shape[0]) - synthetic_grad(X[ind], theta, sigma1, sigma2, sigmax, rescale_grad= - X.shape[0] / float(minibatch_size), grad=grad) + synthetic_grad(X[ind], theta, sigma1, sigma2, sigmax, + rescale_grad=X.shape[0] / float(minibatch_size), grad=grad) updater('theta', grad, theta) samples[:, i] = theta.asnumpy() plt.hist2d(samples[0, :], samples[1, :], (200, 200), cmap=plt.cm.jet) @@ -354,18 +367,18 @@ def run_synthetic_SGLD(): args = parser.parse_args() training_num = args.training if args.dataset == 1: - if 0 == args.algorithm: + if args.algorithm == 0: run_mnist_SGD(training_num, gpu_id=args.gpu) - elif 1 == args.algorithm: + elif args.algorithm == 1: run_mnist_SGLD(training_num, gpu_id=args.gpu) else: run_mnist_DistilledSGLD(training_num, gpu_id=args.gpu) elif args.dataset == 0: - if 1 == args.algorithm: + if args.algorithm == 1: run_toy_SGLD(gpu_id=args.gpu) - elif 2 == args.algorithm: + elif args.algorithm == 2: run_toy_DistilledSGLD(gpu_id=args.gpu) - elif 3 == args.algorithm: + elif args.algorithm == 3: run_toy_HMC(gpu_id=args.gpu) else: run_synthetic_SGLD() diff --git a/example/bayesian-methods/data_loader.py b/example/bayesian-methods/data_loader.py index 92ca0cfb3a6c..a0e71bb8d746 100644 --- a/example/bayesian-methods/data_loader.py +++ b/example/bayesian-methods/data_loader.py @@ -14,14 +14,15 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +"""Create helper functions to load mnist dataset and toy dataset""" from __future__ import print_function -import numpy import os import ssl +import numpy def load_mnist(training_num=50000): + """Load mnist dataset""" data_path = os.path.join(os.path.dirname(os.path.realpath('__file__')), 'mnist.npz') if not os.path.isfile(data_path): from six.moves import urllib diff --git a/example/bayesian-methods/utils.py b/example/bayesian-methods/utils.py index a2744373e87d..b0ea1f37e6bb 100644 --- a/example/bayesian-methods/utils.py +++ b/example/bayesian-methods/utils.py @@ -14,11 +14,10 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- +"""Generate helper functions to Stochastic Gradient Langevin Dynamics (SGLD) and Bayesian Dark Knowledge (BDK)""" +import numpy import mxnet as mx import mxnet.ndarray as nd -import numpy -import logging class BiasXavier(mx.initializer.Xavier): @@ -26,7 +25,9 @@ def _init_bias(self, _, arr): scale = numpy.sqrt(self.magnitude / arr.shape[0]) mx.random.uniform(-scale, scale, out=arr) + class SGLDScheduler(mx.lr_scheduler.LRScheduler): + """Create SGLDScheduler class""" def __init__(self, begin_rate, end_rate, total_iter_num, factor): super(SGLDScheduler, self).__init__() if factor >= 1.0: @@ -44,7 +45,9 @@ def __call__(self, num_update): self.count += 1 return self.base_lr + def get_executor(sym, ctx, data_inputs, initializer=None): + """Get executor to Stochastic Gradient Langevin Dynamics and/or Bayesian Dark Knowledge""" data_shapes = {k: v.shape for k, v in data_inputs.items()} arg_names = sym.list_arguments() aux_names = sym.list_auxiliary_states() @@ -62,14 +65,18 @@ def get_executor(sym, ctx, data_inputs, initializer=None): initializer(k, v) return exe, params, params_grad, aux_states + def copy_param(exe, new_param=None): + """Create copy of parameters""" if new_param is None: - new_param = {k: nd.empty(v.shape, ctx=mx.cpu()) for k,v in exe.arg_dict.items()} + new_param = {k: nd.empty(v.shape, ctx=mx.cpu()) for k, v in exe.arg_dict.items()} for k, v in new_param.items(): exe.arg_dict[k].copyto(v) return new_param + def sample_test_acc(exe, X, Y, sample_pool=None, label_num=None, minibatch_size=100): + """Generate sample test to evaluate accuracy""" if label_num is None: pred = numpy.zeros((X.shape[0],)).astype('float32') else: @@ -89,12 +96,12 @@ def sample_test_acc(exe, X, Y, sample_pool=None, label_num=None, minibatch_size= else: old_param = copy_param(exe) for sample in sample_pool: - if type(sample) is list: + if isinstance(sample, list): denominator += sample[0] else: denominator += 1.0 for sample in sample_pool: - if type(sample) is list: + if isinstance(sample, list): ratio = sample[0]/denominator param = sample[1] else: @@ -118,11 +125,12 @@ def sample_test_acc(exe, X, Y, sample_pool=None, label_num=None, minibatch_size= def sample_test_regression(exe, X, Y, sample_pool=None, minibatch_size=100, save_path="regression.txt"): + """Generate a sample test regression""" old_param = copy_param(exe) if sample_pool is not None: pred = numpy.zeros(Y.shape + (len(sample_pool),)) ratio = numpy.zeros((len(sample_pool),)) - if type(sample_pool[0]) is list: + if isinstance(sample_pool[0], list): denominator = sum(sample[0] for sample in sample_pool) for i, sample in enumerate(sample_pool): ratio[i] = sample[0]/float(denominator) @@ -130,7 +138,7 @@ def sample_test_regression(exe, X, Y, sample_pool=None, minibatch_size=100, save ratio[:] = 1.0/ Y.shape[0] iterator = mx.io.NDArrayIter(data=X, label=Y, batch_size=minibatch_size, shuffle=False) for i, sample in enumerate(sample_pool): - if type(sample) is list: + if isinstance(sample, list): sample_param = sample[1] else: sample_param = sample @@ -146,7 +154,7 @@ def sample_test_regression(exe, X, Y, sample_pool=None, minibatch_size=100, save curr_instance += batch_len mean = pred.mean(axis=2) var = pred.std(axis=2)**2 - #print numpy.concatenate((Y, mean), axis=1) + # print numpy.concatenate((Y, mean), axis=1) mse = numpy.square(Y.reshape((Y.shape[0], )) - mean.reshape((mean.shape[0], ))).mean() numpy.savetxt(save_path, numpy.concatenate((mean, var), axis=1)) else: @@ -157,15 +165,19 @@ def sample_test_regression(exe, X, Y, 
sample_pool=None, minibatch_size=100, save for batch in iterator: exe.arg_dict['data'][:] = batch.data[0] exe.forward(is_train=False) - mean_var[curr_instance:curr_instance + minibatch_size - batch.pad, 0] = exe.outputs[0].asnumpy()[:minibatch_size - batch.pad].flatten() - mean_var[curr_instance:curr_instance + minibatch_size - batch.pad, 1] = numpy.exp(exe.outputs[1].asnumpy())[:minibatch_size - batch.pad].flatten() + mean_var[curr_instance:curr_instance + minibatch_size - batch.pad, 0] =\ + exe.outputs[0].asnumpy()[:minibatch_size - batch.pad].flatten() + mean_var[curr_instance:curr_instance + minibatch_size - batch.pad, 1] = \ + numpy.exp(exe.outputs[1].asnumpy())[:minibatch_size - batch.pad].flatten() curr_instance += minibatch_size - batch.pad mse = numpy.square(Y.reshape((Y.shape[0],)) - mean_var[:, 0]).mean() numpy.savetxt(save_path, mean_var) exe.copy_params_from(old_param) return mse + def pred_test(testing_data, exe, param_list=None, save_path=""): + """Generate prediction on testset""" ret = numpy.zeros((testing_data.shape[0], 2)) if param_list is None: for i in range(testing_data.shape[0]): @@ -177,8 +189,8 @@ def pred_test(testing_data, exe, param_list=None, save_path=""): else: for i in range(testing_data.shape[0]): pred = numpy.zeros((len(param_list),)) - for j in range(len(param_list)): - exe.copy_params_from(param_list[j]) + for (j, param) in enumerate(param_list): + exe.copy_params_from(param) exe.arg_dict['data'][:] = testing_data[i, 0] exe.forward(is_train=False) pred[j] = exe.outputs[0].asnumpy() diff --git a/example/caffe/caffe_net.py b/example/caffe/caffe_net.py index 0459c901e1cb..6796fca5c6a8 100644 --- a/example/caffe/caffe_net.py +++ b/example/caffe/caffe_net.py @@ -14,64 +14,80 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- +"""Generate helper functions to load Caffe into MXNet""" +import argparse import mxnet as mx from data import get_iterator -import argparse import train_model + def get_mlp(): - """ - multi-layer perceptron - """ + """Get multi-layer perceptron""" data = mx.symbol.Variable('data') - fc1 = mx.symbol.CaffeOp(data_0=data, num_weight=2, name='fc1', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 128} }") + fc1 = mx.symbol.CaffeOp(data_0=data, num_weight=2, name='fc1', + prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 128} }") act1 = mx.symbol.CaffeOp(data_0=fc1, prototxt="layer{type:\"TanH\"}") - fc2 = mx.symbol.CaffeOp(data_0=act1, num_weight=2, name='fc2', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 64} }") + fc2 = mx.symbol.CaffeOp(data_0=act1, num_weight=2, name='fc2', + prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 64} }") act2 = mx.symbol.CaffeOp(data_0=fc2, prototxt="layer{type:\"TanH\"}") - fc3 = mx.symbol.CaffeOp(data_0=act2, num_weight=2, name='fc3', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 10}}") + fc3 = mx.symbol.CaffeOp(data_0=act2, num_weight=2, name='fc3', + prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 10}}") if use_caffe_loss: label = mx.symbol.Variable('softmax_label') - mlp = mx.symbol.CaffeLoss(data=fc3, label=label, grad_scale=1, name='softmax', prototxt="layer{type:\"SoftmaxWithLoss\"}") + mlp = mx.symbol.CaffeLoss(data=fc3, label=label, grad_scale=1, name='softmax', + prototxt="layer{type:\"SoftmaxWithLoss\"}") else: mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax') return mlp + def get_lenet(): - """ - LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick + """LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner. "Gradient-based learning applied to document recognition." 
Proceedings of the IEEE (1998) """ data = mx.symbol.Variable('data') # first conv - conv1 = mx.symbol.CaffeOp(data_0=data, num_weight=2, prototxt="layer{type:\"Convolution\" convolution_param { num_output: 20 kernel_size: 5 stride: 1} }") + conv1 = mx.symbol.CaffeOp(data_0=data, num_weight=2, + prototxt="layer{type:\"Convolution\" " + "convolution_param { num_output: 20 kernel_size: 5 stride: 1} }") act1 = mx.symbol.CaffeOp(data_0=conv1, prototxt="layer{type:\"TanH\"}") - pool1 = mx.symbol.CaffeOp(data_0=act1, prototxt="layer{type:\"Pooling\" pooling_param { pool: MAX kernel_size: 2 stride: 2}}") + pool1 = mx.symbol.CaffeOp(data_0=act1, + prototxt="layer{type:\"Pooling\" pooling_param { pool: MAX kernel_size: 2 stride: 2}}") # second conv - conv2 = mx.symbol.CaffeOp(data_0=pool1, num_weight=2, prototxt="layer{type:\"Convolution\" convolution_param { num_output: 50 kernel_size: 5 stride: 1} }") + conv2 = mx.symbol.CaffeOp(data_0=pool1, num_weight=2, + prototxt="layer{type:\"Convolution\" " + "convolution_param { num_output: 50 kernel_size: 5 stride: 1} }") act2 = mx.symbol.CaffeOp(data_0=conv2, prototxt="layer{type:\"TanH\"}") - pool2 = mx.symbol.CaffeOp(data_0=act2, prototxt="layer{type:\"Pooling\" pooling_param { pool: MAX kernel_size: 2 stride: 2}}") + pool2 = mx.symbol.CaffeOp(data_0=act2, + prototxt="layer{type:\"Pooling\" pooling_param { pool: MAX kernel_size: 2 stride: 2}}") - fc1 = mx.symbol.CaffeOp(data_0=pool2, num_weight=2, prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 500} }") + fc1 = mx.symbol.CaffeOp(data_0=pool2, num_weight=2, + prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 500} }") act3 = mx.symbol.CaffeOp(data_0=fc1, prototxt="layer{type:\"TanH\"}") # second fullc - fc2 = mx.symbol.CaffeOp(data_0=act3, num_weight=2, prototxt="layer{type:\"InnerProduct\"inner_product_param{num_output: 10} }") + fc2 = mx.symbol.CaffeOp(data_0=act3, num_weight=2, + prototxt="layer{type:\"InnerProduct\"inner_product_param{num_output: 10} }") if use_caffe_loss: label = mx.symbol.Variable('softmax_label') - lenet = mx.symbol.CaffeLoss(data=fc2, label=label, grad_scale=1, name='softmax', prototxt="layer{type:\"SoftmaxWithLoss\"}") + lenet = mx.symbol.CaffeLoss(data=fc2, label=label, grad_scale=1, name='softmax', + prototxt="layer{type:\"SoftmaxWithLoss\"}") else: lenet = mx.symbol.SoftmaxOutput(data=fc2, name='softmax') return lenet + def get_network_from_json_file(file_name): network = mx.sym.load(file_name) return network + def parse_args(): + """Parse the arguments + """ parser = argparse.ArgumentParser(description='train an image classifier on mnist') parser.add_argument('--network', type=str, default='lenet', help='the cnn to use (mlp | lenet | ') diff --git a/example/caffe/data.py b/example/caffe/data.py index 15276c423601..f6bbc0f0daf6 100644 --- a/example/caffe/data.py +++ b/example/caffe/data.py @@ -14,42 +14,44 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- +"""Create the helper functions to mnist dataset for Caffe operators in MXNet""" import mxnet as mx from mxnet.test_utils import get_mnist_ubyte + def get_iterator(data_shape, use_caffe_data): + """Generate the iterator of mnist dataset""" def get_iterator_impl_mnist(args, kv): """return train and val iterators for mnist""" # download data get_mnist_ubyte() flat = False if len(data_shape) != 1 else True - train = mx.io.MNISTIter( - image = "data/train-images-idx3-ubyte", - label = "data/train-labels-idx1-ubyte", - input_shape = data_shape, - batch_size = args.batch_size, - shuffle = True, - flat = flat, - num_parts = kv.num_workers, - part_index = kv.rank) + train = mx.io.MNISTIter( + image="data/train-images-idx3-ubyte", + label="data/train-labels-idx1-ubyte", + input_shape=data_shape, + batch_size=args.batch_size, + shuffle=True, + flat=flat, + num_parts=kv.num_workers, + part_index=kv.rank) val = mx.io.MNISTIter( - image = "data/t10k-images-idx3-ubyte", - label = "data/t10k-labels-idx1-ubyte", - input_shape = data_shape, - batch_size = args.batch_size, - flat = flat, - num_parts = kv.num_workers, - part_index = kv.rank) + image="data/t10k-images-idx3-ubyte", + label="data/t10k-labels-idx1-ubyte", + input_shape=data_shape, + batch_size=args.batch_size, + flat=flat, + num_parts=kv.num_workers, + part_index=kv.rank) return (train, val) def get_iterator_impl_caffe(args, kv): flat = False if len(data_shape) != 1 else True train = mx.io.CaffeDataIter( - prototxt = + prototxt= 'layer { \ name: "mnist" \ type: "Data" \ @@ -67,13 +69,13 @@ def get_iterator_impl_caffe(args, kv): backend: LMDB \ } \ }', - flat = flat, - num_examples = 60000 + flat=flat, + num_examples=60000 # float32 is the default, so left out here in order to illustrate ) val = mx.io.CaffeDataIter( - prototxt = + prototxt= 'layer { \ name: "mnist" \ type: "Data" \ @@ -91,9 +93,9 @@ def get_iterator_impl_caffe(args, kv): backend: LMDB \ } \ }', - flat = flat, - num_examples = 10000, - dtype = "float32" # float32 is the default + flat=flat, + num_examples=10000, + dtype="float32" # float32 is the default ) return train, val diff --git a/example/caffe/train_model.py b/example/caffe/train_model.py index 4290e71063e8..16b18674fe7c 100644 --- a/example/caffe/train_model.py +++ b/example/caffe/train_model.py @@ -14,12 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- -import mxnet as mx -import logging +"""Train module with using Caffe operator in MXNet""" import os +import logging +import mxnet as mx + def fit(args, network, data_loader, eval_metrics=None, batch_end_callback=None): + """Train the model with using Caffe operator in MXNet""" # kvstore kv = mx.kvstore.create(args.kv_store) @@ -74,8 +76,8 @@ def fit(args, network, data_loader, eval_metrics=None, batch_end_callback=None): if 'lr_factor' in args and args.lr_factor < 1: model_args['lr_scheduler'] = mx.lr_scheduler.FactorScheduler( - step = max(int(epoch_size * args.lr_factor_epoch), 1), - factor = args.lr_factor) + step=max(int(epoch_size * args.lr_factor_epoch), 1), + factor=args.lr_factor) if 'clip_gradient' in args and args.clip_gradient is not None: model_args['clip_gradient'] = args.clip_gradient @@ -85,12 +87,11 @@ def fit(args, network, data_loader, eval_metrics=None, batch_end_callback=None): args.gpus is None or len(args.gpus.split(',')) is 1): kv = None - mod = mx.mod.Module(network, context=devs) if eval_metrics is None: eval_metrics = ['accuracy'] - ## TopKAccuracy only allows top_k > 1 + # TopKAccuracy only allows top_k > 1 for top_k in [5, 10, 20]: eval_metrics.append(mx.metric.create('top_k_accuracy', top_k=top_k)) @@ -102,8 +103,7 @@ def fit(args, network, data_loader, eval_metrics=None, batch_end_callback=None): batch_end_callback.append(mx.callback.Speedometer(args.batch_size, 50)) mod.fit(train_data=train, eval_metric=eval_metrics, eval_data=val, optimizer='sgd', - optimizer_params={'learning_rate':args.lr, 'momentum': 0.9, 'wd': 0.00001}, - num_epoch=args.num_epochs, batch_end_callback=batch_end_callback, - initializer=mx.init.Xavier(factor_type="in", magnitude=2.34), - kvstore=kv, epoch_end_callback=checkpoint, **model_args) - + optimizer_params={'learning_rate':args.lr, 'momentum': 0.9, 'wd': 0.00001}, + num_epoch=args.num_epochs, batch_end_callback=batch_end_callback, + initializer=mx.init.Xavier(factor_type="in", magnitude=2.34), + kvstore=kv, epoch_end_callback=checkpoint, **model_args) diff --git a/example/capsnet/capsulelayers.py b/example/capsnet/capsulelayers.py index 5ac4fad49149..077a4003f7a9 100644 --- a/example/capsnet/capsulelayers.py +++ b/example/capsnet/capsulelayers.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +"""Create layers of capsule net""" import mxnet as mx @@ -41,8 +41,7 @@ def primary_caps(data, dim_vector, n_channels, kernel, strides, name=''): class CapsuleLayer: - """ - The capsule layer with dynamic routing. + """The capsule layer with dynamic routing. [batch_size, input_num_capsule, input_dim_vector] => [batch_size, num_capsule, dim_vector] """ @@ -98,7 +97,8 @@ def __call__(self, data): mx.sym.sum(mx.sym.broadcast_mul(c, inputs_hat_stopped, name='broadcast_mul_' + str(i)), axis=1, keepdims=True, name='sum_' + str(i)), name='output_' + str(i), squash_axis=4) - bias_ = bias_ + mx.sym.sum(mx.sym.broadcast_mul(c, inputs_hat_stopped, name='bias_broadcast_mul' + str(i)), + bias_ = bias_ + mx.sym.sum(mx.sym.broadcast_mul(c, inputs_hat_stopped, + name='bias_broadcast_mul' + str(i)), axis=4, keepdims=True, name='bias_' + str(i)) diff --git a/example/capsnet/capsulenet.py b/example/capsnet/capsulenet.py index 67108757bf39..05df9cdc56c4 100644 --- a/example/capsnet/capsulenet.py +++ b/example/capsnet/capsulenet.py @@ -14,24 +14,27 @@ # KIND, either express or implied. 
See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import mxnet as mx
-import numpy as np
+"""MXNet implementation of CapsNet"""
 import os
 import re
 import gzip
 import struct
+import numpy as np
 import scipy.ndimage as ndi
+import mxnet as mx
 from capsulelayers import primary_caps, CapsuleLayer
 from mxboard import SummaryWriter
 
 
+
 def margin_loss(y_true, y_pred):
     loss = y_true * mx.sym.square(mx.sym.maximum(0., 0.9 - y_pred)) +\
         0.5 * (1 - y_true) * mx.sym.square(mx.sym.maximum(0., y_pred - 0.1))
     return mx.sym.mean(data=mx.sym.sum(loss, 1))
 
 
-def capsnet(batch_size, n_class, num_routing,recon_loss_weight):
+def capsnet(batch_size, n_class, num_routing, recon_loss_weight):
+    """Build the CapsNet symbol"""
     # data.shape = [batch_size, 1, 28, 28]
     data = mx.sym.Variable('data')
 
@@ -107,7 +110,8 @@ def read_data(label_url, image_url):
         label = np.fromstring(flbl.read(), dtype=np.int8)
     with gzip.open(download_data(image_url), 'rb') as fimg:
         magic, num, rows, cols = struct.unpack(">IIII", fimg.read(16))
-        image = np.fromstring(fimg.read(), dtype=np.uint8).reshape(len(label), rows, cols)
+        image = np.fromstring(fimg.read(), dtype=np.uint8)
+        image = image.reshape(len(label), rows, cols)
     return label, image
 
 
@@ -116,10 +120,11 @@ def to4d(img):
 
 
 class LossMetric(mx.metric.EvalMetric):
-    def __init__(self, batch_size, num_gpu):
+    """Track accuracy and loss during training"""
+    def __init__(self, batch_size, num_gpus):
         super(LossMetric, self).__init__('LossMetric')
         self.batch_size = batch_size
-        self.num_gpu = num_gpu
+        self.num_gpu = num_gpus
         self.sum_metric = 0
         self.num_inst = 0
         self.loss = 0.0
@@ -130,6 +135,7 @@ def __init__(self, batch_size, num_gpu):
         self.n_batch = 0
 
     def update(self, labels, preds):
+        """Accumulate accuracy and loss statistics for a batch"""
         batch_sum_metric = 0
         batch_num_inst = 0
         for label, pred_outcaps in zip(labels[0], preds[0]):
@@ -146,7 +152,7 @@ def update(self, labels, preds):
         self.batch_sum_metric = batch_sum_metric
         self.batch_num_inst = batch_num_inst
         self.batch_loss = batch_loss
-        self.n_batch += 1 
+        self.n_batch += 1
 
     def get_name_value(self):
         acc = float(self.sum_metric)/float(self.num_inst)
@@ -184,6 +190,7 @@ def __call__(self, num_update):
 
 
 def do_training(num_epoch, optimizer, kvstore, learning_rate, model_prefix, decay):
+    """Run the CapsNet training loop"""
     summary_writer = SummaryWriter(args.tblog_dir)
     lr_scheduler = SimpleLRScheduler(learning_rate)
     optimizer_params = {'lr_scheduler': lr_scheduler}
@@ -218,7 +225,8 @@ def do_training(num_epoch, optimizer, kvstore, learning_rate, model_prefix, deca
         summary_writer.add_scalar('val_loss', val_loss, n_epoch)
         summary_writer.add_scalar('val_recon_err', val_recon_err, n_epoch)
 
-        print('Epoch[%d] train acc: %.4f loss: %.6f recon_err: %.6f' % (n_epoch, train_acc, train_loss, train_recon_err))
+        print('Epoch[%d] train acc: %.4f loss: %.6f recon_err: %.6f' % (n_epoch, train_acc, train_loss,
+                                                                        train_recon_err))
         print('Epoch[%d] val acc: %.4f loss: %.6f recon_err: %.6f' % (n_epoch, val_acc, val_loss, val_recon_err))
         print('SAVE CHECKPOINT')
 
@@ -227,10 +235,8 @@ def do_training(num_epoch, optimizer, kvstore, learning_rate, model_prefix, deca
         lr_scheduler.learning_rate = learning_rate * (decay ** n_epoch)
 
 
-def apply_transform(x,
-                    transform_matrix,
-                    fill_mode='nearest',
-                    cval=0.):
+def apply_transform(x, transform_matrix, fill_mode='nearest', cval=0.):
+    """Apply transform on nd.array"""
     x = np.rollaxis(x, 0, 0)
     final_affine_matrix = transform_matrix[:2, :2]
     final_offset = transform_matrix[:2, 2]
@@ -255,30 +261,45 @@ def random_shift(x, width_shift_fraction, height_shift_fraction):
     x = apply_transform(x, shift_matrix, 'nearest')
     return x
 
+
 def _shuffle(data, idx):
     """Shuffle the data."""
     shuffle_data = []
-    for k, v in data:
-        shuffle_data.append((k, mx.ndarray.array(v.asnumpy()[idx], v.context)))
+    for idx_k, idx_v in data:
+        shuffle_data.append((idx_k, mx.ndarray.array(idx_v.asnumpy()[idx], idx_v.context)))
 
     return shuffle_data
 
+
 class MNISTCustomIter(mx.io.NDArrayIter):
-
+    """Custom MNIST iterator that randomly shifts images during training"""
+
+    def __init__(self, data, label, batch_size, shuffle=False):
+        # NDArrayIter.__init__ sets up idx, num_data, cursor and last_batch_handle,
+        # which reset() and next() below rely on.
+        super(MNISTCustomIter, self).__init__(data=data, label=label,
+                                              batch_size=batch_size, shuffle=shuffle)
+
     def reset(self):
+        """Reset the iterator, reshuffling the data in training mode"""
         # shuffle data
         if self.is_train:
            np.random.shuffle(self.idx)
            self.data = _shuffle(self.data, self.idx)
            self.label = _shuffle(self.label, self.idx)
+
         if self.last_batch_handle == 'roll_over' and self.cursor > self.num_data:
-            self.cursor = -self.batch_size + (self.cursor%self.num_data)%self.batch_size
+            self.cursor = -self.batch_size + (self.cursor % self.num_data) % self.batch_size
         else:
             self.cursor = -self.batch_size
+
     def set_is_train(self, is_train):
+        """Set training flag"""
         self.is_train = is_train
+
     def next(self):
+        """Return the next batch, applying random shifts in training mode"""
         if self.iter_next():
             if self.is_train:
                 data_raw_list = self.getdata()
@@ -288,8 +309,7 @@ def next(self):
                 return mx.io.DataBatch(data=[mx.nd.array(data_shifted)], label=self.getlabel(),
                                        pad=self.getpad(), index=None)
             else:
-                return mx.io.DataBatch(data=self.getdata(), label=self.getlabel(), \
-                                       pad=self.getpad(), index=None)
+                return mx.io.DataBatch(data=self.getdata(), label=self.getlabel(), pad=self.getpad(), index=None)
         else:
             raise StopIteration
 
 
@@ -298,10 +318,9 @@ def next(self):
 if __name__ == "__main__":
     # Read mnist data set
     path = 'http://yann.lecun.com/exdb/mnist/'
-    (train_lbl, train_img) = read_data(
-        path + 'train-labels-idx1-ubyte.gz', path + 'train-images-idx3-ubyte.gz')
-    (val_lbl, val_img) = read_data(
-        path + 't10k-labels-idx1-ubyte.gz', path + 't10k-images-idx3-ubyte.gz')
+    (train_lbl, train_img) = read_data(path + 'train-labels-idx1-ubyte.gz', path + 'train-images-idx3-ubyte.gz')
+    (val_lbl, val_img) = read_data(path + 't10k-labels-idx1-ubyte.gz', path + 't10k-images-idx3-ubyte.gz')
+
     # set batch size
     import argparse
     parser = argparse.ArgumentParser()
@@ -331,10 +350,13 @@ def next(self):
     # generate train_iter, val_iter
     train_iter = MNISTCustomIter(data=to4d(train_img), label=train_lbl, batch_size=int(args.batch_size), shuffle=True)
     train_iter.set_is_train(True)
-    val_iter = MNISTCustomIter(data=to4d(val_img), label=val_lbl, batch_size=int(args.batch_size),)
+    val_iter = MNISTCustomIter(data=to4d(val_img), label=val_lbl, batch_size=int(args.batch_size), shuffle=False)
     val_iter.set_is_train(False)
     # define capsnet
-    final_net = capsnet(batch_size=int(args.batch_size/num_gpu), n_class=10, num_routing=args.num_routing, recon_loss_weight=args.recon_loss_weight)
+    final_net = capsnet(batch_size=int(args.batch_size/num_gpu),
+                        n_class=10,
+                        num_routing=args.num_routing,
+                        recon_loss_weight=args.recon_loss_weight)
     # set metric
     loss_metric = LossMetric(args.batch_size/num_gpu, 1)
 
@@ -343,5 +365,6 @@ def next(self):
     module.bind(data_shapes=train_iter.provide_data,
                 label_shapes=val_iter.provide_label,
                 for_training=True)
+
     do_training(num_epoch=args.num_epoch, optimizer='adam', kvstore='device', learning_rate=args.lr,
                 model_prefix=args.model_prefix,
decay=args.decay) diff --git a/example/cnn_chinese_text_classification/data_helpers.py b/example/cnn_chinese_text_classification/data_helpers.py index b3a13deec771..49bb3d5dc275 100644 --- a/example/cnn_chinese_text_classification/data_helpers.py +++ b/example/cnn_chinese_text_classification/data_helpers.py @@ -14,6 +14,9 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. + +"""Help functions to support for implementing CNN + Highway Network for Chinese Text Classification in MXNet""" + import codecs import itertools import os @@ -27,8 +30,7 @@ def clean_str(string): - """ - Tokenization/string cleaning for all datasets except for SST. + """Tokenization/string cleaning for all datasets except for SST. Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py """ string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) @@ -40,27 +42,28 @@ def clean_str(string): string = re.sub(r"\'ll", " \'ll", string) string = re.sub(r",", " , ", string) string = re.sub(r"!", " ! ", string) - string = re.sub(r"\(", " \( ", string) - string = re.sub(r"\)", " \) ", string) - string = re.sub(r"\?", " \? ", string) + string = re.sub(r"\(", r" \( ", string) + string = re.sub(r"\)", r" \) ", string) + string = re.sub(r"\?", r" \? ", string) string = re.sub(r"\s{2,}", " ", string) return string.strip().lower() def get_chinese_text(): + """Download the chinese_text dataset and unzip it""" if not os.path.isdir("data/"): os.system("mkdir data/") if (not os.path.exists('data/pos.txt')) or \ (not os.path.exists('data/neg')): - os.system("wget -q https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/chinese_text.zip -P data/") + os.system("wget -q https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/chinese_text.zip " + "-P data/") os.chdir("./data") os.system("unzip -u chinese_text.zip") os.chdir("..") def load_data_and_labels(): - """ - Loads MR polarity data from files, splits the data into words and generates labels. + """Loads MR polarity data from files, splits the data into words and generates labels. Returns split sentences and labels. """ # download dataset @@ -86,14 +89,14 @@ def load_data_and_labels(): def pad_sentences(sentences, padding_word=""): - """ - Pads all sentences to the same length. The length is defined by the longest sentence. + """Pads all sentences to the same length. The length is defined by the longest sentence. Returns padded sentences. """ sequence_length = max(len(x) for x in sentences) padded_sentences = [] - for i in range(len(sentences)): - sentence = sentences[i] + for i, element in enumerate(sentences): + print(i, element) + sentence = element num_padding = sequence_length - len(sentence) new_sentence = sentence + [padding_word] * num_padding padded_sentences.append(new_sentence) @@ -101,8 +104,7 @@ def pad_sentences(sentences, padding_word=""): def build_vocab(sentences): - """ - Builds a vocabulary mapping from word to index based on the sentences. + """Builds a vocabulary mapping from word to index based on the sentences. Returns vocabulary mapping and inverse vocabulary mapping. """ # Build vocabulary @@ -115,45 +117,41 @@ def build_vocab(sentences): def build_input_data(sentences, labels, vocabulary): - """ - Maps sentencs and labels to vectors based on a vocabulary. 
- """ + """Maps sentences and labels to vectors based on a vocabulary.""" x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences]) y = np.array(labels) return [x, y] -def build_input_data_with_word2vec(sentences, labels, word2vec): - """Map sentences and labels to vectors based on a pretrained word2vec""" +def build_input_data_with_word2vec(sentences, labels, word2vec_list): + """Map sentences and labels to vectors based on a pre-trained word2vec""" x_vec = [] for sent in sentences: vec = [] for word in sent: - if word in word2vec: - vec.append(word2vec[word]) + if word in word2vec_list: + vec.append(word2vec_list[word]) else: - vec.append(word2vec['']) + vec.append(word2vec_list['']) x_vec.append(vec) x_vec = np.array(x_vec) y_vec = np.array(labels) return [x_vec, y_vec] -def load_data_with_word2vec(word2vec): - """ - Loads and preprocessed data for the MR dataset. +def load_data_with_word2vec(word2vec_list): + """Loads and preprocessed data for the MR dataset. Returns input vectors, labels, vocabulary, and inverse vocabulary. """ # Load and preprocess data sentences, labels = load_data_and_labels() sentences_padded = pad_sentences(sentences) # vocabulary, vocabulary_inv = build_vocab(sentences_padded) - return build_input_data_with_word2vec(sentences_padded, labels, word2vec) + return build_input_data_with_word2vec(sentences_padded, labels, word2vec_list) def load_data(): - """ - Loads and preprocessed data for the MR dataset. + """Loads and preprocessed data for the MR dataset. Returns input vectors, labels, vocabulary, and inverse vocabulary. """ # Load and preprocess data @@ -165,9 +163,7 @@ def load_data(): def batch_iter(data, batch_size, num_epochs): - """ - Generates a batch iterator for a dataset. - """ + """Generates a batch iterator for a dataset.""" data = np.array(data) data_size = len(data) num_batches_per_epoch = int(len(data) / batch_size) + 1 @@ -182,18 +178,19 @@ def batch_iter(data, batch_size, num_epochs): def load_pretrained_word2vec(infile): + """Load the pre-trained word2vec from file.""" if isinstance(infile, str): infile = open(infile) - word2vec = {} + word2vec_list = {} for idx, line in enumerate(infile): if idx == 0: vocab_size, dim = line.strip().split() else: tks = line.strip().split() - word2vec[tks[0]] = map(float, tks[1:]) + word2vec_list[tks[0]] = map(float, tks[1:]) - return word2vec + return word2vec_list def load_google_word2vec(path): diff --git a/example/cnn_chinese_text_classification/text_cnn.py b/example/cnn_chinese_text_classification/text_cnn.py index 4598a52e6674..ce706813637a 100644 --- a/example/cnn_chinese_text_classification/text_cnn.py +++ b/example/cnn_chinese_text_classification/text_cnn.py @@ -20,12 +20,14 @@ # -*- coding: utf-8 -*- -import sys, os -import mxnet as mx -import numpy as np -import argparse +"""Implementing CNN + Highway Network for Chinese Text Classification in MXNet""" + +import os +import sys import logging -import time +import argparse +import numpy as np +import mxnet as mx from mxnet import random from mxnet.initializer import Xavier, Initializer @@ -63,12 +65,28 @@ def save_model(): + """Save cnn model + + Returns + ---------- + callback: A callback function that can be passed as epoch_end_callback to fit + """ if not os.path.exists("checkpoint"): os.mkdir("checkpoint") return mx.callback.do_checkpoint("checkpoint/checkpoint", args.save_period) def highway(data): + """Construct highway net + + Parameters + ---------- + data: + + Returns + ---------- + Highway Networks + """ _data = data 
high_weight = mx.sym.Variable('high_weight') high_bias = mx.sym.Variable('high_bias') @@ -85,20 +103,41 @@ def highway(data): def data_iter(batch_size, num_embed, pre_trained_word2vec=False): + """Construct data iter + + Parameters + ---------- + batch_size: int + num_embed: int + pre_trained_word2vec: boolean + identify the pre-trained layers or not + Returns + ---------- + train_set: DataIter + Train DataIter + valid: DataIter + Valid DataIter + sentences_size: int + array dimensions + embedded_size: int + array dimensions + vocab_size: int + array dimensions + """ logger.info('Loading data...') if pre_trained_word2vec: word2vec = data_helpers.load_pretrained_word2vec('data/rt.vec') x, y = data_helpers.load_data_with_word2vec(word2vec) - # reshpae for convolution input + # reshape for convolution input x = np.reshape(x, (x.shape[0], 1, x.shape[1], x.shape[2])) - embed_size = x.shape[-1] - sentence_size = x.shape[2] - vocab_size = -1 + embedded_size = x.shape[-1] + sentences_size = x.shape[2] + vocabulary_size = -1 else: x, y, vocab, vocab_inv = data_helpers.load_data() - embed_size = num_embed - sentence_size = x.shape[1] - vocab_size = len(vocab) + embedded_size = num_embed + sentences_size = x.shape[1] + vocabulary_size = len(vocab) # randomly shuffle data np.random.seed(10) @@ -109,30 +148,55 @@ def data_iter(batch_size, num_embed, pre_trained_word2vec=False): # split train/valid set x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:] y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:] - logger.info('Train/Valid split: %d/%d' % (len(y_train), len(y_dev))) + logger.info('Train/Valid split: %d/%d', len(y_train), len(y_dev)) logger.info('train shape: %(shape)s', {'shape': x_train.shape}) logger.info('valid shape: %(shape)s', {'shape': x_dev.shape}) - logger.info('sentence max words: %(shape)s', {'shape': sentence_size}) - logger.info('embedding size: %(msg)s', {'msg': embed_size}) - logger.info('vocab size: %(msg)s', {'msg': vocab_size}) + logger.info('sentence max words: %(shape)s', {'shape': sentences_size}) + logger.info('embedding size: %(msg)s', {'msg': embedded_size}) + logger.info('vocab size: %(msg)s', {'msg': vocabulary_size}) - train = mx.io.NDArrayIter( + train_set = mx.io.NDArrayIter( x_train, y_train, batch_size, shuffle=True) valid = mx.io.NDArrayIter( x_dev, y_dev, batch_size) - return (train, valid, sentence_size, embed_size, vocab_size) + return train_set, valid, sentences_size, embedded_size, vocabulary_size -def sym_gen(batch_size, sentence_size, num_embed, vocab_size, - num_label=2, filter_list=[3, 4, 5], num_filter=100, +def sym_gen(batch_size, sentences_size, num_embed, vocabulary_size, + num_label=2, filter_list=None, num_filter=100, dropout=0.0, pre_trained_word2vec=False): + """Generate network symbol + + Parameters + ---------- + batch_size: int + sentences_size: int + num_embed: int + vocabulary_size: int + num_label: int + filter_list: list + num_filter: int + dropout: int + pre_trained_word2vec: boolean + identify the pre-trained layers or not + Returns + ---------- + sm: symbol + data: list of str + data names + softmax_label: list of str + label names + """ input_x = mx.sym.Variable('data') input_y = mx.sym.Variable('softmax_label') # embedding layer if not pre_trained_word2vec: - embed_layer = mx.sym.Embedding(data=input_x, input_dim=vocab_size, output_dim=num_embed, name='vocab_embed') - conv_input = mx.sym.Reshape(data=embed_layer, target_shape=(batch_size, 1, sentence_size, num_embed)) + embed_layer = mx.sym.Embedding(data=input_x, + 
input_dim=vocabulary_size,
+                                       output_dim=num_embed,
+                                       name='vocab_embed')
+        conv_input = mx.sym.Reshape(data=embed_layer, target_shape=(batch_size, 1, sentences_size, num_embed))
     else:
         conv_input = input_x
 
@@ -141,7 +205,7 @@ def sym_gen(batch_size, sentence_size, num_embed, vocab_size,
     for i, filter_size in enumerate(filter_list):
         convi = mx.sym.Convolution(data=conv_input, kernel=(filter_size, num_embed), num_filter=num_filter)
         relui = mx.sym.Activation(data=convi, act_type='relu')
-        pooli = mx.sym.Pooling(data=relui, pool_type='max', kernel=(sentence_size - filter_size + 1, 1), stride=(1, 1))
+        pooli = mx.sym.Pooling(data=relui, pool_type='max', kernel=(sentences_size - filter_size + 1, 1), stride=(1, 1))
         pooled_outputs.append(pooli)
 
     # combine all pooled outputs
@@ -170,10 +234,27 @@ def sym_gen(batch_size, sentence_size, num_embed, vocab_size,
     return sm, ('data',), ('softmax_label',)
 
 
-def train(symbol, train_iter, valid_iter, data_names, label_names):
-    devs = mx.cpu() if args.gpus is None or args.gpus is '' else [
-        mx.gpu(int(i)) for i in args.gpus.split(',')]
-    module = mx.mod.Module(symbol, data_names=data_names, label_names=label_names, context=devs)
+def train(symbol_data, train_iterator, valid_iterator, data_column_names, target_names):
+    """Train cnn model
+
+    Parameters
+    ----------
+    symbol_data: symbol
+    train_iterator: DataIter
+        Train DataIter
+    valid_iterator: DataIter
+        Valid DataIter
+    data_column_names: list of str
+        Defaults to ('data') for a typical model used in image classification
+    target_names: list of str
+        Defaults to ('softmax_label') for a typical model used in image classification
+    """
+    devs = mx.cpu()  # default setting
+    if args.gpus is not None:
+        # use every GPU listed in --gpus, e.g. "0,1" -> [mx.gpu(0), mx.gpu(1)]
+        devs = [mx.gpu(int(i))
+                for i in args.gpus.split(',')]
+    module = mx.mod.Module(symbol_data, data_names=data_column_names, label_names=target_names, context=devs)
 
     init_params = {
         'vocab_embed_weight': {'uniform': 0.1},
@@ -185,7 +266,7 @@ def train(symbol, train_iter, valid_iter, data_names, label_names):
         'cls_weight': {'uniform': 0.1}, 'cls_bias': {'costant': 0},
     }  # custom init_params
 
-    module.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label)
+    module.bind(data_shapes=train_iterator.provide_data, label_shapes=train_iterator.provide_label)
     module.init_params(CustomInit(init_params))
     lr_sch = mx.lr_scheduler.FactorScheduler(step=25000, factor=0.999)
     module.init_optimizer(
@@ -195,8 +276,8 @@ def norm_stat(d):
         return mx.nd.norm(d) / np.sqrt(d.size)
     mon = mx.mon.Monitor(25000, norm_stat)
 
-    module.fit(train_data=train_iter,
-               eval_data=valid_iter,
+    module.fit(train_data=train_iterator,
+               eval_data=valid_iterator,
                eval_metric='acc',
                kvstore=args.kv_store,
                monitor=mon,
@@ -207,8 +288,7 @@ def norm_stat(d):
 
 @mx.init.register
 class CustomInit(Initializer):
-    """
-    https://mxnet.incubator.apache.org/api/python/optimization.html#mxnet.initializer.register
+    """https://mxnet.incubator.apache.org/api/python/optimization.html#mxnet.initializer.register
     Create and register a custom initializer that
     Initialize the weight and bias with custom requirements
 
diff --git a/example/cnn_text_classification/data_helpers.py b/example/cnn_text_classification/data_helpers.py
index b6fe1e6917a3..093da7bf32bc 100644
--- a/example/cnn_text_classification/data_helpers.py
+++ b/example/cnn_text_classification/data_helpers.py
@@ -15,6 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 
+"""Help functions to support for implementing CNN + Highway Network for Text Classification in MXNet""" + import itertools import os import re @@ -27,8 +29,7 @@ def clean_str(string): - """ - Tokenization/string cleaning for all datasets except for SST. + """Tokenization/string cleaning for all datasets except for SST. Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py """ string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) @@ -40,16 +41,15 @@ def clean_str(string): string = re.sub(r"\'ll", " \'ll", string) string = re.sub(r",", " , ", string) string = re.sub(r"!", " ! ", string) - string = re.sub(r"\(", " \( ", string) - string = re.sub(r"\)", " \) ", string) - string = re.sub(r"\?", " \? ", string) + string = re.sub(r"\(", r" \( ", string) + string = re.sub(r"\)", r" \) ", string) + string = re.sub(r"\?", r" \? ", string) string = re.sub(r"\s{2,}", " ", string) return string.strip().lower() def load_data_and_labels(): - """ - Loads MR polarity data from files, splits the data into words and generates labels. + """Loads MR polarity data from files, splits the data into words and generates labels. Returns split sentences and labels. """ # Load data from files @@ -75,14 +75,12 @@ def load_data_and_labels(): def pad_sentences(sentences, padding_word=""): - """ - Pads all sentences to the same length. The length is defined by the longest sentence. + """Pads all sentences to the same length. The length is defined by the longest sentence. Returns padded sentences. """ sequence_length = max(len(x) for x in sentences) padded_sentences = [] - for i in range(len(sentences)): - sentence = sentences[i] + for i, sentence in enumerate(sentences): num_padding = sequence_length - len(sentence) new_sentence = sentence + [padding_word] * num_padding padded_sentences.append(new_sentence) @@ -90,8 +88,7 @@ def pad_sentences(sentences, padding_word=""): def build_vocab(sentences): - """ - Builds a vocabulary mapping from word to index based on the sentences. + """Builds a vocabulary mapping from word to index based on the sentences. Returns vocabulary mapping and inverse vocabulary mapping. """ # Build vocabulary @@ -104,44 +101,41 @@ def build_vocab(sentences): def build_input_data(sentences, labels, vocabulary): - """ - Maps sentencs and labels to vectors based on a vocabulary. - """ + """Maps sentencs and labels to vectors based on a vocabulary.""" x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences]) y = np.array(labels) return [x, y] -def build_input_data_with_word2vec(sentences, labels, word2vec): + +def build_input_data_with_word2vec(sentences, labels, word2vec_list): """Map sentences and labels to vectors based on a pretrained word2vec""" x_vec = [] for sent in sentences: vec = [] for word in sent: - if word in word2vec: - vec.append(word2vec[word]) + if word in word2vec_list: + vec.append(word2vec_list[word]) else: - vec.append(word2vec['']) + vec.append(word2vec_list['']) x_vec.append(vec) x_vec = np.array(x_vec) y_vec = np.array(labels) return [x_vec, y_vec] -def load_data_with_word2vec(word2vec): - """ - Loads and preprocessed data for the MR dataset. +def load_data_with_word2vec(word2vec_list): + """Loads and preprocessed data for the MR dataset. Returns input vectors, labels, vocabulary, and inverse vocabulary. 
""" # Load and preprocess data sentences, labels = load_data_and_labels() sentences_padded = pad_sentences(sentences) # vocabulary, vocabulary_inv = build_vocab(sentences_padded) - return build_input_data_with_word2vec(sentences_padded, labels, word2vec) + return build_input_data_with_word2vec(sentences_padded, labels, word2vec_list) def load_data(): - """ - Loads and preprocessed data for the MR dataset. + """Loads and preprocessed data for the MR dataset. Returns input vectors, labels, vocabulary, and inverse vocabulary. """ # Load and preprocess data @@ -153,9 +147,7 @@ def load_data(): def batch_iter(data, batch_size, num_epochs): - """ - Generates a batch iterator for a dataset. - """ + """Generates a batch iterator for a dataset.""" data = np.array(data) data_size = len(data) num_batches_per_epoch = int(len(data)/batch_size) + 1 @@ -170,18 +162,19 @@ def batch_iter(data, batch_size, num_epochs): def load_pretrained_word2vec(infile): + """Load the pre-trained word2vec from file.""" if isinstance(infile, str): infile = open(infile) - word2vec = {} + word2vec_list = {} for idx, line in enumerate(infile): if idx == 0: vocab_size, dim = line.strip().split() else: tks = line.strip().split() - word2vec[tks[0]] = map(float, tks[1:]) + word2vec_list[tks[0]] = map(float, tks[1:]) - return word2vec + return word2vec_list def load_google_word2vec(path): diff --git a/example/deep-embedded-clustering/model.py b/example/deep-embedded-clustering/model.py index 9b6185c9fd18..b388c551387e 100644 --- a/example/deep-embedded-clustering/model.py +++ b/example/deep-embedded-clustering/model.py @@ -18,8 +18,9 @@ # pylint: disable=missing-docstring from __future__ import print_function -import mxnet as mx import numpy as np +import mxnet as mx + try: import cPickle as pickle except ImportError: @@ -53,7 +54,7 @@ def extract_feature(sym, args, auxs, data_iter, N, xpu=mx.cpu()): class MXModel(object): - def __init__(self, xpu=mx.cpu(), *args, **kwargs): + def __init__(self, *args, xpu=mx.cpu(), **kwargs): self.xpu = xpu self.loss = None self.args = {} diff --git a/example/deep-embedded-clustering/solver.py b/example/deep-embedded-clustering/solver.py index 567c78eeb06c..79fe5c69add7 100644 --- a/example/deep-embedded-clustering/solver.py +++ b/example/deep-embedded-clustering/solver.py @@ -19,9 +19,8 @@ from __future__ import print_function import logging - -import mxnet as mx import numpy as np +import mxnet as mx class Monitor(object): @@ -148,4 +147,4 @@ def solve(self, xpu, sym, args, args_grad, auxs, if self.iter_end_callback is not None: if self.iter_end_callback(i): return - exe.outputs[0].wait_to_read() \ No newline at end of file + exe.outputs[0].wait_to_read()