diff --git a/example/bayesian-methods/algos.py b/example/bayesian-methods/algos.py index f7b362070791..29ba3ec97d0b 100644 --- a/example/bayesian-methods/algos.py +++ b/example/bayesian-methods/algos.py @@ -14,13 +14,13 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +"""Create implementation of algorithms of HMC, stepHMC, SGD, SGLD and DistilledSGLD""" from __future__ import print_function +import time +import numpy import mxnet as mx import mxnet.ndarray as nd -import time -import logging -from utils import * +from utils import copy_param, get_executor, sample_test_regression, sample_test_acc def calc_potential(exe, params, label_name, noise_precision, prior_precision): @@ -35,6 +35,7 @@ def calc_potential(exe, params, label_name, noise_precision, prior_precision): def calc_grad(exe, exe_grads, params, X, Y, label_name=None, outgrad_f=None): + """Calculate gradient""" exe.copy_params_from(params) exe.arg_dict['data'][:] = X if outgrad_f is None: @@ -48,8 +49,8 @@ def calc_grad(exe, exe_grads, params, X, Y, label_name=None, outgrad_f=None): v.wait_to_read() -def step_HMC(exe, exe_params, exe_grads, label_key, noise_precision, prior_precision, L=10, - eps=1E-6): +def step_HMC(exe, exe_params, exe_grads, label_key, noise_precision, prior_precision, L=10, eps=1E-6): + """Generate the implementation of step HMC""" init_params = {k: v.copyto(v.context) for k, v in exe_params.items()} end_params = {k: v.copyto(v.context) for k, v in exe_params.items()} init_momentums = {k: mx.random.normal(0, 1, v.shape) for k, v in init_params.items()} @@ -102,6 +103,7 @@ def step_HMC(exe, exe_params, exe_grads, label_key, noise_precision, prior_preci def HMC(sym, data_inputs, X, Y, X_test, Y_test, sample_num, initializer=None, noise_precision=1 / 9.0, prior_precision=0.1, learning_rate=1E-6, L=10, dev=mx.gpu()): + """Generate the implementation of HMC""" label_key = list(set(data_inputs.keys()) - set(['data']))[0] exe, exe_params, exe_grads, _ = get_executor(sym, dev, data_inputs, initializer) exe.arg_dict['data'][:] = X @@ -134,6 +136,7 @@ def SGD(sym, data_inputs, X, Y, X_test, Y_test, total_iter_num, out_grad_f=None, initializer=None, minibatch_size=100, dev=mx.gpu()): + """Generate the implementation of SGD""" if out_grad_f is None: label_key = list(set(data_inputs.keys()) - set(['data']))[0] exe, params, params_grad, _ = get_executor(sym, dev, data_inputs, initializer) @@ -173,6 +176,7 @@ def SGLD(sym, X, Y, X_test, Y_test, total_iter_num, initializer=None, minibatch_size=100, thin_interval=100, burn_in_iter_num=1000, task='classification', dev=mx.gpu()): + """Generate the implementation of SGLD""" if out_grad_f is None: label_key = list(set(data_inputs.keys()) - set(['data']))[0] exe, params, params_grad, _ = get_executor(sym, dev, data_inputs, initializer) @@ -200,7 +204,7 @@ def SGLD(sym, X, Y, X_test, Y_test, total_iter_num, if i < burn_in_iter_num: continue else: - if 0 == (i - burn_in_iter_num) % thin_interval: + if (i - burn_in_iter_num) % thin_interval == 0: if optimizer.lr_scheduler is not None: lr = optimizer.lr_scheduler(optimizer.num_update) else: @@ -238,6 +242,7 @@ def DistilledSGLD(teacher_sym, student_sym, minibatch_size=100, task='classification', dev=mx.gpu()): + """Generate the implementation of DistilledSGLD""" teacher_exe, teacher_params, teacher_params_grad, _ = \ get_executor(teacher_sym, dev, teacher_data_inputs, teacher_initializer) student_exe, student_params, student_params_grad, _ = 
\ @@ -323,13 +328,14 @@ def DistilledSGLD(teacher_sym, student_sym, sample_test_acc(teacher_exe, X=X, Y=Y, label_num=10, minibatch_size=minibatch_size) print("Student: Test ACC %d/%d=%f, Train ACC %d/%d=%f" % (test_correct, test_total, - test_acc, train_correct, train_total, train_acc)) + test_acc, train_correct, + train_total, train_acc)) print("Teacher: Test ACC %d/%d=%f, Train ACC %d/%d=%f" \ % (teacher_test_correct, teacher_test_total, teacher_test_acc, teacher_train_correct, teacher_train_total, teacher_train_acc)) else: print("Current Iter Num: %d" % (i + 1), "Time Spent: %f" % (end - start), "MSE:", - sample_test_regression(exe=student_exe, X=X_test, Y=Y_test, + sample_test_regression(exe=student_exe, X=X_test, Y=Y_test, minibatch_size=minibatch_size, save_path='regression_DSGLD.txt')) start = time.time() diff --git a/example/bayesian-methods/bdk_demo.py b/example/bayesian-methods/bdk_demo.py index cd39bfd2a7c9..83a43192b1ee 100644 --- a/example/bayesian-methods/bdk_demo.py +++ b/example/bayesian-methods/bdk_demo.py @@ -14,21 +14,21 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +"""Run Stochastic Gradient Langevin Dynamics (SGLD) and Bayesian Dark Knowledge (BDK)""" from __future__ import print_function -import mxnet as mx -import mxnet.ndarray as nd +import argparse +import time import numpy -import logging import matplotlib.pyplot as plt -from scipy.stats import gaussian_kde -import argparse -from algos import * -from data_loader import * -from utils import * +import mxnet as mx +import mxnet.ndarray as nd +from algos import HMC, SGD, SGLD, DistilledSGLD +from data_loader import load_mnist, load_toy, load_synthetic +from utils import BiasXavier, SGLDScheduler class CrossEntropySoftmax(mx.operator.NumpyOp): + """Calculate CrossEntropy softmax function""" def __init__(self): super(CrossEntropySoftmax, self).__init__(False) @@ -58,6 +58,7 @@ def backward(self, out_grad, in_data, out_data, in_grad): class LogSoftmax(mx.operator.NumpyOp): + """Generate helper functions to evaluate softmax loss function""" def __init__(self): super(LogSoftmax, self).__init__(False) @@ -103,6 +104,7 @@ def regression_student_grad(student_outputs, teacher_pred, teacher_noise_precisi def get_mnist_sym(output_op=None, num_hidden=400): + """Get symbol of mnist""" net = mx.symbol.Variable('data') net = mx.symbol.FullyConnected(data=net, name='mnist_fc1', num_hidden=num_hidden) net = mx.symbol.Activation(data=net, name='mnist_relu1', act_type="relu") @@ -117,6 +119,7 @@ def get_mnist_sym(output_op=None, num_hidden=400): def synthetic_grad(X, theta, sigma1, sigma2, sigmax, rescale_grad=1.0, grad=None): + """Get synthetic gradient value""" if grad is None: grad = nd.empty(theta.shape, theta.context) theta1 = theta.asnumpy()[0] @@ -128,17 +131,16 @@ def synthetic_grad(X, theta, sigma1, sigma2, sigmax, rescale_grad=1.0, grad=None -(X - theta1 - theta2) ** 2 / (2 * vx)) grad_npy = numpy.zeros(theta.shape) grad_npy[0] = -rescale_grad * ((numpy.exp(-(X - theta1) ** 2 / (2 * vx)) * (X - theta1) / vx - + numpy.exp(-(X - theta1 - theta2) ** 2 / (2 * vx)) * ( - X - theta1 - theta2) / vx) / denominator).sum() \ - + theta1 / v1 - grad_npy[1] = -rescale_grad * ((numpy.exp(-(X - theta1 - theta2) ** 2 / (2 * vx)) * ( - X - theta1 - theta2) / vx) / denominator).sum() \ - + theta2 / v2 + + numpy.exp(-(X - theta1 - theta2) ** 2 / (2 * vx)) * + (X - theta1 - theta2) / vx) / denominator).sum() + theta1 / v1 + grad_npy[1] = 
-rescale_grad * ((numpy.exp(-(X - theta1 - theta2) ** 2 / (2 * vx)) * + (X - theta1 - theta2) / vx) / denominator).sum() + theta2 / v2 grad[:] = grad_npy return grad def get_toy_sym(teacher=True, teacher_noise_precision=None): + """Get toy symbol""" if teacher: net = mx.symbol.Variable('data') net = mx.symbol.FullyConnected(data=net, name='teacher_fc1', num_hidden=100) @@ -160,8 +162,9 @@ def dev(gpu_id=None): return mx.gpu(gpu_id) if gpu_id else mx.cpu() -def run_mnist_SGD(training_num=50000, gpu_id=None): - X, Y, X_test, Y_test = load_mnist(training_num) + +def run_mnist_SGD(num_training=50000, gpu_id=None): + X, Y, X_test, Y_test = load_mnist(num_training) minibatch_size = 100 net = get_mnist_sym() data_shape = (minibatch_size,) + X.shape[1::] @@ -175,8 +178,8 @@ def run_mnist_SGD(training_num=50000, gpu_id=None): lr=5E-6, prior_precision=1.0, minibatch_size=100) -def run_mnist_SGLD(training_num=50000, gpu_id=None): - X, Y, X_test, Y_test = load_mnist(training_num) +def run_mnist_SGLD(num_training=50000, gpu_id=None): + X, Y, X_test, Y_test = load_mnist(num_training) minibatch_size = 100 net = get_mnist_sym() data_shape = (minibatch_size,) + X.shape[1::] @@ -191,10 +194,11 @@ def run_mnist_SGLD(training_num=50000, gpu_id=None): thin_interval=100, burn_in_iter_num=1000) -def run_mnist_DistilledSGLD(training_num=50000, gpu_id=None): - X, Y, X_test, Y_test = load_mnist(training_num) +def run_mnist_DistilledSGLD(num_training=50000, gpu_id=None): + """Run DistilledSGLD on mnist dataset""" + X, Y, X_test, Y_test = load_mnist(num_training) minibatch_size = 100 - if training_num >= 10000: + if num_training >= 10000: num_hidden = 800 total_iter_num = 1000000 teacher_learning_rate = 1E-6 @@ -235,6 +239,7 @@ def run_mnist_DistilledSGLD(training_num=50000, gpu_id=None): def run_toy_SGLD(gpu_id=None): + """Run SGLD on toy dataset""" X, Y, X_test, Y_test = load_toy() minibatch_size = 1 teacher_noise_precision = 1.0 / 9.0 @@ -243,20 +248,26 @@ def run_toy_SGLD(gpu_id=None): data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id)), 'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev(gpu_id))} initializer = mx.init.Uniform(0.07) - exe, params, _ = \ - SGLD(sym=net, data_inputs=data_inputs, - X=X, Y=Y, X_test=X_test, Y_test=Y_test, total_iter_num=50000, - initializer=initializer, - learning_rate=1E-4, - # lr_scheduler=mx.lr_scheduler.FactorScheduler(100000, 0.5), - prior_precision=0.1, - burn_in_iter_num=1000, - thin_interval=10, - task='regression', - minibatch_size=minibatch_size, dev=dev(gpu_id)) - - -def run_toy_DistilledSGLD(gpu_id=None): + exe, params, _ = SGLD(sym=net, + data_inputs=data_inputs, + X=X, + Y=Y, + X_test=X_test, + Y_test=Y_test, + total_iter_num=50000, + initializer=initializer, + learning_rate=1E-4, + # lr_scheduler=mx.lr_scheduler.FactorScheduler(100000, 0.5), + prior_precision=0.1, + burn_in_iter_num=1000, + thin_interval=10, + task='regression', + minibatch_size=minibatch_size, + dev=dev(gpu_id)) # disable=unbalanced-tuple-unpacking + + +def run_toy_DistilledSGLD(gpu_id): + """Run DistilledSGLD on toy dataset""" X, Y, X_test, Y_test = load_toy() minibatch_size = 1 teacher_noise_precision = 1.0 @@ -288,6 +299,7 @@ def run_toy_DistilledSGLD(gpu_id=None): def run_toy_HMC(gpu_id=None): + """Run HMC on toy dataset""" X, Y, X_test, Y_test = load_toy() minibatch_size = Y.shape[0] noise_precision = 1 / 9.0 @@ -302,6 +314,7 @@ def run_toy_HMC(gpu_id=None): def run_synthetic_SGLD(): + """Run synthetic SGLD""" theta1 = 0 theta2 = 1 sigma1 = numpy.sqrt(10) @@ -322,14 +335,14 @@ 
def run_synthetic_SGLD(): grad = nd.empty((2,), mx.cpu()) samples = numpy.zeros((2, total_iter_num)) start = time.time() - for i in xrange(total_iter_num): + for i in range(total_iter_num): if (i + 1) % 100000 == 0: end = time.time() print("Iter:%d, Time spent: %f" % (i + 1, end - start)) start = time.time() ind = numpy.random.randint(0, X.shape[0]) - synthetic_grad(X[ind], theta, sigma1, sigma2, sigmax, rescale_grad= - X.shape[0] / float(minibatch_size), grad=grad) + synthetic_grad(X[ind], theta, sigma1, sigma2, sigmax, + rescale_grad=X.shape[0] / float(minibatch_size), grad=grad) updater('theta', grad, theta) samples[:, i] = theta.asnumpy() plt.hist2d(samples[0, :], samples[1, :], (200, 200), cmap=plt.cm.jet) @@ -354,18 +367,18 @@ def run_synthetic_SGLD(): args = parser.parse_args() training_num = args.training if args.dataset == 1: - if 0 == args.algorithm: + if args.algorithm == 0: run_mnist_SGD(training_num, gpu_id=args.gpu) - elif 1 == args.algorithm: + elif args.algorithm == 1: run_mnist_SGLD(training_num, gpu_id=args.gpu) else: run_mnist_DistilledSGLD(training_num, gpu_id=args.gpu) elif args.dataset == 0: - if 1 == args.algorithm: + if args.algorithm == 1: run_toy_SGLD(gpu_id=args.gpu) - elif 2 == args.algorithm: + elif args.algorithm == 2: run_toy_DistilledSGLD(gpu_id=args.gpu) - elif 3 == args.algorithm: + elif args.algorithm == 3: run_toy_HMC(gpu_id=args.gpu) else: run_synthetic_SGLD() diff --git a/example/bayesian-methods/data_loader.py b/example/bayesian-methods/data_loader.py index 92ca0cfb3a6c..a0e71bb8d746 100644 --- a/example/bayesian-methods/data_loader.py +++ b/example/bayesian-methods/data_loader.py @@ -14,14 +14,15 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +"""Create helper functions to load mnist dataset and toy dataset""" from __future__ import print_function -import numpy import os import ssl +import numpy def load_mnist(training_num=50000): + """Load mnist dataset""" data_path = os.path.join(os.path.dirname(os.path.realpath('__file__')), 'mnist.npz') if not os.path.isfile(data_path): from six.moves import urllib diff --git a/example/bayesian-methods/utils.py b/example/bayesian-methods/utils.py index a2744373e87d..b0ea1f37e6bb 100644 --- a/example/bayesian-methods/utils.py +++ b/example/bayesian-methods/utils.py @@ -14,11 +14,10 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- +"""Generate helper functions to Stochastic Gradient Langevin Dynamics (SGLD) and Bayesian Dark Knowledge (BDK)""" +import numpy import mxnet as mx import mxnet.ndarray as nd -import numpy -import logging class BiasXavier(mx.initializer.Xavier): @@ -26,7 +25,9 @@ def _init_bias(self, _, arr): scale = numpy.sqrt(self.magnitude / arr.shape[0]) mx.random.uniform(-scale, scale, out=arr) + class SGLDScheduler(mx.lr_scheduler.LRScheduler): + """Create SGLDScheduler class""" def __init__(self, begin_rate, end_rate, total_iter_num, factor): super(SGLDScheduler, self).__init__() if factor >= 1.0: @@ -44,7 +45,9 @@ def __call__(self, num_update): self.count += 1 return self.base_lr + def get_executor(sym, ctx, data_inputs, initializer=None): + """Get executor to Stochastic Gradient Langevin Dynamics and/or Bayesian Dark Knowledge""" data_shapes = {k: v.shape for k, v in data_inputs.items()} arg_names = sym.list_arguments() aux_names = sym.list_auxiliary_states() @@ -62,14 +65,18 @@ def get_executor(sym, ctx, data_inputs, initializer=None): initializer(k, v) return exe, params, params_grad, aux_states + def copy_param(exe, new_param=None): + """Create copy of parameters""" if new_param is None: - new_param = {k: nd.empty(v.shape, ctx=mx.cpu()) for k,v in exe.arg_dict.items()} + new_param = {k: nd.empty(v.shape, ctx=mx.cpu()) for k, v in exe.arg_dict.items()} for k, v in new_param.items(): exe.arg_dict[k].copyto(v) return new_param + def sample_test_acc(exe, X, Y, sample_pool=None, label_num=None, minibatch_size=100): + """Generate sample test to evaluate accuracy""" if label_num is None: pred = numpy.zeros((X.shape[0],)).astype('float32') else: @@ -89,12 +96,12 @@ def sample_test_acc(exe, X, Y, sample_pool=None, label_num=None, minibatch_size= else: old_param = copy_param(exe) for sample in sample_pool: - if type(sample) is list: + if isinstance(sample, list): denominator += sample[0] else: denominator += 1.0 for sample in sample_pool: - if type(sample) is list: + if isinstance(sample, list): ratio = sample[0]/denominator param = sample[1] else: @@ -118,11 +125,12 @@ def sample_test_acc(exe, X, Y, sample_pool=None, label_num=None, minibatch_size= def sample_test_regression(exe, X, Y, sample_pool=None, minibatch_size=100, save_path="regression.txt"): + """Generate a sample test regression""" old_param = copy_param(exe) if sample_pool is not None: pred = numpy.zeros(Y.shape + (len(sample_pool),)) ratio = numpy.zeros((len(sample_pool),)) - if type(sample_pool[0]) is list: + if isinstance(sample_pool[0], list): denominator = sum(sample[0] for sample in sample_pool) for i, sample in enumerate(sample_pool): ratio[i] = sample[0]/float(denominator) @@ -130,7 +138,7 @@ def sample_test_regression(exe, X, Y, sample_pool=None, minibatch_size=100, save ratio[:] = 1.0/ Y.shape[0] iterator = mx.io.NDArrayIter(data=X, label=Y, batch_size=minibatch_size, shuffle=False) for i, sample in enumerate(sample_pool): - if type(sample) is list: + if isinstance(sample, list): sample_param = sample[1] else: sample_param = sample @@ -146,7 +154,7 @@ def sample_test_regression(exe, X, Y, sample_pool=None, minibatch_size=100, save curr_instance += batch_len mean = pred.mean(axis=2) var = pred.std(axis=2)**2 - #print numpy.concatenate((Y, mean), axis=1) + # print numpy.concatenate((Y, mean), axis=1) mse = numpy.square(Y.reshape((Y.shape[0], )) - mean.reshape((mean.shape[0], ))).mean() numpy.savetxt(save_path, numpy.concatenate((mean, var), axis=1)) else: @@ -157,15 +165,19 @@ def sample_test_regression(exe, X, Y, 
sample_pool=None, minibatch_size=100, save for batch in iterator: exe.arg_dict['data'][:] = batch.data[0] exe.forward(is_train=False) - mean_var[curr_instance:curr_instance + minibatch_size - batch.pad, 0] = exe.outputs[0].asnumpy()[:minibatch_size - batch.pad].flatten() - mean_var[curr_instance:curr_instance + minibatch_size - batch.pad, 1] = numpy.exp(exe.outputs[1].asnumpy())[:minibatch_size - batch.pad].flatten() + mean_var[curr_instance:curr_instance + minibatch_size - batch.pad, 0] =\ + exe.outputs[0].asnumpy()[:minibatch_size - batch.pad].flatten() + mean_var[curr_instance:curr_instance + minibatch_size - batch.pad, 1] = \ + numpy.exp(exe.outputs[1].asnumpy())[:minibatch_size - batch.pad].flatten() curr_instance += minibatch_size - batch.pad mse = numpy.square(Y.reshape((Y.shape[0],)) - mean_var[:, 0]).mean() numpy.savetxt(save_path, mean_var) exe.copy_params_from(old_param) return mse + def pred_test(testing_data, exe, param_list=None, save_path=""): + """Generate prediction on testset""" ret = numpy.zeros((testing_data.shape[0], 2)) if param_list is None: for i in range(testing_data.shape[0]): @@ -177,8 +189,8 @@ def pred_test(testing_data, exe, param_list=None, save_path=""): else: for i in range(testing_data.shape[0]): pred = numpy.zeros((len(param_list),)) - for j in range(len(param_list)): - exe.copy_params_from(param_list[j]) + for (j, param) in enumerate(param_list): + exe.copy_params_from(param) exe.arg_dict['data'][:] = testing_data[i, 0] exe.forward(is_train=False) pred[j] = exe.outputs[0].asnumpy() diff --git a/example/caffe/caffe_net.py b/example/caffe/caffe_net.py index 0459c901e1cb..6796fca5c6a8 100644 --- a/example/caffe/caffe_net.py +++ b/example/caffe/caffe_net.py @@ -14,64 +14,80 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- +"""Generate helper functions to load Caffe into MXNet""" +import argparse import mxnet as mx from data import get_iterator -import argparse import train_model + def get_mlp(): - """ - multi-layer perceptron - """ + """Get multi-layer perceptron""" data = mx.symbol.Variable('data') - fc1 = mx.symbol.CaffeOp(data_0=data, num_weight=2, name='fc1', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 128} }") + fc1 = mx.symbol.CaffeOp(data_0=data, num_weight=2, name='fc1', + prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 128} }") act1 = mx.symbol.CaffeOp(data_0=fc1, prototxt="layer{type:\"TanH\"}") - fc2 = mx.symbol.CaffeOp(data_0=act1, num_weight=2, name='fc2', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 64} }") + fc2 = mx.symbol.CaffeOp(data_0=act1, num_weight=2, name='fc2', + prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 64} }") act2 = mx.symbol.CaffeOp(data_0=fc2, prototxt="layer{type:\"TanH\"}") - fc3 = mx.symbol.CaffeOp(data_0=act2, num_weight=2, name='fc3', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 10}}") + fc3 = mx.symbol.CaffeOp(data_0=act2, num_weight=2, name='fc3', + prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 10}}") if use_caffe_loss: label = mx.symbol.Variable('softmax_label') - mlp = mx.symbol.CaffeLoss(data=fc3, label=label, grad_scale=1, name='softmax', prototxt="layer{type:\"SoftmaxWithLoss\"}") + mlp = mx.symbol.CaffeLoss(data=fc3, label=label, grad_scale=1, name='softmax', + prototxt="layer{type:\"SoftmaxWithLoss\"}") else: mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax') return mlp + def get_lenet(): - """ - LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick + """LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner. "Gradient-based learning applied to document recognition." 
Proceedings of the IEEE (1998) """ data = mx.symbol.Variable('data') # first conv - conv1 = mx.symbol.CaffeOp(data_0=data, num_weight=2, prototxt="layer{type:\"Convolution\" convolution_param { num_output: 20 kernel_size: 5 stride: 1} }") + conv1 = mx.symbol.CaffeOp(data_0=data, num_weight=2, + prototxt="layer{type:\"Convolution\" " + "convolution_param { num_output: 20 kernel_size: 5 stride: 1} }") act1 = mx.symbol.CaffeOp(data_0=conv1, prototxt="layer{type:\"TanH\"}") - pool1 = mx.symbol.CaffeOp(data_0=act1, prototxt="layer{type:\"Pooling\" pooling_param { pool: MAX kernel_size: 2 stride: 2}}") + pool1 = mx.symbol.CaffeOp(data_0=act1, + prototxt="layer{type:\"Pooling\" pooling_param { pool: MAX kernel_size: 2 stride: 2}}") # second conv - conv2 = mx.symbol.CaffeOp(data_0=pool1, num_weight=2, prototxt="layer{type:\"Convolution\" convolution_param { num_output: 50 kernel_size: 5 stride: 1} }") + conv2 = mx.symbol.CaffeOp(data_0=pool1, num_weight=2, + prototxt="layer{type:\"Convolution\" " + "convolution_param { num_output: 50 kernel_size: 5 stride: 1} }") act2 = mx.symbol.CaffeOp(data_0=conv2, prototxt="layer{type:\"TanH\"}") - pool2 = mx.symbol.CaffeOp(data_0=act2, prototxt="layer{type:\"Pooling\" pooling_param { pool: MAX kernel_size: 2 stride: 2}}") + pool2 = mx.symbol.CaffeOp(data_0=act2, + prototxt="layer{type:\"Pooling\" pooling_param { pool: MAX kernel_size: 2 stride: 2}}") - fc1 = mx.symbol.CaffeOp(data_0=pool2, num_weight=2, prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 500} }") + fc1 = mx.symbol.CaffeOp(data_0=pool2, num_weight=2, + prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 500} }") act3 = mx.symbol.CaffeOp(data_0=fc1, prototxt="layer{type:\"TanH\"}") # second fullc - fc2 = mx.symbol.CaffeOp(data_0=act3, num_weight=2, prototxt="layer{type:\"InnerProduct\"inner_product_param{num_output: 10} }") + fc2 = mx.symbol.CaffeOp(data_0=act3, num_weight=2, + prototxt="layer{type:\"InnerProduct\"inner_product_param{num_output: 10} }") if use_caffe_loss: label = mx.symbol.Variable('softmax_label') - lenet = mx.symbol.CaffeLoss(data=fc2, label=label, grad_scale=1, name='softmax', prototxt="layer{type:\"SoftmaxWithLoss\"}") + lenet = mx.symbol.CaffeLoss(data=fc2, label=label, grad_scale=1, name='softmax', + prototxt="layer{type:\"SoftmaxWithLoss\"}") else: lenet = mx.symbol.SoftmaxOutput(data=fc2, name='softmax') return lenet + def get_network_from_json_file(file_name): network = mx.sym.load(file_name) return network + def parse_args(): + """Parse the arguments + """ parser = argparse.ArgumentParser(description='train an image classifier on mnist') parser.add_argument('--network', type=str, default='lenet', help='the cnn to use (mlp | lenet | ') diff --git a/example/caffe/data.py b/example/caffe/data.py index 15276c423601..f6bbc0f0daf6 100644 --- a/example/caffe/data.py +++ b/example/caffe/data.py @@ -14,42 +14,44 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- +"""Create the helper functions to mnist dataset for Caffe operators in MXNet""" import mxnet as mx from mxnet.test_utils import get_mnist_ubyte + def get_iterator(data_shape, use_caffe_data): + """Generate the iterator of mnist dataset""" def get_iterator_impl_mnist(args, kv): """return train and val iterators for mnist""" # download data get_mnist_ubyte() flat = False if len(data_shape) != 1 else True - train = mx.io.MNISTIter( - image = "data/train-images-idx3-ubyte", - label = "data/train-labels-idx1-ubyte", - input_shape = data_shape, - batch_size = args.batch_size, - shuffle = True, - flat = flat, - num_parts = kv.num_workers, - part_index = kv.rank) + train = mx.io.MNISTIter( + image="data/train-images-idx3-ubyte", + label="data/train-labels-idx1-ubyte", + input_shape=data_shape, + batch_size=args.batch_size, + shuffle=True, + flat=flat, + num_parts=kv.num_workers, + part_index=kv.rank) val = mx.io.MNISTIter( - image = "data/t10k-images-idx3-ubyte", - label = "data/t10k-labels-idx1-ubyte", - input_shape = data_shape, - batch_size = args.batch_size, - flat = flat, - num_parts = kv.num_workers, - part_index = kv.rank) + image="data/t10k-images-idx3-ubyte", + label="data/t10k-labels-idx1-ubyte", + input_shape=data_shape, + batch_size=args.batch_size, + flat=flat, + num_parts=kv.num_workers, + part_index=kv.rank) return (train, val) def get_iterator_impl_caffe(args, kv): flat = False if len(data_shape) != 1 else True train = mx.io.CaffeDataIter( - prototxt = + prototxt= 'layer { \ name: "mnist" \ type: "Data" \ @@ -67,13 +69,13 @@ def get_iterator_impl_caffe(args, kv): backend: LMDB \ } \ }', - flat = flat, - num_examples = 60000 + flat=flat, + num_examples=60000 # float32 is the default, so left out here in order to illustrate ) val = mx.io.CaffeDataIter( - prototxt = + prototxt= 'layer { \ name: "mnist" \ type: "Data" \ @@ -91,9 +93,9 @@ def get_iterator_impl_caffe(args, kv): backend: LMDB \ } \ }', - flat = flat, - num_examples = 10000, - dtype = "float32" # float32 is the default + flat=flat, + num_examples=10000, + dtype="float32" # float32 is the default ) return train, val diff --git a/example/caffe/train_model.py b/example/caffe/train_model.py index 4290e71063e8..16b18674fe7c 100644 --- a/example/caffe/train_model.py +++ b/example/caffe/train_model.py @@ -14,12 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- -import mxnet as mx -import logging +"""Train module with using Caffe operator in MXNet""" import os +import logging +import mxnet as mx + def fit(args, network, data_loader, eval_metrics=None, batch_end_callback=None): + """Train the model with using Caffe operator in MXNet""" # kvstore kv = mx.kvstore.create(args.kv_store) @@ -74,8 +76,8 @@ def fit(args, network, data_loader, eval_metrics=None, batch_end_callback=None): if 'lr_factor' in args and args.lr_factor < 1: model_args['lr_scheduler'] = mx.lr_scheduler.FactorScheduler( - step = max(int(epoch_size * args.lr_factor_epoch), 1), - factor = args.lr_factor) + step=max(int(epoch_size * args.lr_factor_epoch), 1), + factor=args.lr_factor) if 'clip_gradient' in args and args.clip_gradient is not None: model_args['clip_gradient'] = args.clip_gradient @@ -85,12 +87,11 @@ def fit(args, network, data_loader, eval_metrics=None, batch_end_callback=None): args.gpus is None or len(args.gpus.split(',')) is 1): kv = None - mod = mx.mod.Module(network, context=devs) if eval_metrics is None: eval_metrics = ['accuracy'] - ## TopKAccuracy only allows top_k > 1 + # TopKAccuracy only allows top_k > 1 for top_k in [5, 10, 20]: eval_metrics.append(mx.metric.create('top_k_accuracy', top_k=top_k)) @@ -102,8 +103,7 @@ def fit(args, network, data_loader, eval_metrics=None, batch_end_callback=None): batch_end_callback.append(mx.callback.Speedometer(args.batch_size, 50)) mod.fit(train_data=train, eval_metric=eval_metrics, eval_data=val, optimizer='sgd', - optimizer_params={'learning_rate':args.lr, 'momentum': 0.9, 'wd': 0.00001}, - num_epoch=args.num_epochs, batch_end_callback=batch_end_callback, - initializer=mx.init.Xavier(factor_type="in", magnitude=2.34), - kvstore=kv, epoch_end_callback=checkpoint, **model_args) - + optimizer_params={'learning_rate':args.lr, 'momentum': 0.9, 'wd': 0.00001}, + num_epoch=args.num_epochs, batch_end_callback=batch_end_callback, + initializer=mx.init.Xavier(factor_type="in", magnitude=2.34), + kvstore=kv, epoch_end_callback=checkpoint, **model_args) diff --git a/example/capsnet/capsulelayers.py b/example/capsnet/capsulelayers.py index 5ac4fad49149..077a4003f7a9 100644 --- a/example/capsnet/capsulelayers.py +++ b/example/capsnet/capsulelayers.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +"""Create layers of capsule net""" import mxnet as mx @@ -41,8 +41,7 @@ def primary_caps(data, dim_vector, n_channels, kernel, strides, name=''): class CapsuleLayer: - """ - The capsule layer with dynamic routing. + """The capsule layer with dynamic routing. [batch_size, input_num_capsule, input_dim_vector] => [batch_size, num_capsule, dim_vector] """ @@ -98,7 +97,8 @@ def __call__(self, data): mx.sym.sum(mx.sym.broadcast_mul(c, inputs_hat_stopped, name='broadcast_mul_' + str(i)), axis=1, keepdims=True, name='sum_' + str(i)), name='output_' + str(i), squash_axis=4) - bias_ = bias_ + mx.sym.sum(mx.sym.broadcast_mul(c, inputs_hat_stopped, name='bias_broadcast_mul' + str(i)), + bias_ = bias_ + mx.sym.sum(mx.sym.broadcast_mul(c, inputs_hat_stopped, + name='bias_broadcast_mul' + str(i)), axis=4, keepdims=True, name='bias_' + str(i)) diff --git a/example/capsnet/capsulenet.py b/example/capsnet/capsulenet.py index 67108757bf39..05df9cdc56c4 100644 --- a/example/capsnet/capsulenet.py +++ b/example/capsnet/capsulenet.py @@ -14,24 +14,27 @@ # KIND, either express or implied. 
See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import mxnet as mx
-import numpy as np
+"""MXNet implementation of CapsNet"""
 import os
 import re
 import gzip
 import struct
+import numpy as np
 import scipy.ndimage as ndi
+import mxnet as mx
 from capsulelayers import primary_caps, CapsuleLayer
 from mxboard import SummaryWriter
 
 
+
 def margin_loss(y_true, y_pred):
     loss = y_true * mx.sym.square(mx.sym.maximum(0., 0.9 - y_pred)) +\
         0.5 * (1 - y_true) * mx.sym.square(mx.sym.maximum(0., y_pred - 0.1))
     return mx.sym.mean(data=mx.sym.sum(loss, 1))
 
 
-def capsnet(batch_size, n_class, num_routing,recon_loss_weight):
+def capsnet(batch_size, n_class, num_routing, recon_loss_weight):
+    """Build the CapsNet symbol"""
     # data.shape = [batch_size, 1, 28, 28]
     data = mx.sym.Variable('data')
 
@@ -107,7 +110,8 @@ def read_data(label_url, image_url):
         label = np.fromstring(flbl.read(), dtype=np.int8)
     with gzip.open(download_data(image_url), 'rb') as fimg:
         magic, num, rows, cols = struct.unpack(">IIII", fimg.read(16))
-        image = np.fromstring(fimg.read(), dtype=np.uint8).reshape(len(label), rows, cols)
+        image = np.fromstring(fimg.read(), dtype=np.uint8)
+        image = image.reshape(len(label), rows, cols)
     return label, image
 
 
@@ -116,10 +120,11 @@ def to4d(img):
 
 
 class LossMetric(mx.metric.EvalMetric):
-    def __init__(self, batch_size, num_gpu):
+    """Track accuracy and loss during training"""
+    def __init__(self, batch_size, num_gpus):
         super(LossMetric, self).__init__('LossMetric')
         self.batch_size = batch_size
-        self.num_gpu = num_gpu
+        self.num_gpu = num_gpus
         self.sum_metric = 0
         self.num_inst = 0
         self.loss = 0.0
@@ -130,6 +135,7 @@ def __init__(self, batch_size, num_gpu):
         self.n_batch = 0
 
     def update(self, labels, preds):
+        """Accumulate accuracy and loss statistics for a batch"""
         batch_sum_metric = 0
         batch_num_inst = 0
         for label, pred_outcaps in zip(labels[0], preds[0]):
@@ -146,7 +152,7 @@ def update(self, labels, preds):
         self.batch_sum_metric = batch_sum_metric
         self.batch_num_inst = batch_num_inst
         self.batch_loss = batch_loss
-        self.n_batch += 1 
+        self.n_batch += 1
 
     def get_name_value(self):
         acc = float(self.sum_metric)/float(self.num_inst)
@@ -184,6 +190,7 @@ def __call__(self, num_update):
 
 
 def do_training(num_epoch, optimizer, kvstore, learning_rate, model_prefix, decay):
+    """Run the CapsNet training loop"""
     summary_writer = SummaryWriter(args.tblog_dir)
     lr_scheduler = SimpleLRScheduler(learning_rate)
     optimizer_params = {'lr_scheduler': lr_scheduler}
@@ -218,7 +225,8 @@ def do_training(num_epoch, optimizer, kvstore, learning_rate, model_prefix, deca
         summary_writer.add_scalar('val_loss', val_loss, n_epoch)
         summary_writer.add_scalar('val_recon_err', val_recon_err, n_epoch)
 
-        print('Epoch[%d] train acc: %.4f loss: %.6f recon_err: %.6f' % (n_epoch, train_acc, train_loss, train_recon_err))
+        print('Epoch[%d] train acc: %.4f loss: %.6f recon_err: %.6f' % (n_epoch, train_acc, train_loss,
+                                                                        train_recon_err))
         print('Epoch[%d] val acc: %.4f loss: %.6f recon_err: %.6f' % (n_epoch, val_acc, val_loss, val_recon_err))
         print('SAVE CHECKPOINT')
 
@@ -227,10 +235,8 @@ def do_training(num_epoch, optimizer, kvstore, learning_rate, model_prefix, deca
         lr_scheduler.learning_rate = learning_rate * (decay ** n_epoch)
 
 
-def apply_transform(x,
-                    transform_matrix,
-                    fill_mode='nearest',
-                    cval=0.):
+def apply_transform(x, transform_matrix, fill_mode='nearest', cval=0.):
+    """Apply transform on nd.array"""
     x = np.rollaxis(x, 0, 0)
     final_affine_matrix = transform_matrix[:2, :2]
     final_offset = transform_matrix[:2, 2]
@@ -255,30 +261,45 @@ def random_shift(x, width_shift_fraction, height_shift_fraction):
     x = apply_transform(x, shift_matrix, 'nearest')
     return x
 
+
 def _shuffle(data, idx):
     """Shuffle the data."""
     shuffle_data = []
-    for k, v in data:
-        shuffle_data.append((k, mx.ndarray.array(v.asnumpy()[idx], v.context)))
+    for idx_k, idx_v in data:
+        shuffle_data.append((idx_k, mx.ndarray.array(idx_v.asnumpy()[idx], idx_v.context)))
 
     return shuffle_data
 
+
 class MNISTCustomIter(mx.io.NDArrayIter):
-
+    """Custom MNIST iterator that randomly shifts images during training"""
+
+    def __init__(self, data, label, batch_size, shuffle=False):
+        # NDArrayIter.__init__ sets up idx, num_data, cursor and last_batch_handle,
+        # which reset() and next() below rely on.
+        super(MNISTCustomIter, self).__init__(data=data, label=label,
+                                              batch_size=batch_size, shuffle=shuffle)
+
     def reset(self):
+        """Reset the iterator, reshuffling the data in training mode"""
         # shuffle data
         if self.is_train:
            np.random.shuffle(self.idx)
            self.data = _shuffle(self.data, self.idx)
            self.label = _shuffle(self.label, self.idx)
+
         if self.last_batch_handle == 'roll_over' and self.cursor > self.num_data:
-            self.cursor = -self.batch_size + (self.cursor%self.num_data)%self.batch_size
+            self.cursor = -self.batch_size + (self.cursor % self.num_data) % self.batch_size
         else:
             self.cursor = -self.batch_size
+
     def set_is_train(self, is_train):
+        """Set training flag"""
         self.is_train = is_train
+
     def next(self):
+        """Return the next batch, applying random shifts in training mode"""
         if self.iter_next():
             if self.is_train:
                 data_raw_list = self.getdata()
@@ -288,8 +309,7 @@ def next(self):
                 return mx.io.DataBatch(data=[mx.nd.array(data_shifted)], label=self.getlabel(),
                                        pad=self.getpad(), index=None)
             else:
-                return mx.io.DataBatch(data=self.getdata(), label=self.getlabel(), \
-                                       pad=self.getpad(), index=None)
+                return mx.io.DataBatch(data=self.getdata(), label=self.getlabel(), pad=self.getpad(), index=None)
         else:
             raise StopIteration
 
 
@@ -298,10 +318,9 @@ def next(self):
 if __name__ == "__main__":
     # Read mnist data set
     path = 'http://yann.lecun.com/exdb/mnist/'
-    (train_lbl, train_img) = read_data(
-        path + 'train-labels-idx1-ubyte.gz', path + 'train-images-idx3-ubyte.gz')
-    (val_lbl, val_img) = read_data(
-        path + 't10k-labels-idx1-ubyte.gz', path + 't10k-images-idx3-ubyte.gz')
+    (train_lbl, train_img) = read_data(path + 'train-labels-idx1-ubyte.gz', path + 'train-images-idx3-ubyte.gz')
+    (val_lbl, val_img) = read_data(path + 't10k-labels-idx1-ubyte.gz', path + 't10k-images-idx3-ubyte.gz')
+
     # set batch size
     import argparse
     parser = argparse.ArgumentParser()
@@ -331,10 +350,13 @@ def next(self):
     # generate train_iter, val_iter
     train_iter = MNISTCustomIter(data=to4d(train_img), label=train_lbl, batch_size=int(args.batch_size), shuffle=True)
     train_iter.set_is_train(True)
-    val_iter = MNISTCustomIter(data=to4d(val_img), label=val_lbl, batch_size=int(args.batch_size),)
+    val_iter = MNISTCustomIter(data=to4d(val_img), label=val_lbl, batch_size=int(args.batch_size), shuffle=False)
     val_iter.set_is_train(False)
     # define capsnet
-    final_net = capsnet(batch_size=int(args.batch_size/num_gpu), n_class=10, num_routing=args.num_routing, recon_loss_weight=args.recon_loss_weight)
+    final_net = capsnet(batch_size=int(args.batch_size/num_gpu),
+                        n_class=10,
+                        num_routing=args.num_routing,
+                        recon_loss_weight=args.recon_loss_weight)
     # set metric
     loss_metric = LossMetric(args.batch_size/num_gpu, 1)
 
@@ -343,5 +365,6 @@ def next(self):
     module.bind(data_shapes=train_iter.provide_data,
                 label_shapes=val_iter.provide_label,
                 for_training=True)
+
     do_training(num_epoch=args.num_epoch, optimizer='adam', kvstore='device', learning_rate=args.lr,
                 model_prefix=args.model_prefix,
decay=args.decay) diff --git a/example/cnn_chinese_text_classification/data_helpers.py b/example/cnn_chinese_text_classification/data_helpers.py index b3a13deec771..49bb3d5dc275 100644 --- a/example/cnn_chinese_text_classification/data_helpers.py +++ b/example/cnn_chinese_text_classification/data_helpers.py @@ -14,6 +14,9 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. + +"""Help functions to support for implementing CNN + Highway Network for Chinese Text Classification in MXNet""" + import codecs import itertools import os @@ -27,8 +30,7 @@ def clean_str(string): - """ - Tokenization/string cleaning for all datasets except for SST. + """Tokenization/string cleaning for all datasets except for SST. Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py """ string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) @@ -40,27 +42,28 @@ def clean_str(string): string = re.sub(r"\'ll", " \'ll", string) string = re.sub(r",", " , ", string) string = re.sub(r"!", " ! ", string) - string = re.sub(r"\(", " \( ", string) - string = re.sub(r"\)", " \) ", string) - string = re.sub(r"\?", " \? ", string) + string = re.sub(r"\(", r" \( ", string) + string = re.sub(r"\)", r" \) ", string) + string = re.sub(r"\?", r" \? ", string) string = re.sub(r"\s{2,}", " ", string) return string.strip().lower() def get_chinese_text(): + """Download the chinese_text dataset and unzip it""" if not os.path.isdir("data/"): os.system("mkdir data/") if (not os.path.exists('data/pos.txt')) or \ (not os.path.exists('data/neg')): - os.system("wget -q https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/chinese_text.zip -P data/") + os.system("wget -q https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/chinese_text.zip " + "-P data/") os.chdir("./data") os.system("unzip -u chinese_text.zip") os.chdir("..") def load_data_and_labels(): - """ - Loads MR polarity data from files, splits the data into words and generates labels. + """Loads MR polarity data from files, splits the data into words and generates labels. Returns split sentences and labels. """ # download dataset @@ -86,14 +89,14 @@ def load_data_and_labels(): def pad_sentences(sentences, padding_word=""): - """ - Pads all sentences to the same length. The length is defined by the longest sentence. + """Pads all sentences to the same length. The length is defined by the longest sentence. Returns padded sentences. """ sequence_length = max(len(x) for x in sentences) padded_sentences = [] - for i in range(len(sentences)): - sentence = sentences[i] + for i, element in enumerate(sentences): + print(i, element) + sentence = element num_padding = sequence_length - len(sentence) new_sentence = sentence + [padding_word] * num_padding padded_sentences.append(new_sentence) @@ -101,8 +104,7 @@ def pad_sentences(sentences, padding_word=""): def build_vocab(sentences): - """ - Builds a vocabulary mapping from word to index based on the sentences. + """Builds a vocabulary mapping from word to index based on the sentences. Returns vocabulary mapping and inverse vocabulary mapping. """ # Build vocabulary @@ -115,45 +117,41 @@ def build_vocab(sentences): def build_input_data(sentences, labels, vocabulary): - """ - Maps sentencs and labels to vectors based on a vocabulary. 
- """ + """Maps sentences and labels to vectors based on a vocabulary.""" x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences]) y = np.array(labels) return [x, y] -def build_input_data_with_word2vec(sentences, labels, word2vec): - """Map sentences and labels to vectors based on a pretrained word2vec""" +def build_input_data_with_word2vec(sentences, labels, word2vec_list): + """Map sentences and labels to vectors based on a pre-trained word2vec""" x_vec = [] for sent in sentences: vec = [] for word in sent: - if word in word2vec: - vec.append(word2vec[word]) + if word in word2vec_list: + vec.append(word2vec_list[word]) else: - vec.append(word2vec['']) + vec.append(word2vec_list['']) x_vec.append(vec) x_vec = np.array(x_vec) y_vec = np.array(labels) return [x_vec, y_vec] -def load_data_with_word2vec(word2vec): - """ - Loads and preprocessed data for the MR dataset. +def load_data_with_word2vec(word2vec_list): + """Loads and preprocessed data for the MR dataset. Returns input vectors, labels, vocabulary, and inverse vocabulary. """ # Load and preprocess data sentences, labels = load_data_and_labels() sentences_padded = pad_sentences(sentences) # vocabulary, vocabulary_inv = build_vocab(sentences_padded) - return build_input_data_with_word2vec(sentences_padded, labels, word2vec) + return build_input_data_with_word2vec(sentences_padded, labels, word2vec_list) def load_data(): - """ - Loads and preprocessed data for the MR dataset. + """Loads and preprocessed data for the MR dataset. Returns input vectors, labels, vocabulary, and inverse vocabulary. """ # Load and preprocess data @@ -165,9 +163,7 @@ def load_data(): def batch_iter(data, batch_size, num_epochs): - """ - Generates a batch iterator for a dataset. - """ + """Generates a batch iterator for a dataset.""" data = np.array(data) data_size = len(data) num_batches_per_epoch = int(len(data) / batch_size) + 1 @@ -182,18 +178,19 @@ def batch_iter(data, batch_size, num_epochs): def load_pretrained_word2vec(infile): + """Load the pre-trained word2vec from file.""" if isinstance(infile, str): infile = open(infile) - word2vec = {} + word2vec_list = {} for idx, line in enumerate(infile): if idx == 0: vocab_size, dim = line.strip().split() else: tks = line.strip().split() - word2vec[tks[0]] = map(float, tks[1:]) + word2vec_list[tks[0]] = map(float, tks[1:]) - return word2vec + return word2vec_list def load_google_word2vec(path): diff --git a/example/cnn_chinese_text_classification/text_cnn.py b/example/cnn_chinese_text_classification/text_cnn.py index 4598a52e6674..ce706813637a 100644 --- a/example/cnn_chinese_text_classification/text_cnn.py +++ b/example/cnn_chinese_text_classification/text_cnn.py @@ -20,12 +20,14 @@ # -*- coding: utf-8 -*- -import sys, os -import mxnet as mx -import numpy as np -import argparse +"""Implementing CNN + Highway Network for Chinese Text Classification in MXNet""" + +import os +import sys import logging -import time +import argparse +import numpy as np +import mxnet as mx from mxnet import random from mxnet.initializer import Xavier, Initializer @@ -63,12 +65,28 @@ def save_model(): + """Save cnn model + + Returns + ---------- + callback: A callback function that can be passed as epoch_end_callback to fit + """ if not os.path.exists("checkpoint"): os.mkdir("checkpoint") return mx.callback.do_checkpoint("checkpoint/checkpoint", args.save_period) def highway(data): + """Construct highway net + + Parameters + ---------- + data: + + Returns + ---------- + Highway Networks + """ _data = data 
high_weight = mx.sym.Variable('high_weight') high_bias = mx.sym.Variable('high_bias') @@ -85,20 +103,41 @@ def highway(data): def data_iter(batch_size, num_embed, pre_trained_word2vec=False): + """Construct data iter + + Parameters + ---------- + batch_size: int + num_embed: int + pre_trained_word2vec: boolean + identify the pre-trained layers or not + Returns + ---------- + train_set: DataIter + Train DataIter + valid: DataIter + Valid DataIter + sentences_size: int + array dimensions + embedded_size: int + array dimensions + vocab_size: int + array dimensions + """ logger.info('Loading data...') if pre_trained_word2vec: word2vec = data_helpers.load_pretrained_word2vec('data/rt.vec') x, y = data_helpers.load_data_with_word2vec(word2vec) - # reshpae for convolution input + # reshape for convolution input x = np.reshape(x, (x.shape[0], 1, x.shape[1], x.shape[2])) - embed_size = x.shape[-1] - sentence_size = x.shape[2] - vocab_size = -1 + embedded_size = x.shape[-1] + sentences_size = x.shape[2] + vocabulary_size = -1 else: x, y, vocab, vocab_inv = data_helpers.load_data() - embed_size = num_embed - sentence_size = x.shape[1] - vocab_size = len(vocab) + embedded_size = num_embed + sentences_size = x.shape[1] + vocabulary_size = len(vocab) # randomly shuffle data np.random.seed(10) @@ -109,30 +148,55 @@ def data_iter(batch_size, num_embed, pre_trained_word2vec=False): # split train/valid set x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:] y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:] - logger.info('Train/Valid split: %d/%d' % (len(y_train), len(y_dev))) + logger.info('Train/Valid split: %d/%d', len(y_train), len(y_dev)) logger.info('train shape: %(shape)s', {'shape': x_train.shape}) logger.info('valid shape: %(shape)s', {'shape': x_dev.shape}) - logger.info('sentence max words: %(shape)s', {'shape': sentence_size}) - logger.info('embedding size: %(msg)s', {'msg': embed_size}) - logger.info('vocab size: %(msg)s', {'msg': vocab_size}) + logger.info('sentence max words: %(shape)s', {'shape': sentences_size}) + logger.info('embedding size: %(msg)s', {'msg': embedded_size}) + logger.info('vocab size: %(msg)s', {'msg': vocabulary_size}) - train = mx.io.NDArrayIter( + train_set = mx.io.NDArrayIter( x_train, y_train, batch_size, shuffle=True) valid = mx.io.NDArrayIter( x_dev, y_dev, batch_size) - return (train, valid, sentence_size, embed_size, vocab_size) + return train_set, valid, sentences_size, embedded_size, vocabulary_size -def sym_gen(batch_size, sentence_size, num_embed, vocab_size, - num_label=2, filter_list=[3, 4, 5], num_filter=100, +def sym_gen(batch_size, sentences_size, num_embed, vocabulary_size, + num_label=2, filter_list=None, num_filter=100, dropout=0.0, pre_trained_word2vec=False): + """Generate network symbol + + Parameters + ---------- + batch_size: int + sentences_size: int + num_embed: int + vocabulary_size: int + num_label: int + filter_list: list + num_filter: int + dropout: int + pre_trained_word2vec: boolean + identify the pre-trained layers or not + Returns + ---------- + sm: symbol + data: list of str + data names + softmax_label: list of str + label names + """ input_x = mx.sym.Variable('data') input_y = mx.sym.Variable('softmax_label') # embedding layer if not pre_trained_word2vec: - embed_layer = mx.sym.Embedding(data=input_x, input_dim=vocab_size, output_dim=num_embed, name='vocab_embed') - conv_input = mx.sym.Reshape(data=embed_layer, target_shape=(batch_size, 1, sentence_size, num_embed)) + embed_layer = mx.sym.Embedding(data=input_x, + 
input_dim=vocabulary_size,
+                                       output_dim=num_embed,
+                                       name='vocab_embed')
+        conv_input = mx.sym.Reshape(data=embed_layer, target_shape=(batch_size, 1, sentences_size, num_embed))
     else:
         conv_input = input_x
 
@@ -141,7 +205,7 @@ def sym_gen(batch_size, sentence_size, num_embed, vocab_size,
     for i, filter_size in enumerate(filter_list):
         convi = mx.sym.Convolution(data=conv_input, kernel=(filter_size, num_embed), num_filter=num_filter)
         relui = mx.sym.Activation(data=convi, act_type='relu')
-        pooli = mx.sym.Pooling(data=relui, pool_type='max', kernel=(sentence_size - filter_size + 1, 1), stride=(1, 1))
+        pooli = mx.sym.Pooling(data=relui, pool_type='max', kernel=(sentences_size - filter_size + 1, 1), stride=(1, 1))
         pooled_outputs.append(pooli)
 
     # combine all pooled outputs
@@ -170,10 +234,27 @@ def sym_gen(batch_size, sentence_size, num_embed, vocab_size,
     return sm, ('data',), ('softmax_label',)
 
 
-def train(symbol, train_iter, valid_iter, data_names, label_names):
-    devs = mx.cpu() if args.gpus is None or args.gpus is '' else [
-        mx.gpu(int(i)) for i in args.gpus.split(',')]
-    module = mx.mod.Module(symbol, data_names=data_names, label_names=label_names, context=devs)
+def train(symbol_data, train_iterator, valid_iterator, data_column_names, target_names):
+    """Train cnn model
+
+    Parameters
+    ----------
+    symbol_data: symbol
+    train_iterator: DataIter
+        Train DataIter
+    valid_iterator: DataIter
+        Valid DataIter
+    data_column_names: list of str
+        Defaults to ('data') for a typical model used in image classification
+    target_names: list of str
+        Defaults to ('softmax_label') for a typical model used in image classification
+    """
+    devs = mx.cpu()  # default setting
+    if args.gpus is not None:
+        # use every GPU listed in --gpus, e.g. "0,1" -> [mx.gpu(0), mx.gpu(1)]
+        devs = [mx.gpu(int(i))
+                for i in args.gpus.split(',')]
+    module = mx.mod.Module(symbol_data, data_names=data_column_names, label_names=target_names, context=devs)
 
     init_params = {
         'vocab_embed_weight': {'uniform': 0.1},
@@ -185,7 +266,7 @@ def train(symbol, train_iter, valid_iter, data_names, label_names):
         'cls_weight': {'uniform': 0.1}, 'cls_bias': {'costant': 0},
     }  # custom init_params
 
-    module.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label)
+    module.bind(data_shapes=train_iterator.provide_data, label_shapes=train_iterator.provide_label)
     module.init_params(CustomInit(init_params))
     lr_sch = mx.lr_scheduler.FactorScheduler(step=25000, factor=0.999)
     module.init_optimizer(
@@ -195,8 +276,8 @@ def norm_stat(d):
         return mx.nd.norm(d) / np.sqrt(d.size)
     mon = mx.mon.Monitor(25000, norm_stat)
 
-    module.fit(train_data=train_iter,
-               eval_data=valid_iter,
+    module.fit(train_data=train_iterator,
+               eval_data=valid_iterator,
                eval_metric='acc',
                kvstore=args.kv_store,
                monitor=mon,
@@ -207,8 +288,7 @@ def norm_stat(d):
 
 @mx.init.register
 class CustomInit(Initializer):
-    """
-    https://mxnet.incubator.apache.org/api/python/optimization.html#mxnet.initializer.register
+    """https://mxnet.incubator.apache.org/api/python/optimization.html#mxnet.initializer.register
     Create and register a custom initializer that
     Initialize the weight and bias with custom requirements
 
diff --git a/example/cnn_text_classification/data_helpers.py b/example/cnn_text_classification/data_helpers.py
index b6fe1e6917a3..093da7bf32bc 100644
--- a/example/cnn_text_classification/data_helpers.py
+++ b/example/cnn_text_classification/data_helpers.py
@@ -15,6 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 
+"""Help functions to support for implementing CNN + Highway Network for Text Classification in MXNet""" + import itertools import os import re @@ -27,8 +29,7 @@ def clean_str(string): - """ - Tokenization/string cleaning for all datasets except for SST. + """Tokenization/string cleaning for all datasets except for SST. Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py """ string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) @@ -40,16 +41,15 @@ def clean_str(string): string = re.sub(r"\'ll", " \'ll", string) string = re.sub(r",", " , ", string) string = re.sub(r"!", " ! ", string) - string = re.sub(r"\(", " \( ", string) - string = re.sub(r"\)", " \) ", string) - string = re.sub(r"\?", " \? ", string) + string = re.sub(r"\(", r" \( ", string) + string = re.sub(r"\)", r" \) ", string) + string = re.sub(r"\?", r" \? ", string) string = re.sub(r"\s{2,}", " ", string) return string.strip().lower() def load_data_and_labels(): - """ - Loads MR polarity data from files, splits the data into words and generates labels. + """Loads MR polarity data from files, splits the data into words and generates labels. Returns split sentences and labels. """ # Load data from files @@ -75,14 +75,12 @@ def load_data_and_labels(): def pad_sentences(sentences, padding_word=""): - """ - Pads all sentences to the same length. The length is defined by the longest sentence. + """Pads all sentences to the same length. The length is defined by the longest sentence. Returns padded sentences. """ sequence_length = max(len(x) for x in sentences) padded_sentences = [] - for i in range(len(sentences)): - sentence = sentences[i] + for i, sentence in enumerate(sentences): num_padding = sequence_length - len(sentence) new_sentence = sentence + [padding_word] * num_padding padded_sentences.append(new_sentence) @@ -90,8 +88,7 @@ def pad_sentences(sentences, padding_word=""): def build_vocab(sentences): - """ - Builds a vocabulary mapping from word to index based on the sentences. + """Builds a vocabulary mapping from word to index based on the sentences. Returns vocabulary mapping and inverse vocabulary mapping. """ # Build vocabulary @@ -104,44 +101,41 @@ def build_vocab(sentences): def build_input_data(sentences, labels, vocabulary): - """ - Maps sentencs and labels to vectors based on a vocabulary. - """ + """Maps sentencs and labels to vectors based on a vocabulary.""" x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences]) y = np.array(labels) return [x, y] -def build_input_data_with_word2vec(sentences, labels, word2vec): + +def build_input_data_with_word2vec(sentences, labels, word2vec_list): """Map sentences and labels to vectors based on a pretrained word2vec""" x_vec = [] for sent in sentences: vec = [] for word in sent: - if word in word2vec: - vec.append(word2vec[word]) + if word in word2vec_list: + vec.append(word2vec_list[word]) else: - vec.append(word2vec['']) + vec.append(word2vec_list['']) x_vec.append(vec) x_vec = np.array(x_vec) y_vec = np.array(labels) return [x_vec, y_vec] -def load_data_with_word2vec(word2vec): - """ - Loads and preprocessed data for the MR dataset. +def load_data_with_word2vec(word2vec_list): + """Loads and preprocessed data for the MR dataset. Returns input vectors, labels, vocabulary, and inverse vocabulary. 
""" # Load and preprocess data sentences, labels = load_data_and_labels() sentences_padded = pad_sentences(sentences) # vocabulary, vocabulary_inv = build_vocab(sentences_padded) - return build_input_data_with_word2vec(sentences_padded, labels, word2vec) + return build_input_data_with_word2vec(sentences_padded, labels, word2vec_list) def load_data(): - """ - Loads and preprocessed data for the MR dataset. + """Loads and preprocessed data for the MR dataset. Returns input vectors, labels, vocabulary, and inverse vocabulary. """ # Load and preprocess data @@ -153,9 +147,7 @@ def load_data(): def batch_iter(data, batch_size, num_epochs): - """ - Generates a batch iterator for a dataset. - """ + """Generates a batch iterator for a dataset.""" data = np.array(data) data_size = len(data) num_batches_per_epoch = int(len(data)/batch_size) + 1 @@ -170,18 +162,19 @@ def batch_iter(data, batch_size, num_epochs): def load_pretrained_word2vec(infile): + """Load the pre-trained word2vec from file.""" if isinstance(infile, str): infile = open(infile) - word2vec = {} + word2vec_list = {} for idx, line in enumerate(infile): if idx == 0: vocab_size, dim = line.strip().split() else: tks = line.strip().split() - word2vec[tks[0]] = map(float, tks[1:]) + word2vec_list[tks[0]] = map(float, tks[1:]) - return word2vec + return word2vec_list def load_google_word2vec(path): diff --git a/example/deep-embedded-clustering/model.py b/example/deep-embedded-clustering/model.py index 9b6185c9fd18..b388c551387e 100644 --- a/example/deep-embedded-clustering/model.py +++ b/example/deep-embedded-clustering/model.py @@ -18,8 +18,9 @@ # pylint: disable=missing-docstring from __future__ import print_function -import mxnet as mx import numpy as np +import mxnet as mx + try: import cPickle as pickle except ImportError: @@ -53,7 +54,7 @@ def extract_feature(sym, args, auxs, data_iter, N, xpu=mx.cpu()): class MXModel(object): - def __init__(self, xpu=mx.cpu(), *args, **kwargs): + def __init__(self, *args, xpu=mx.cpu(), **kwargs): self.xpu = xpu self.loss = None self.args = {} diff --git a/example/deep-embedded-clustering/solver.py b/example/deep-embedded-clustering/solver.py index 567c78eeb06c..79fe5c69add7 100644 --- a/example/deep-embedded-clustering/solver.py +++ b/example/deep-embedded-clustering/solver.py @@ -19,9 +19,8 @@ from __future__ import print_function import logging - -import mxnet as mx import numpy as np +import mxnet as mx class Monitor(object): @@ -148,4 +147,4 @@ def solve(self, xpu, sym, args, args_grad, auxs, if self.iter_end_callback is not None: if self.iter_end_callback(i): return - exe.outputs[0].wait_to_read() \ No newline at end of file + exe.outputs[0].wait_to_read()