diff --git a/example/bayesian-methods/bdk_demo.py b/example/bayesian-methods/bdk_demo.py
index bd883d226a2d..a59b4df4fe44 100644
--- a/example/bayesian-methods/bdk_demo.py
+++ b/example/bayesian-methods/bdk_demo.py
@@ -58,7 +58,7 @@ def backward(self, out_grad, in_data, out_data, in_grad):
 
 
 class LogSoftmax(mx.operator.NumpyOp):
-    """Generate helper functions to evaluate softmax loss function"""
+    """Compute the logarithm of the softmax"""
     def __init__(self):
         super(LogSoftmax, self).__init__(False)
diff --git a/example/capsnet/capsulenet.py b/example/capsnet/capsulenet.py
index 05df9cdc56c4..4d455dbc504c 100644
--- a/example/capsnet/capsulenet.py
+++ b/example/capsnet/capsulenet.py
@@ -14,7 +14,10 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""Generate MXNet implementation of CapsNet"""
+"""MXNet implementation of CapsNet
+Reference 1: https://www.cs.toronto.edu/~fritz/absps/transauto6.pdf
+Reference 2: https://arxiv.org/pdf/1710.09829.pdf
+"""
 import os
 import re
 import gzip
@@ -190,7 +193,7 @@ def __call__(self, num_update):
 
 
 def do_training(num_epoch, optimizer, kvstore, learning_rate, model_prefix, decay):
-    """Run training to CapsNet"""
+    """Perform CapsNet training"""
     summary_writer = SummaryWriter(args.tblog_dir)
     lr_scheduler = SimpleLRScheduler(learning_rate)
     optimizer_params = {'lr_scheduler': lr_scheduler}
diff --git a/example/ctc/multiproc_data.py b/example/ctc/multiproc_data.py
index 313ab4eec840..f4c667621f70 100644
--- a/example/ctc/multiproc_data.py
+++ b/example/ctc/multiproc_data.py
@@ -14,7 +14,6 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
 """Contains a class for handling multi-process data generation"""
 
 from __future__ import print_function
diff --git a/example/gluon/dc_gan/dcgan.py b/example/gluon/dc_gan/dcgan.py
index 970c35d54df4..93af13ababf3 100644
--- a/example/gluon/dc_gan/dcgan.py
+++ b/example/gluon/dc_gan/dcgan.py
@@ -14,28 +14,27 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+"""MXNet Gluon implementation of Deep Convolutional Generative Adversarial Networks (DCGAN)"""
 
-import matplotlib as mpl
-mpl.use('Agg')
-from matplotlib import pyplot as plt
-
+import logging
+from datetime import datetime
 import argparse
+import os
+import time
+import numpy as np
+import matplotlib as mpl
+mpl.use('Agg')  # select a non-interactive backend before pyplot is imported
+from matplotlib import pyplot as plt
 import mxnet as mx
 from mxnet import gluon
 from mxnet.gluon import nn
 from mxnet import autograd
-import numpy as np
-import logging
-from datetime import datetime
-import os
-import time
-
 from inception_score import get_inception_score
 
 
 def fill_buf(buf, i, img, shape):
-    """
-    Reposition the images generated by the generator so that it can be saved as picture matrix.
+    """Reposition the images generated by the generator so that they can be saved as a picture matrix.
     :param buf: the images metric
     :param i: index of each image
     :param img: images generated by generator once
     :param shape: each image`s shape
@@ -48,12 +47,10 @@ def fill_buf(buf, i, img, shape):
     sx = (i%m)*shape[0]
     sy = (i//m)*shape[1]
     buf[sy:sy+shape[1], sx:sx+shape[0], :] = img
-    return None
 
 
 def visual(title, X, name):
-    """
-    Image visualization and preservation
+    """Visualize the images and save them to disk
     :param title: title
     :param X: images to visualized
     :param name: saved picture`s name
@@ -79,9 +76,11 @@ def visual(title, X, name):
 parser.add_argument('--batch-size', type=int, default=64, help='input batch size, default is 64')
 parser.add_argument('--nz', type=int, default=100, help='size of the latent z vector, default is 100')
 parser.add_argument('--ngf', type=int, default=64, help='the channel of each generator filter layer, default is 64.')
-parser.add_argument('--ndf', type=int, default=64, help='the channel of each descriminator filter layer, default is 64.')
+parser.add_argument('--ndf', type=int, default=64, help='the channel of each discriminator filter layer, '
+                                                        'default is 64.')
 parser.add_argument('--nepoch', type=int, default=25, help='number of epochs to train for, default is 25.')
-parser.add_argument('--niter', type=int, default=10, help='save generated images and inception_score per niter iters, default is 100.')
+parser.add_argument('--niter', type=int, default=10, help='save generated images and inception_score per niter iters, '
+                                                          'default is 10.')
 parser.add_argument('--lr', type=float, default=0.0002, help='learning rate, default=0.0002')
 parser.add_argument('--beta1', type=float, default=0.5, help='beta1 for adam. default=0.5')
 parser.add_argument('--cuda', action='store_true', help='enables cuda')
@@ -89,7 +88,8 @@ def visual(title, X, name):
 parser.add_argument('--netD', default='', help="path to netD (to continue training)")
 parser.add_argument('--outf', default='./results', help='folder to output images and model checkpoints')
 parser.add_argument('--check-point', default=True, help="save results at each epoch or not")
-parser.add_argument('--inception_score', type=bool, default=True, help='To record the inception_score, default is True.')
+parser.add_argument('--inception_score', type=bool, default=True, help='To record the inception_score, '
+                                                                       'default is True.')
 opt = parser.parse_args()
 print(opt)
@@ -115,6 +115,7 @@ def visual(title, X, name):
 
 
 def transformer(data, label):
+    """Resize, transpose and normalize the input image"""
     # resize to 64x64
     data = mx.image.imresize(data, 64, 64)
     # transpose from (64, 64, 3) to (3, 64, 64)
@@ -128,7 +129,17 @@ def transformer(data, label):
 
 
 # get dataset with the batch_size num each time
-def get_dataset(dataset):
+def get_dataset(dataset_name):
+    """Load the dataset and split it into train/valid data
+
+    :param dataset_name: the name of the dataset to load
+
+    Returns:
+    train_data: DataLoader
+        training dataset
+    val_data: DataLoader
+        validation dataset
+    """
     # mnist
     if dataset == "mnist":
         train_data = gluon.data.DataLoader(
@@ -152,6 +163,7 @@ def get_dataset(dataset):
 
 
 def get_netG():
+    """Build the generator network"""
     # build the generator
     netG = nn.Sequential()
     with netG.name_scope():
@@ -180,6 +192,7 @@ def get_netG():
 
 
 def get_netD():
+    """Build the discriminator network"""
     # build the discriminator
     netD = nn.Sequential()
     with netD.name_scope():
@@ -206,6 +219,7 @@ def get_netD():
 
 
 def get_configurations(netG, netD):
+    """Get the loss function and trainer configurations for netG and netD"""
     # loss
     loss = gluon.loss.SoftmaxCrossEntropyLoss()
@@ -233,6 +247,7 @@ def ins_save(inception_score):
 
 
 # main function
 def main():
+    """Entry point to dcgan"""
     print("|------- new changes!!!!!!!!!")
changes!!!!!!!!!") # to get the dataset and net configuration train_data, val_data = get_dataset(dataset) @@ -300,7 +315,7 @@ def main(): name, acc = metric.get() logging.info('discriminator loss = %f, generator loss = %f, binary training acc = %f at iter %d epoch %d' - % (mx.nd.mean(errD).asscalar(), mx.nd.mean(errG).asscalar(), acc, iter, epoch)) + , mx.nd.mean(errD).asscalar(), mx.nd.mean(errG).asscalar(), acc, iter, epoch) if iter % niter == 0: visual('gout', fake.asnumpy(), name=os.path.join(outf, 'fake_img_iter_%d.png' % iter)) visual('data', data.asnumpy(), name=os.path.join(outf, 'real_img_iter_%d.png' % iter)) @@ -316,13 +331,13 @@ def main(): name, acc = metric.get() metric.reset() - logging.info('\nbinary training acc at epoch %d: %s=%f' % (epoch, name, acc)) - logging.info('time: %f' % (time.time() - tic)) + logging.info('\nbinary training acc at epoch %d: %s=%f', epoch, name, acc) + logging.info('time: %f', time.time() - tic) # save check_point if check_point: - netG.save_parameters(os.path.join(outf,'generator_epoch_%d.params' %epoch)) - netD.save_parameters(os.path.join(outf,'discriminator_epoch_%d.params' % epoch)) + netG.save_parameters(os.path.join(outf, 'generator_epoch_%d.params' %epoch)) + netD.save_parameters(os.path.join(outf, 'discriminator_epoch_%d.params' % epoch)) # save parameter netG.save_parameters(os.path.join(outf, 'generator.params')) @@ -335,6 +350,6 @@ def main(): if __name__ == '__main__': if opt.inception_score: - print("Use inception_score to metric this DCgan model, the reusult is save as a picture named \"inception_score.png\"!") + print("Use inception_score to metric this DCgan model, the reusult is save as a picture " + "named \"inception_score.png\"!") main() - diff --git a/example/gluon/lstm_crf/lstm_crf.py b/example/gluon/lstm_crf/lstm_crf.py index 9c2218577312..011dcfbc4aea 100644 --- a/example/gluon/lstm_crf/lstm_crf.py +++ b/example/gluon/lstm_crf/lstm_crf.py @@ -14,46 +14,50 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +"""This example demonstrates how the LSTM-CRF model can be implemented +in Gluon to perform noun-phrase chunking as a sequence labeling task. +""" +import sys import mxnet as mx from mxnet import autograd as ag, ndarray as nd, gluon from mxnet.gluon import Block, nn, rnn import mxnet.optimizer as optim -import sys - -# This example demonstrates how the LSTM-CRF model can be implemented -# in Gluon to perform noun-phrase chunking as a sequence labeling task. mx.random.seed(1) + # Helper functions to make the code more readable. 
 def to_scalar(x):
     return int(x.asscalar())
 
+
 def argmax(vec):
     # return the argmax as a python int
     idx = nd.argmax(vec, axis=1)
     return to_scalar(idx)
 
-def prepare_sequence(seq, word2idx):
-    return nd.array([word2idx[w] for w in seq])
+
+def prepare_sequence(seq, word_to_idx):
+    return nd.array([word_to_idx[w] for w in seq])
+
 
 # Compute log sum exp is numerically more stable than multiplying probabilities
 def log_sum_exp(vec):
     max_score = nd.max(vec).asscalar()
     return nd.log(nd.sum(nd.exp(vec - max_score))) + max_score
 
+
 # Model
 class BiLSTM_CRF(Block):
-    def __init__(self, vocab_size, tag2idx, embedding_dim, hidden_dim):
+    """BiLSTM-CRF model for sequence labeling"""
+    def __init__(self, vocab_size, tag_to_idx, embedding_dim, hidden_dim):
         super(BiLSTM_CRF, self).__init__()
         with self.name_scope():
             self.embedding_dim = embedding_dim
             self.hidden_dim = hidden_dim
             self.vocab_size = vocab_size
-            self.tag2idx = tag2idx
-            self.tagset_size = len(tag2idx)
-
+            self.tag2idx = tag_to_idx
+            self.tagset_size = len(tag_to_idx)
             self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
             self.lstm = rnn.LSTM(hidden_dim // 2, num_layers=1, bidirectional=True)
@@ -62,9 +66,7 @@ def __init__(self, vocab_size, tag2idx, embedding_dim, hidden_dim):
 
         # Matrix of transition parameters. Entry i,j is the score of
         # transitioning *to* i *from* j.
-        self.transitions = self.params.get("crf_transition_matrix",
-                                           shape=(self.tagset_size, self.tagset_size))
-
+        self.transitions = self.params.get("crf_transition_matrix", shape=(self.tagset_size, self.tagset_size))
         self.hidden = self.init_hidden()
 
     def init_hidden(self):
@@ -98,24 +100,25 @@ def _forward_alg(self, feats):
             alpha = log_sum_exp(terminal_var)
         return alpha
 
-    def _get_lstm_features(self, sentence):
+    def _get_lstm_features(self, sentences):
         self.hidden = self.init_hidden()
-        length = sentence.shape[0]
-        embeds = self.word_embeds(sentence).reshape((length, 1, -1))
+        length = sentences.shape[0]
+        embeds = self.word_embeds(sentences).reshape((length, 1, -1))
         lstm_out, self.hidden = self.lstm(embeds, self.hidden)
         lstm_out = lstm_out.reshape((length, self.hidden_dim))
         lstm_feats = self.hidden2tag(lstm_out)
         return nd.split(lstm_feats, num_outputs=length, axis=0, squeeze_axis=True)
 
-    def _score_sentence(self, feats, tags):
+    def _score_sentence(self, feats, tags_array):
         # Gives the score of a provided tag sequence
         score = nd.array([0])
-        tags = nd.concat(nd.array([self.tag2idx[START_TAG]]), *tags, dim=0)
-        for i, feat in enumerate(feats):
+        tags_array = nd.concat(nd.array([self.tag2idx[START_TAG]]), *tags_array, dim=0)
+        for idx, feat in enumerate(feats):
             score = score + \
-                self.transitions.data()[to_scalar(tags[i+1]), to_scalar(tags[i])] + feat[to_scalar(tags[i+1])]
+                self.transitions.data()[to_scalar(tags_array[idx+1]),
+                                        to_scalar(tags_array[idx])] + feat[to_scalar(tags_array[idx+1])]
         score = score + self.transitions.data()[self.tag2idx[STOP_TAG],
-                                                to_scalar(tags[int(tags.shape[0]-1)])]
+                                                to_scalar(tags_array[int(tags_array.shape[0]-1)])]
         return score
 
     def _viterbi_decode(self, feats):
@@ -160,20 +163,21 @@ def _viterbi_decode(self, feats):
         best_path.reverse()
         return path_score, best_path
 
-    def neg_log_likelihood(self, sentence, tags):
-        feats = self._get_lstm_features(sentence)
+    def neg_log_likelihood(self, sentences, tags_list):
+        feats = self._get_lstm_features(sentences)
         forward_score = self._forward_alg(feats)
-        gold_score = self._score_sentence(feats, tags)
+        gold_score = self._score_sentence(feats, tags_list)
         return forward_score - gold_score
 
-    def forward(self, sentence):  # dont confuse this with _forward_alg above.
+    def forward(self, sentences):  # don't confuse this with _forward_alg above.
         # Get the emission scores from the BiLSTM
-        lstm_feats = self._get_lstm_features(sentence)
+        lstm_feats = self._get_lstm_features(sentences)
         # Find the best path, given the features.
         score, tag_seq = self._viterbi_decode(lstm_feats)
         return score, tag_seq
 
+
 # Run training
 START_TAG = "<START>"
 STOP_TAG = "<STOP>"
@@ -210,6 +214,7 @@ def forward(self, sentence):  # dont confuse this with _forward_alg above.
 
 for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
     neg_log_likelihood_acc = 0.
+    last_iter = 0
     for i, (sentence, tags) in enumerate(training_data):
         # Step 1. Get our inputs ready for the network, that is,
         # turn them into Variables of word indices.
@@ -226,7 +231,8 @@ def forward(self, sentence):  # dont confuse this with _forward_alg above.
         neg_log_likelihood.backward()
         optimizer.step(1)
         neg_log_likelihood_acc += neg_log_likelihood.mean()
-    print("Epoch [{}], Negative Log Likelihood {:.4f}".format(epoch, neg_log_likelihood_acc.asscalar()/(i+1)))
+        last_iter = i
+    print("Epoch [{}], Negative Log Likelihood {:.4f}".format(epoch, neg_log_likelihood_acc.asscalar()/(last_iter+1)))
 
 # Check predictions after training
 precheck_sent = prepare_sequence(training_data[0][0], word2idx)