diff --git a/example/bayesian-methods/README.md b/example/bayesian-methods/README.md
index ec9e8be86927..fc35b94219d7 100644
--- a/example/bayesian-methods/README.md
+++ b/example/bayesian-methods/README.md
@@ -11,3 +11,27 @@ and *Bayesian Dark Knowledge (BDK)* [<cite>(Balan, Rathod, Murphy and Welling, 2
 **bdk.ipynb** shows how to use MXNet to implement the DistilledSGLD algorithm in Bayesian Dark Knowledge.
 
 **bdk_demo.py** contains scripts (more than the notebook) related to Bayesian Dark Knowledge. Use `python bdk_demo.py -d 1 -l 2 -t 50000` to run classification on MNIST. 
+
+View parameters we can use with the following command.
+
+```shell
+python bdk_demo.py -h
+
+
+usage: bdk_demo.py [-h] [-d DATASET] [-l ALGORITHM] [-t TRAINING] [--gpu GPU]
+
+Examples in the paper [NIPS2015]Bayesian Dark Knowledge and [ICML2011]Bayesian
+Learning via Stochastic Gradient Langevin Dynamics
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -d DATASET, --dataset DATASET
+                        Dataset to use. 0 --> TOY, 1 --> MNIST, 2 -->
+                        Synthetic Data in the SGLD paper
+  -l ALGORITHM, --algorithm ALGORITHM
+                        Type of algorithm to use. 0 --> SGD, 1 --> SGLD,
+                        other-->DistilledSGLD
+  -t TRAINING, --training TRAINING
+                        Number of training samples
+  --gpu GPU             0 to use GPU, not set to use CPU
+```
diff --git a/example/bayesian-methods/bdk_demo.py b/example/bayesian-methods/bdk_demo.py
index 145dac10e2a6..cd39bfd2a7c9 100644
--- a/example/bayesian-methods/bdk_demo.py
+++ b/example/bayesian-methods/bdk_demo.py
@@ -156,34 +156,34 @@ def get_toy_sym(teacher=True, teacher_noise_precision=None):
     return net
 
 
-def dev():
-    return mx.gpu()
+def dev(gpu_id=None):
+    return mx.gpu(gpu_id) if gpu_id else mx.cpu()
 
 
-def run_mnist_SGD(training_num=50000):
+def run_mnist_SGD(training_num=50000, gpu_id=None):
     X, Y, X_test, Y_test = load_mnist(training_num)
     minibatch_size = 100
     net = get_mnist_sym()
     data_shape = (minibatch_size,) + X.shape[1::]
-    data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
-                   'softmax_label': nd.zeros((minibatch_size,), ctx=dev())}
+    data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id)),
+                   'softmax_label': nd.zeros((minibatch_size,), ctx=dev(gpu_id))}
     initializer = mx.init.Xavier(factor_type="in", magnitude=2.34)
-    exe, exe_params, _ = SGD(sym=net, dev=dev(), data_inputs=data_inputs, X=X, Y=Y,
+    exe, exe_params, _ = SGD(sym=net, dev=dev(gpu_id), data_inputs=data_inputs, X=X, Y=Y,
                              X_test=X_test, Y_test=Y_test,
                              total_iter_num=1000000,
                              initializer=initializer,
                              lr=5E-6, prior_precision=1.0, minibatch_size=100)
 
 
-def run_mnist_SGLD(training_num=50000):
+def run_mnist_SGLD(training_num=50000, gpu_id=None):
     X, Y, X_test, Y_test = load_mnist(training_num)
     minibatch_size = 100
     net = get_mnist_sym()
     data_shape = (minibatch_size,) + X.shape[1::]
-    data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
-                   'softmax_label': nd.zeros((minibatch_size,), ctx=dev())}
+    data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id)),
+                   'softmax_label': nd.zeros((minibatch_size,), ctx=dev(gpu_id))}
     initializer = mx.init.Xavier(factor_type="in", magnitude=2.34)
-    exe, sample_pool = SGLD(sym=net, dev=dev(), data_inputs=data_inputs, X=X, Y=Y,
+    exe, sample_pool = SGLD(sym=net, dev=dev(gpu_id), data_inputs=data_inputs, X=X, Y=Y,
                             X_test=X_test, Y_test=Y_test,
                             total_iter_num=1000000,
                             initializer=initializer,
@@ -191,7 +191,7 @@ def run_mnist_SGLD(training_num=50000):
                             thin_interval=100, burn_in_iter_num=1000)
 
 
-def run_mnist_DistilledSGLD(training_num=50000):
+def run_mnist_DistilledSGLD(training_num=50000, gpu_id=None):
     X, Y, X_test, Y_test = load_mnist(training_num)
     minibatch_size = 100
     if training_num >= 10000:
@@ -214,10 +214,10 @@ def run_mnist_DistilledSGLD(training_num=50000):
     logsoftmax = LogSoftmax()
     student_net = get_mnist_sym(output_op=logsoftmax, num_hidden=num_hidden)
     data_shape = (minibatch_size,) + X.shape[1::]
-    teacher_data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
-                           'softmax_label': nd.zeros((minibatch_size,), ctx=dev())}
-    student_data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
-                           'softmax_label': nd.zeros((minibatch_size, 10), ctx=dev())}
+    teacher_data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id)),
+                           'softmax_label': nd.zeros((minibatch_size,), ctx=dev(gpu_id))}
+    student_data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id)),
+                           'softmax_label': nd.zeros((minibatch_size, 10), ctx=dev(gpu_id))}
     teacher_initializer = BiasXavier(factor_type="in", magnitude=1)
     student_initializer = BiasXavier(factor_type="in", magnitude=1)
     student_exe, student_params, _ = \
@@ -231,17 +231,17 @@ def run_mnist_DistilledSGLD(training_num=50000):
                       teacher_learning_rate=teacher_learning_rate,
                       student_learning_rate=student_learning_rate,
                       teacher_prior_precision=teacher_prior, student_prior_precision=student_prior,
-                      perturb_deviation=perturb_deviation, minibatch_size=100, dev=dev())
+                      perturb_deviation=perturb_deviation, minibatch_size=100, dev=dev(gpu_id))
 
 
-def run_toy_SGLD():
+def run_toy_SGLD(gpu_id=None):
     X, Y, X_test, Y_test = load_toy()
     minibatch_size = 1
     teacher_noise_precision = 1.0 / 9.0
     net = get_toy_sym(True, teacher_noise_precision)
     data_shape = (minibatch_size,) + X.shape[1::]
-    data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
-                   'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev())}
+    data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id)),
+                   'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev(gpu_id))}
     initializer = mx.init.Uniform(0.07)
     exe, params, _ = \
         SGLD(sym=net, data_inputs=data_inputs,
@@ -253,20 +253,20 @@ def run_toy_SGLD():
              burn_in_iter_num=1000,
              thin_interval=10,
              task='regression',
-             minibatch_size=minibatch_size, dev=dev())
+             minibatch_size=minibatch_size, dev=dev(gpu_id))
 
 
-def run_toy_DistilledSGLD():
+def run_toy_DistilledSGLD(gpu_id=None):
     X, Y, X_test, Y_test = load_toy()
     minibatch_size = 1
     teacher_noise_precision = 1.0
     teacher_net = get_toy_sym(True, teacher_noise_precision)
     student_net = get_toy_sym(False)
     data_shape = (minibatch_size,) + X.shape[1::]
-    teacher_data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
-                           'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev())}
-    student_data_inputs = {'data': nd.zeros(data_shape, ctx=dev())}
-    #                   'softmax_label': nd.zeros((minibatch_size, 10), ctx=dev())}
+    teacher_data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id)),
+                           'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev(gpu_id))}
+    student_data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id))}
+
     teacher_initializer = mx.init.Uniform(0.07)
     student_initializer = mx.init.Uniform(0.07)
     student_grad_f = lambda student_outputs, teacher_pred: \
@@ -284,21 +284,21 @@ def run_toy_DistilledSGLD():
                       student_grad_f=student_grad_f,
                       teacher_prior_precision=0.1, student_prior_precision=0.001,
                       perturb_deviation=0.1, minibatch_size=minibatch_size, task='regression',
-                      dev=dev())
+                      dev=dev(gpu_id))
 
 
-def run_toy_HMC():
+def run_toy_HMC(gpu_id=None):
     X, Y, X_test, Y_test = load_toy()
     minibatch_size = Y.shape[0]
     noise_precision = 1 / 9.0
     net = get_toy_sym(True, noise_precision)
     data_shape = (minibatch_size,) + X.shape[1::]
-    data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
-                   'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev())}
+    data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id)),
+                   'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev(gpu_id))}
     initializer = mx.init.Uniform(0.07)
     sample_pool = HMC(net, data_inputs=data_inputs, X=X, Y=Y, X_test=X_test, Y_test=Y_test,
                       sample_num=300000, initializer=initializer, prior_precision=1.0,
-                      learning_rate=1E-3, L=10, dev=dev())
+                      learning_rate=1E-3, L=10, dev=dev(gpu_id))
 
 
 def run_synthetic_SGLD():
@@ -350,21 +350,22 @@ def run_synthetic_SGLD():
                         help="Type of algorithm to use. 0 --> SGD, 1 --> SGLD, other-->DistilledSGLD")
     parser.add_argument("-t", "--training", type=int, default=50000,
                         help="Number of training samples")
+    parser.add_argument("--gpu", type=int, help="0 to use GPU, not set to use CPU")
     args = parser.parse_args()
     training_num = args.training
     if args.dataset == 1:
         if 0 == args.algorithm:
-            run_mnist_SGD(training_num)
+            run_mnist_SGD(training_num, gpu_id=args.gpu)
         elif 1 == args.algorithm:
-            run_mnist_SGLD(training_num)
+            run_mnist_SGLD(training_num, gpu_id=args.gpu)
         else:
-            run_mnist_DistilledSGLD(training_num)
+            run_mnist_DistilledSGLD(training_num, gpu_id=args.gpu)
     elif args.dataset == 0:
         if 1 == args.algorithm:
-            run_toy_SGLD()
+            run_toy_SGLD(gpu_id=args.gpu)
         elif 2 == args.algorithm:
-            run_toy_DistilledSGLD()
+            run_toy_DistilledSGLD(gpu_id=args.gpu)
         elif 3 == args.algorithm:
-            run_toy_HMC()
+            run_toy_HMC(gpu_id=args.gpu)
     else:
         run_synthetic_SGLD()
diff --git a/example/fcn-xs/README.md b/example/fcn-xs/README.md
index 145aa31cb700..49c57fc08eaf 100644
--- a/example/fcn-xs/README.md
+++ b/example/fcn-xs/README.md
@@ -40,14 +40,33 @@ this is the fully convolution style of the origin
 Once you completed all these steps, your working directory should contain a ```.\VOC2012``` directory, which contains the following: ```JPEGImages folder```, ```SegmentationClass folder```, ```train.lst```, ```val.lst```
 
 #### Step 3: Train the fcn-xs model
-* Based on your hardware, configure GPU or CPU for training in `fcn_xs.py`. It is recommended to use GPU due to the computational complexity and data load.
-```python
-# ctx = mx.cpu(0)
-ctx = mx.gpu(0)
+* Based on your hardware, configure CPU or GPU for training by parameter ```--gpu```. It is recommended to use GPU due to the computational complexity and data load. 
+View parameters we can use with the following command.
+```shell
+python fcn_xs.py -h
+
+
+usage: fcn_xs.py [-h] [--model MODEL] [--prefix PREFIX] [--epoch EPOCH]
+                 [--init-type INIT_TYPE] [--retrain] [--gpu GPU]
+
+Convert vgg16 model to vgg16fc model.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --model MODEL         The type of fcn-xs model, e.g. fcnxs, fcn16s, fcn8s.
+  --prefix PREFIX       The prefix(include path) of vgg16 model with mxnet
+                        format.
+  --epoch EPOCH         The epoch number of vgg16 model.
+  --init-type INIT_TYPE
+                        the init type of fcn-xs model, e.g. vgg16, fcnxs
+  --retrain             true means continue training.
+  --gpu GPU             0 to use GPU, not set to use CPU
 ```
+
 * It is recommended to train fcn-32s and fcn-16s before training the fcn-8s model
 
 To train the fcn-32s model, run the following:
+
 ```shell
 python -u fcn_xs.py --model=fcn32s --prefix=VGG_FC_ILSVRC_16_layers --epoch=74 --init-type=vgg16
 ```
diff --git a/example/fcn-xs/fcn_xs.py b/example/fcn-xs/fcn_xs.py
index 53244a1759c3..5b799f32e46e 100644
--- a/example/fcn-xs/fcn_xs.py
+++ b/example/fcn-xs/fcn_xs.py
@@ -28,9 +28,10 @@
 
 logger = logging.getLogger()
 logger.setLevel(logging.INFO)
-ctx = mx.gpu(0)
+
 
 def main():
+    ctx = mx.cpu() if not args.gpu else mx.gpu(args.gpu)
     fcnxs = symbol_fcnxs.get_fcn32s_symbol(numclass=21, workspace_default=1536)
     fcnxs_model_prefix = "model_pascal/FCN32s_VGG16"
     if args.model == "fcn16s":
@@ -85,6 +86,7 @@ def main():
         help='the init type of fcn-xs model, e.g. vgg16, fcnxs')
     parser.add_argument('--retrain', action='store_true', default=False,
         help='true means continue training.')
+    parser.add_argument("--gpu", type=int, help="0 to use GPU, not set to use CPU")
     args = parser.parse_args()
     logging.info(args)
     main()
diff --git a/example/rcnn/README.md b/example/rcnn/README.md
index b5284183d160..5e6127ccb08d 100644
--- a/example/rcnn/README.md
+++ b/example/rcnn/README.md
@@ -9,7 +9,7 @@ For a gluon imperative version, checkout https://github.com/dmlc/gluon-cv.
 
 ### Out-of-box inference models
 Download any of the following models to the current directory and run `python3 demo.py --dataset $Dataset$ --network $Network$ --params $MODEL_FILE$ --image $YOUR_IMAGE$` to get single image inference.
-For example `python3 demo.py --dataset voc --network vgg16 --params vgg16_voc0712.params --image myimage.jpg`, add `--gpu 0` to use GPU optionally.
+For example `python3 demo.py --dataset voc --network vgg16 --params vgg16_voc0712.params --image myimage.jpg`, add `--gpu 0` to use GPU, not set to use CPU. 
 Different network has different configuration. Different dataset has different object class names. You must pass them explicitly as command line arguments.
 
 | Network | Dataset | Imageset | Reference | Result | Link  |
diff --git a/example/rcnn/demo.py b/example/rcnn/demo.py
index 2315bb8af366..b0a4ddbeab49 100644
--- a/example/rcnn/demo.py
+++ b/example/rcnn/demo.py
@@ -92,7 +92,7 @@ def parse_args():
     parser.add_argument('--params', type=str, default='', help='path to trained model')
     parser.add_argument('--dataset', type=str, default='voc', help='training dataset')
     parser.add_argument('--image', type=str, default='', help='path to test image')
-    parser.add_argument('--gpu', type=str, default='', help='gpu device eg. 0')
+    parser.add_argument('--gpu', type=str, default='', help='GPU devices, eg."0,1,2,3" , not set to use CPU.')
     parser.add_argument('--vis', action='store_true', help='display results')
     parser.add_argument('--vis-thresh', type=float, default=0.7, help='threshold display boxes')
     # faster rcnn params
diff --git a/example/rcnn/test.py b/example/rcnn/test.py
index 3c047d222016..e964c9080667 100644
--- a/example/rcnn/test.py
+++ b/example/rcnn/test.py
@@ -35,7 +35,7 @@ def test_net(sym, imdb, args):
     logger.info('called with args\n{}'.format(pprint.pformat(vars(args))))
 
     # setup context
-    ctx = mx.gpu(args.gpu)
+    ctx = mx.cpu() if not args.gpu else mx.gpu(args.gpu)
 
     # load testing data
     test_data = TestLoader(imdb.roidb, batch_size=1, short=args.img_short_side, max_size=args.img_long_side,
@@ -94,7 +94,7 @@ def parse_args():
     parser.add_argument('--params', type=str, default='', help='path to trained model')
     parser.add_argument('--dataset', type=str, default='voc', help='training dataset')
     parser.add_argument('--imageset', type=str, default='', help='imageset splits')
-    parser.add_argument('--gpu', type=int, default=0, help='gpu device eg. 0')
+    parser.add_argument('--gpu', type=int, default=0, help='0 to use GPU, not set to use CPU')
     # faster rcnn params
     parser.add_argument('--img-short-side', type=int, default=600)
     parser.add_argument('--img-long-side', type=int, default=1000)
diff --git a/example/rcnn/train.py b/example/rcnn/train.py
index 0739069afb4a..7b1f2f7f31a5 100644
--- a/example/rcnn/train.py
+++ b/example/rcnn/train.py
@@ -33,7 +33,7 @@ def train_net(sym, roidb, args):
     logger.info('called with args\n{}'.format(pprint.pformat(vars(args))))
 
     # setup multi-gpu
-    ctx = [mx.gpu(int(i)) for i in args.gpus.split(',')]
+    ctx = [mx.cpu()] if not args.gpus else [mx.gpu(int(i)) for i in args.gpus.split(',')]
     batch_size = args.rcnn_batch_size * len(ctx)
 
     # load training data
@@ -127,7 +127,7 @@ def parse_args():
     parser.add_argument('--pretrained', type=str, default='', help='path to pretrained model')
     parser.add_argument('--dataset', type=str, default='voc', help='training dataset')
     parser.add_argument('--imageset', type=str, default='', help='imageset splits')
-    parser.add_argument('--gpus', type=str, default='0', help='gpu devices eg. 0,1')
+    parser.add_argument('--gpus', type=str, help='GPU devices, eg: "0,1,2,3" , not set to use CPU')
     parser.add_argument('--epochs', type=int, default=10, help='training epochs')
     parser.add_argument('--lr', type=float, default=0.001, help='base learning rate')
     parser.add_argument('--lr-decay-epoch', type=str, default='7', help='epoch to decay lr')