diff --git a/make/config.mk b/make/config.mk
index 223e4edc056c..484f60066a72 100644
--- a/make/config.mk
+++ b/make/config.mk
@@ -118,4 +118,4 @@ TORCH_PATH = $(HOME)/torch
 # whether to use sframe integration. This requires build sframe
 # git@github.com:dato-code/SFrame.git
 # SFRAME_PATH = $(HOME)/SFrame
-# MXNET_PLUGINS += plugin/sframe/SFrame.mk
+# MXNET_PLUGINS += plugin/sframe/plugin.mk
diff --git a/plugin/sframe/SFrame.mk b/plugin/sframe/plugin.mk
similarity index 100%
rename from plugin/sframe/SFrame.mk
rename to plugin/sframe/plugin.mk
diff --git a/python/mxnet/model.py b/python/mxnet/model.py
index 39a2c584be96..7458fd29e83b 100644
--- a/python/mxnet/model.py
+++ b/python/mxnet/model.py
@@ -80,7 +80,6 @@ def _initialize_kvstore(kvstore, param_arrays, arg_params, param_names,
     for idx in range(len(param_arrays)):
         param_on_devs = param_arrays[idx]
         kvstore.init(idx, arg_params[param_names[idx]])
-
         if update_on_kvstore:
             kvstore.pull(idx, param_on_devs, priority=-idx)
 
@@ -202,7 +201,6 @@ def _train_multi_device(symbol, ctx, arg_names, param_names, aux_names,
 
     if update_on_kvstore:
         kvstore.set_optimizer(optimizer)
-
     # Now start training
     for epoch in range(begin_epoch, end_epoch):
         # Training phase
@@ -416,6 +414,9 @@ def __init__(self, symbol, ctx=None,
             ctx = [cpu()]
         elif isinstance(ctx, Context):
             ctx = [ctx]
+        # disable multi-CPU data parallelism because BLAS will use all CPU resources
+        if ctx[0].device_type == "cpu" and len(ctx) > 1:
+            ctx = [cpu()]
         self.ctx = ctx
         # training parameters
         self.num_epoch = num_epoch
diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h
index fa7dc0474ed5..ce209c2ed9e7 100644
--- a/src/operator/mshadow_op.h
+++ b/src/operator/mshadow_op.h
@@ -88,7 +88,7 @@ struct elu {
 struct elu_grad {
   template<typename DType>
   MSHADOW_XINLINE static DType Map(DType x, DType a) {
-    return DType(x > 0.0f ? 1.0f : a * expf(x));
+    return DType(x > 0.0f ? 1.0f : a + x);  // assumes x is the ELU output -- TODO confirm
   }
 };