diff --git a/Makefile b/Makefile
index a0bd046fbb7a..ea71cd3fff86 100644
--- a/Makefile
+++ b/Makefile
@@ -98,7 +98,7 @@ OBJ = $(patsubst src/%.cc, build/%.o, $(SRC))
 CUSRC = $(wildcard src/*/*.cu)
 CUOBJ = $(patsubst src/%.cu, build/%_gpu.o, $(CUSRC))
 
-ifneq ($(EXTRA_OPERATORS), NONE)
+ifneq ($(EXTRA_OPERATORS),)
 	EXTRA_SRC = $(wildcard $(EXTRA_OPERATORS)/*.cc $(EXTRA_OPERATORS)/*/*.cc)
 	EXTRA_OBJ = $(patsubst $(EXTRA_OPERATORS)/%.cc, $(EXTRA_OPERATORS)/build/%.o, $(EXTRA_SRC))
 	EXTRA_CUSRC = $(wildcard $(EXTRA_OPERATORS)/*.cu $(EXTRA_OPERATORS)/*/*.cu)
@@ -114,9 +114,15 @@ LIB_DEP += $(DMLC_CORE)/libdmlc.a
 ALL_DEP = $(OBJ) $(EXTRA_OBJ) $(LIB_DEP)
 ifeq ($(USE_CUDA), 1)
 	ALL_DEP += $(CUOBJ) $(EXTRA_CUOBJ)
-	LDFLAGS += -lnvrtc -lcuda
+	LDFLAGS += -lcuda
 endif
 
+ifeq ($(USE_NVRTC), 1)
+	LDFLAGS += -lnvrtc
+	CFLAGS += -DMXNET_USE_NVRTC=1
+else
+	CFLAGS += -DMXNET_USE_NVRTC=0
+endif
+
 build/%.o: src/%.cc
@@ -201,3 +207,6 @@ clean_all: clean
 
 -include build/*.d
 -include build/*/*.d
+ifneq ($(EXTRA_OPERATORS),)
+	-include $(EXTRA_OPERATORS)/build/*.d
+endif
diff --git a/include/mxnet/mxrtc.h b/include/mxnet/mxrtc.h
index e0418110277c..de8c385549bb 100644
--- a/include/mxnet/mxrtc.h
+++ b/include/mxnet/mxrtc.h
@@ -7,8 +7,7 @@
 #ifndef MXNET_MXRTC_H_
 #define MXNET_MXRTC_H_
 #include "./base.h"
-#if MXNET_USE_CUDA
-
+#if ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC))
 #include <nvrtc.h>
 #include <cuda.h>
 
@@ -88,5 +87,5 @@ class MXRtc {
 
 }  // namespace mxnet
 
-#endif  // MXNET_USE_CUDA
+#endif  // MXNET_USE_CUDA && MXNET_USE_NVRTC
 #endif  // MXNET_MXRTC_H_
diff --git a/make/config.mk b/make/config.mk
index 6585e5299f5e..8e9f8af3a5da 100644
--- a/make/config.mk
+++ b/make/config.mk
@@ -48,6 +48,9 @@ USE_CUDA_PATH = NONE
 # whether use CUDNN R3 library
 USE_CUDNN = 0
 
+# whether use cuda runtime compiling for writing kernels in native language (e.g. Python)
+USE_NVRTC = 0
+
 # whether use opencv during compilation
 # you can disable it, however, you will not able to use
 # imbin iterator
diff --git a/make/osx.mk b/make/osx.mk
index 13a6389bba04..23c2c7a363e5 100644
--- a/make/osx.mk
+++ b/make/osx.mk
@@ -48,6 +48,9 @@ USE_CUDA_PATH = NONE
 # whether use CUDNN R3 library
 USE_CUDNN = 0
 
+# whether use cuda runtime compiling for writing kernels in native language (e.g. Python)
+USE_NVRTC = 0
+
 # whether use opencv during compilation
 # you can disable it, however, you will not able to use
 # imbin iterator
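With this split, runtime compilation becomes opt-in rather than implied by USE_CUDA: -lnvrtc is only linked and MXNET_USE_NVRTC=1 only defined when the new flag is set. A sketch of a build configuration that turns it on (the path and values are illustrative, not part of the patch):

    # in make/config.mk, or overridden on the make command line
    USE_CUDA = 1
    USE_CUDA_PATH = /usr/local/cuda
    USE_NVRTC = 1

Leaving USE_NVRTC at 0 compiles MXNET_USE_NVRTC=0, so the MXRtc code paths guarded below drop out of the build entirely.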
diff --git a/python/mxnet/initializer.py b/python/mxnet/initializer.py
index cddb6a9a07fa..ef371b190808 100644
--- a/python/mxnet/initializer.py
+++ b/python/mxnet/initializer.py
@@ -4,8 +4,10 @@
 import numpy as np
 
 from .base import string_types
-from .ndarray import NDArray
+from .ndarray import NDArray, load
 from . import random
+import logging
+import re
 
 class Initializer(object):
     """Base class for Initializer."""
@@ -75,6 +77,71 @@ def _init_default(self, name, _):
         raise ValueError('Unknown initialization pattern for %s' % name)
     # pylint: enable=no-self-use, missing-docstring, invalid-name
 
+class Load(object):
+    """Initialize parameters by loading pretrained values from a file or dict.
+
+    Parameters
+    ----------
+    param: str or dict of str->NDArray
+        param file or dict mapping name to NDArray.
+    default_init: Initializer
+        default initializer when name is not found in param.
+    verbose: bool
+        whether to log the source of each initialized parameter.
+    """
+    def __init__(self, param, default_init=None, verbose=False):
+        if isinstance(param, str):
+            param = load(param)
+        assert isinstance(param, dict)
+        self.param = {}
+        for name, arr in param.items():
+            if name.startswith('arg:'):
+                self.param[name[4:]] = arr
+            else:
+                self.param[name] = arr
+        self.default_init = default_init
+        self.verbose = verbose
+
+    def __call__(self, name, arr):
+        if name in self.param:
+            assert arr.shape == self.param[name].shape, \
+                'Parameter %s cannot be initialized from loading. '%name + \
+                'Shape mismatch, target %s vs loaded %s'%(str(arr.shape),
+                                                          str(self.param[name].shape))
+            arr[:] = self.param[name]
+            if self.verbose:
+                logging.info('Initialized %s by loading', name)
+        else:
+            assert self.default_init is not None, \
+                "Cannot initialize %s. Not found in loaded param "%name + \
+                "and no default Initializer is provided."
+            self.default_init(name, arr)
+            if self.verbose:
+                logging.info('Initialized %s by default', name)
+
+class Mixed(object):
+    """Initialize parameters with multiple Initializers, dispatched by name pattern.
+
+    Parameters
+    ----------
+    patterns: list of str
+        list of regular expression patterns to match parameter names.
+    initializers: list of Initializer
+        list of Initializers corresponding to patterns.
+    """
+    def __init__(self, patterns, initializers):
+        assert len(patterns) == len(initializers)
+        self.map = list(zip([re.compile(p) for p in patterns], initializers))
+
+    def __call__(self, name, arr):
+        for prog, init in self.map:
+            if prog.match(name):
+                init(name, arr)
+                return
+        raise ValueError('Parameter name %s did not match any pattern. Consider '
+                         'adding a ".*" pattern at the end with a default Initializer.' % name)
+
+
 class Uniform(Initializer):
     """Initialize the weight with uniform [-scale, scale]
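Both new classes are plain callables following the (name, arr) convention used by the other initializers in this file. A minimal usage sketch; the mx.nd alias and the Xavier initializer are assumed to be available here, and all values are purely illustrative:

    import mxnet as mx
    from mxnet.initializer import Load, Mixed, Uniform, Xavier

    # Pretrained values; an 'arg:' prefix (as written by checkpoints) is stripped.
    pretrained = {'arg:fc1_weight': mx.nd.zeros((10, 5))}
    # Reuse what is present, fall back to Uniform for anything new.
    init = Load(pretrained, default_init=Uniform(0.07), verbose=True)

    # Dispatch by name: small uniform noise for biases, Xavier elsewhere.
    # Patterns are tried in order, so the catch-all '.*' goes last.
    init = Mixed(['.*bias', '.*'], [Uniform(0.001), Xavier()])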
""" - def __init__(self, interval, stat_func=None): + def __init__(self, interval, stat_func=None, pattern='.*', sort=False): if stat_func is None: def asum_stat(x): """returns |x|/size(x), async execution.""" @@ -33,9 +39,11 @@ def asum_stat(x): self.queue = [] self.step = 0 self.exes = [] + self.re_prog = re.compile(pattern) + self.sort = sort def stat_helper(name, array): """wrapper for executor callback""" - if not self.activated: + if not self.activated or not self.re_prog.match(name): return array = ctypes.cast(array, NDArrayHandle) array = NDArray(array, writable=False) @@ -73,23 +81,31 @@ def toc(self): Returns ------- res : list of """ - if self.activated: - for exe in self.exes: - for array in exe.arg_arrays: - array.wait_to_read() - for exe in self.exes: - for name, array in zip(exe._symbol.list_arguments(), exe.arg_arrays): - self.queue.append((self.step, name, self.stat_func(array))) - else: + if not self.activated: return [] + for exe in self.exes: + for array in exe.arg_arrays: + array.wait_to_read() + for exe in self.exes: + for name, array in zip(exe._symbol.list_arguments(), exe.arg_arrays): + if self.re_prog.match(name): + self.queue.append((self.step, name, self.stat_func(array))) self.activated = False res = [] - for n, k, v in self.queue: - assert isinstance(v, NDArray) - if v.shape == (1,): - res.append((n, k, str(v.asscalar()))) - else: - res.append((n, k, str(v.asnumpy()))) + if self.sort: + self.queue.sort(key=lambda x: x[1]) + for n, k, v_list in self.queue: + if isinstance(v_list, NDArray): + v_list = [v_list] + assert isinstance(v_list, list) + s = '' + for v in v_list: + assert isinstance(v, NDArray) + if v.shape == (1,): + s += str(v.asscalar()) + '\t' + else: + s += str(v.asnumpy()) + '\t' + res.append((n, k, s)) self.queue = [] return res diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 2c913d85ddf5..3deea52f9e9d 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -1154,7 +1154,7 @@ int MXRtcCreate(char* name, mx_uint num_input, mx_uint num_output, NDArrayHandle* inputs, NDArrayHandle* outputs, char* kernel, RtcHandle *out) { API_BEGIN(); -#if MXNET_USE_CUDA +#if ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC)) std::vector > input, output; for (mx_uint i = 0; i < num_input; ++i) { input.push_back(std::pair(input_names[i], @@ -1167,8 +1167,8 @@ int MXRtcCreate(char* name, mx_uint num_input, mx_uint num_output, MXRtc *rtc = new MXRtc(name, input, output, kernel); *out = reinterpret_cast(rtc); #else - LOG(FATAL) << "Need to compile with USE_CUDA=1 for MXRtc."; -#endif // MXNET_USE_CUDA + LOG(FATAL) << "Need to compile with USE_CUDA=1 and USE_NVRTC=1 for MXRtc."; +#endif // ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC)) API_END(); } @@ -1181,7 +1181,7 @@ int MXRtcPush(RtcHandle handle, mx_uint num_input, mx_uint num_output, mx_uint blockDimY, mx_uint blockDimZ) { API_BEGIN(); -#if MXNET_USE_CUDA +#if ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC)) std::vector input, output; for (mx_uint i = 0; i < num_input; ++i) { input.push_back(*reinterpret_cast(inputs[i])); @@ -1197,18 +1197,18 @@ int MXRtcPush(RtcHandle handle, mx_uint num_input, mx_uint num_output, blockDimY, blockDimZ); #else - LOG(FATAL) << "Need to compile with USE_CUDA=1 for MXRtc."; -#endif // MXNET_USE_CUDA + LOG(FATAL) << "Need to compile with USE_CUDA=1 and USE_NVRTC=1 for MXRtc."; +#endif // ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC)) API_END(); } int MXRtcFree(RtcHandle handle) { API_BEGIN(); -#if MXNET_USE_CUDA +#if ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC)) delete reinterpret_cast(handle); #else - 
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 2c913d85ddf5..3deea52f9e9d 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -1154,7 +1154,7 @@ int MXRtcCreate(char* name, mx_uint num_input, mx_uint num_output,
                 NDArrayHandle* inputs, NDArrayHandle* outputs,
                 char* kernel, RtcHandle *out) {
   API_BEGIN();
-#if MXNET_USE_CUDA
+#if ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC))
   std::vector<std::pair<std::string, NDArray> > input, output;
   for (mx_uint i = 0; i < num_input; ++i) {
     input.push_back(std::pair<std::string, NDArray>(input_names[i],
@@ -1167,8 +1167,8 @@ int MXRtcCreate(char* name, mx_uint num_input, mx_uint num_output,
   MXRtc *rtc = new MXRtc(name, input, output, kernel);
   *out = reinterpret_cast<RtcHandle>(rtc);
 #else
-  LOG(FATAL) << "Need to compile with USE_CUDA=1 for MXRtc.";
-#endif  // MXNET_USE_CUDA
+  LOG(FATAL) << "Need to compile with USE_CUDA=1 and USE_NVRTC=1 for MXRtc.";
+#endif  // ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC))
   API_END();
 }
 
@@ -1181,7 +1181,7 @@ int MXRtcPush(RtcHandle handle, mx_uint num_input, mx_uint num_output,
               mx_uint blockDimY,
               mx_uint blockDimZ) {
   API_BEGIN();
-#if MXNET_USE_CUDA
+#if ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC))
   std::vector<NDArray> input, output;
   for (mx_uint i = 0; i < num_input; ++i) {
     input.push_back(*reinterpret_cast<NDArray*>(inputs[i]));
@@ -1197,18 +1197,18 @@ int MXRtcPush(RtcHandle handle, mx_uint num_input, mx_uint num_output,
                                          blockDimY,
                                          blockDimZ);
 #else
-  LOG(FATAL) << "Need to compile with USE_CUDA=1 for MXRtc.";
-#endif  // MXNET_USE_CUDA
+  LOG(FATAL) << "Need to compile with USE_CUDA=1 and USE_NVRTC=1 for MXRtc.";
+#endif  // ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC))
   API_END();
 }
 
 int MXRtcFree(RtcHandle handle) {
   API_BEGIN();
-#if MXNET_USE_CUDA
+#if ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC))
   delete reinterpret_cast<MXRtc*>(handle);
 #else
-  LOG(FATAL) << "Need to compile with USE_CUDA=1 for MXRtc.";
-#endif  // MXNET_USE_CUDA
+  LOG(FATAL) << "Need to compile with USE_CUDA=1 and USE_NVRTC=1 for MXRtc.";
+#endif  // ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC))
   API_END();
 }
 
diff --git a/src/common/mxrtc.cc b/src/common/mxrtc.cc
index c23e5eacc94f..4fd687267409 100644
--- a/src/common/mxrtc.cc
+++ b/src/common/mxrtc.cc
@@ -5,10 +5,8 @@
  * \author Junyuan Xie
  */
 #include <mxnet/mxrtc.h>
-#if MXNET_USE_CUDA
-
+#if ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC))
 namespace mxnet {
-
 const std::string MXRtc::str_type = "float";
 std::unordered_map<std::string, char*> MXRtc::kernel_registry;
 
@@ -139,4 +137,4 @@ char* MXRtc::compile(const std::string& name, const std::string& code) {
 
 }  // namespace mxnet
 
-#endif  // MXNET_USE_CUDA
+#endif  // ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC))
diff --git a/src/operator/upsampling-inl.h b/src/operator/upsampling-inl.h
index 0ac021627a04..513258f0cc61 100644
--- a/src/operator/upsampling-inl.h
+++ b/src/operator/upsampling-inl.h
@@ -24,6 +24,7 @@ namespace up_enum {
 enum UpSamplingOpInputs {kData, kWeight};
 enum UpSamplingOpOutputs {kOut};
 enum UpSamplingType {kNearest, kBilinear};
+enum UpSamplingMultiInputMode {kConcat, kSum};
 }  // namespace up_enum
 
 struct UpSamplingParam : public dmlc::Parameter<UpSamplingParam> {
@@ -31,6 +32,7 @@ struct UpSamplingParam : public dmlc::Parameter<UpSamplingParam> {
   index_t num_filter;
   int sample_type;
   int num_args;
+  int multi_input_mode;
   DMLC_DECLARE_PARAMETER(UpSamplingParam) {
     DMLC_DECLARE_FIELD(scale)
     .set_range(1, 1000)
@@ -42,6 +44,13 @@ struct UpSamplingParam : public dmlc::Parameter<UpSamplingParam> {
     .add_enum("nearest", up_enum::kNearest)
     .add_enum("bilinear", up_enum::kBilinear)
     .describe("upsampling method");
+    DMLC_DECLARE_FIELD(multi_input_mode)
+    .add_enum("concat", up_enum::kConcat)
+    .add_enum("sum", up_enum::kSum)
+    .set_default(up_enum::kConcat)
+    .describe("How to handle multiple inputs. concat means concatenate upsampled "
+              "images along the channel dimension. sum means add all images together, "
+              "only available for nearest neighbor upsampling.");
     DMLC_DECLARE_FIELD(num_args).set_lower_bound(1)
     .describe("Number of inputs to be upsampled. For nearest neighbor "
               "upsampling, this can be 1-N; the size of output will be"
@@ -66,6 +75,9 @@ class UpSamplingNearestOp : public Operator {
     using namespace mshadow::expr;
     CHECK_EQ(in_data.size(), param_.num_args);
     CHECK_EQ(out_data.size(), 1);
+    if (req[up_enum::kOut] == kNullOp) {
+      return;
+    }
     Stream<xpu> *s = ctx.get_stream<xpu>();
     Tensor<xpu, 4> out = out_data[up_enum::kOut].get<xpu, 4, real_t>(s);
     if (param_.num_args > 1) {
@@ -74,7 +86,15 @@ class UpSamplingNearestOp : public Operator {
         Tensor<xpu, 4> data = in_data[i].get<xpu, 4, real_t>(s);
         int end = begin + data.size(1);
         int scale = out_data[up_enum::kOut].size(2)/in_data[i].size(2);
-        Assign(slice<1>(out, begin, end), req[up_enum::kOut], upsampling_nearest(data, scale));
+        if (param_.multi_input_mode == up_enum::kSum) {
+          if (i == 0) {
+            Assign(out, req[up_enum::kOut], upsampling_nearest(data, scale));
+          } else {
+            out += upsampling_nearest(data, scale);
+          }
+        } else {
+          Assign(slice<1>(out, begin, end), req[up_enum::kOut], upsampling_nearest(data, scale));
+        }
         begin = end;
       }
     } else {
@@ -103,12 +123,21 @@ class UpSamplingNearestOp : public Operator {
         mshadow::Shape<2> in_shape = Shape2(input_grad.shape_[2], input_grad.shape_[3]);
         int end = begin + input_grad.size(1);
         int scale = grad.size(2)/in_shape[0];
-        Assign(input_grad, req[i],
-               pool(slice<1>(grad, begin, end),
-                    in_shape,
-                    scale,
-                    scale,
-                    scale));
+        if (param_.multi_input_mode == up_enum::kSum) {
+          Assign(input_grad, req[i],
+                 pool(grad,
+                      in_shape,
+                      scale,
+                      scale,
+                      scale));
+        } else {
+          Assign(input_grad, req[i],
+                 pool(slice<1>(grad, begin, end),
+                      in_shape,
+                      scale,
+                      scale,
+                      scale));
+        }
         begin = end;
       }
     } else {
@@ -171,7 +200,13 @@ class UpSamplingProp : public OperatorProperty {
           "does not divide output height of " << oh;
         CHECK_EQ(ow%shape[3], 0) << "UpSamplingNearest: input weight of " << shape[3] << \
           "does not divide output weight of " << ow;
-        oshape[1] += shape[1];
+        if (param_.multi_input_mode == up_enum::kSum) {
+          CHECK(oshape[1] == 0 || oshape[1] == shape[1]) << \
+            "Number of channels must be the same when multi_input_mode==sum";
+          oshape[1] = shape[1];
+        } else {
+          oshape[1] += shape[1];
+        }
       }
     } else {
       CHECK_EQ(in_shape->size(), 2) << "Input:[data, weight]";
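To illustrate the new option at the symbol level, a rough sketch; the variable names and shapes are illustrative, and the variadic call with num_args is assumed to follow the existing UpSampling convention rather than being introduced by this patch:

    import mxnet as mx

    high = mx.symbol.Variable('high')  # e.g. (N, C, 16, 16)
    low = mx.symbol.Variable('low')    # e.g. (N, C, 8, 8)
    # 'sum' upsamples every input to the output size (scale * first input)
    # and adds them, so all inputs need the same channel count C.
    # 'concat', the default, stacks the upsampled inputs along channels.
    up = mx.symbol.UpSampling(high, low, scale=2, sample_type='nearest',
                              num_args=2, multi_input_mode='sum')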