diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index f837654d8a89..469d9c01d095 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -66,10 +66,10 @@ enum RowSparseAuxType {kIdx};
 }
 
 enum NDArrayStorageType {
-  kUndefinedStorage,  // undefined storage
-  kDefaultStorage,    // dense
-  kRowSparseStorage,  // row sparse
-  kCSRStorage,        // csr
+  kUndefinedStorage = -1,  // undefined storage
+  kDefaultStorage,         // dense
+  kRowSparseStorage,       // row sparse
+  kCSRStorage,             // csr
 };
 
 /*!
@@ -120,9 +120,13 @@ class NDArray {
       : shape_(shape), offset_(0), dtype_(dtype), entry_({nullptr, 0, 0}) {
     // Assign default aux types if not given
     if (aux_types.size() == 0) {
-      if (storage_type == kRowSparseStorage) aux_types = {ROW_SPARSE_IDX_TYPE};
-      else if (storage_type == kCSRStorage) aux_types = {CSR_IND_PTR_TYPE, CSR_IDX_DTYPE};
-      else LOG(FATAL) << "Unknown storage type";
+      if (storage_type == kRowSparseStorage) {
+        aux_types = {ROW_SPARSE_IDX_TYPE};
+      } else if (storage_type == kCSRStorage) {
+        aux_types = {CSR_IND_PTR_TYPE, CSR_IDX_DTYPE};
+      } else {
+        LOG(FATAL) << "Unknown storage type";
+      }
     }
     ptr_ = std::make_shared<Chunk>(ctx, delay_alloc, aux_types, storage_type);
 #if MKL_EXPERIMENTAL == 1
@@ -631,7 +635,8 @@ class NDArray {
       delay_alloc = false;
     }
   }
-  inline void CheckAndAlloc(const TShape &shape, const std::vector<TShape> &aux_shapes, int dtype) {
+  inline void CheckAndAlloc(const TShape &shape, const std::vector<TShape> &aux_shapes,
+                            int dtype) {
     // calculate size, perform allocation
     if (delay_alloc) {
       CHECK_EQ(storage_type, kRowSparseStorage) << "Not yet implemented";
diff --git a/nnvm b/nnvm
index 91b5fa6db5db..6f07a314d49c 160000
--- a/nnvm
+++ b/nnvm
@@ -1 +1 @@
-Subproject commit 91b5fa6db5db4af4405aaa37625549d9db399491
+Subproject commit 6f07a314d49cc92aafa89f2d8a21cb65f3f004c5
diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py
index 4d1abc91bf09..4ee7ade22998 100644
--- a/python/mxnet/executor.py
+++ b/python/mxnet/executor.py
@@ -11,7 +11,7 @@
 from .base import mx_uint, NDArrayHandle, ExecutorHandle
 from .base import check_call, c_array, py_str
 from .ndarray import NDArray
-from .sparse_ndarray import SparseNDArray
+from .sparse_ndarray import SparseNDArray, _STORAGE_TYPE_STR_TO_ID
 from . import ndarray as nd
 # those functions are not used here, we just import them to keep backward compatibility
@@ -88,8 +88,10 @@ def _get_outputs(self):
             storage_type = ctypes.c_int(0)
             check_call(_LIB.MXNDArrayGetStorageType(ctypes.cast(handles[i], NDArrayHandle),
                                                     ctypes.byref(storage_type)))
-            output = NDArray(NDArrayHandle(handles[i])) if storage_type.value == 1 \
-                else SparseNDArray(NDArrayHandle(handles[i]))
+            assert(storage_type != _STORAGE_TYPE_STR_TO_ID['undefined'])
+            output = NDArray(NDArrayHandle(handles[i])) \
+                if storage_type.value == _STORAGE_TYPE_STR_TO_ID['default'] \
+                else SparseNDArray(NDArrayHandle(handles[i]))
             outputs.append(output)
         return outputs
diff --git a/python/mxnet/sparse_ndarray.py b/python/mxnet/sparse_ndarray.py
index 5af5fe8de539..1ac50c004da1 100644
--- a/python/mxnet/sparse_ndarray.py
+++ b/python/mxnet/sparse_ndarray.py
@@ -43,17 +43,17 @@
 # pylint: enable= no-member
 
 _STORAGE_TYPE_ID_TO_STR = {
-    0 : 'undefined',
-    1 : 'default',
-    2 : 'row_sparse',
-    3 : 'csr',
+    -1 : 'undefined',
+    0 : 'default',
+    1 : 'row_sparse',
+    2 : 'csr',
 }
 
 _STORAGE_TYPE_STR_TO_ID = {
-    'undefined' : 0,
-    'default' : 1,
-    'row_sparse' : 2,
-    'csr' : 3,
+    'undefined' : -1,
+    'default' : 0,
+    'row_sparse' : 1,
+    'csr' : 2,
 }
 
 _STORAGE_AUX_TYPES = {
@@ -244,7 +244,7 @@ def array(values, index_list, storage_type, shape, ctx=None, dtype=mx_real_t, au
     # TODO also specify auxtypes
     assert(storage_type == 'row_sparse')
     if not isinstance(values, NDArray):
-        values = ndarray.array(values)
+        values = ndarray.array(values)
     for i, index in enumerate(index_list):
         if not isinstance(index, NDArray):
             index_list[i] = ndarray.array(index)
@@ -298,8 +298,8 @@ def zeros(shape, storage_type, ctx=None, dtype=mx_real_t, aux_types=None):
 # pylint: enable= no-member, protected-access
 
 _STORAGE_TYPE_TO_ND_CLASS = {
-    1 : ndarray.NDArray,
-    2 : SparseNDArray,
-    3 : SparseNDArray
+    _STORAGE_TYPE_STR_TO_ID['default'] : ndarray.NDArray,
+    _STORAGE_TYPE_STR_TO_ID['row_sparse'] : SparseNDArray,
+    _STORAGE_TYPE_STR_TO_ID['csr'] : SparseNDArray,
 }
 _init_ndarray_module(_STORAGE_TYPE_TO_ND_CLASS, "mxnet")
diff --git a/python/mxnet/symbol.py b/python/mxnet/symbol.py
index efc832d1e514..ee8c9339e92b 100644
--- a/python/mxnet/symbol.py
+++ b/python/mxnet/symbol.py
@@ -508,7 +508,7 @@ def list_auxiliary_states(self):
     def infer_storage_type(self, *args, **kwargs):
         # TODO(haibin) refactor with dtype
         # FIXME update doc
-        """Infer the chunk type of outputs and arguments of given known types of arguments.
+        """Infer the storage type of outputs and arguments of given known types of arguments.
 
         User can either pass in the known types in positional way or keyword argument way.
         Tuple of Nones is returned if there is not enough information passed in.
@@ -548,8 +548,7 @@ def infer_storage_type(self, *args, **kwargs):
                         raise TypeError('Argument need to be one of '+str(_STORAGE_TYPE_STR_TO_ID))
                     sdata.append(_STORAGE_TYPE_STR_TO_ID[s])
                 else:
-                    #FIXME -1 or 0 for unknown / error chunk type?
-                    sdata.append(-1)
+                    sdata.append(_STORAGE_TYPE_STR_TO_ID['undefined'])
         else:
             keys = []
             for k, v in kwargs.items():
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 646ef05c2a1f..72a529e32ac4 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -131,7 +131,7 @@ int MXNDArrayCreate(const mx_uint *shape,
   API_END();
 }
 
-// TODO remove this API
+// TODO(haibin) remove this API
 int MXNDArrayCreateSparse(NDArrayHandle data,
                           mx_uint num_aux,
                           NDArrayHandle *aux_vec,
@@ -357,7 +357,7 @@ int MXNDArrayGetStorageType(NDArrayHandle handle,
   if (!arr->is_none()) {
     *out_storage_type = arr->storage_type();
   } else {
-    *out_storage_type = -1;
+    *out_storage_type = kUndefinedStorage;
   }
   API_END();
 }
diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc
index da2953c4a110..66e3d916e829 100644
--- a/src/c_api/c_api_symbolic.cc
+++ b/src/c_api/c_api_symbolic.cc
@@ -513,7 +513,8 @@ int MXSymbolInferStorageType(SymbolHandle sym,
   MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get();
   API_BEGIN();
   nnvm::Graph g = Symbol2Graph(*s);
-  nnvm::StorageTypeVector arg_storage_types(g.indexed_graph().input_nodes().size());
+  nnvm::StorageTypeVector arg_storage_types(g.indexed_graph().input_nodes().size(),
+                                            kUndefinedStorage);
   if (keys == nullptr && num_args != 0) {
     std::vector<uint32_t> read_only_args = mxnet::ReadOnlyArgIndices(g.indexed_graph());
     CHECK_LE(num_args, read_only_args.size());
diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc
index bd0f2f35f7e3..7c3ef00b2342 100644
--- a/src/executor/graph_executor.cc
+++ b/src/executor/graph_executor.cc
@@ -435,12 +435,12 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol,
   const auto& vstorage_type = g.GetAttr("storage_type");
 
   // dispatch on a per op basis
-  nnvm::StorageTypeVector dispatch_stypes(idx.num_nodes());
+  nnvm::StorageTypeVector dispatch_stypes(idx.num_nodes(), kUndefinedStorage);
   for (size_t nid = 0; nid < idx.num_nodes(); nid++) {
     const auto& inode = idx[nid];
     auto num_outputs = inode.source->num_outputs();
     auto num_inputs = inode.inputs.size();
-    nnvm::StorageTypeVector vs(num_inputs + num_outputs);
+    nnvm::StorageTypeVector vs(num_inputs + num_outputs, kUndefinedStorage);
     for (size_t i = 0; i < num_inputs; i++) {
       auto e = inode.inputs[i];
       vs[i] = vstorage_type[idx.entry_id(e)];
@@ -492,11 +492,11 @@ void GraphExecutor::InitDataEntryMemory(std::vector<NDArray>* shared_pool) {
   CHECK_EQ(idx.num_node_entries(), vstorage.size());
   CHECK_EQ(data_entry_.size(), vshape.size());
   std::vector<Context> data_context(idx.num_node_entries());
-  std::vector<NDArrayStorageType> data_storage_type(idx.num_node_entries());
+  std::vector<NDArrayStorageType> data_storage_type(idx.num_node_entries(), kUndefinedStorage);
   for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
     for (uint32_t i = 0; i < idx[nid].source->num_outputs(); ++i) {
       data_context[idx.entry_id(nid, i)] = vctx[nid];
-      CHECK_NE(vstorage_type[nid], -1);
+      CHECK_NE(vstorage_type[nid], kUndefinedStorage);
       data_storage_type[idx.entry_id(nid, i)] = (NDArrayStorageType) vstorage_type[nid];
     }
   }
@@ -863,7 +863,9 @@ void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) {
 #else
       bool profiling = false;
 #endif
-      // LOG(INFO) << "Running " << nid;
+#if EXECUTOR_DEBUG
+      LOG(INFO) << "Running node " << nid << " - " << seg_op.topo_end - 1;
+#endif
       Engine::Get()->Push(seg_op.opr, seg_op.ctx, 0, profiling);
       nid = seg_op.topo_end - 1;
       continue;
@@ -874,7 +876,9 @@ void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) {
     OpNode& opnode = op_nodes_[nid];
     if (op_nodes_[nid].skip_exec_node) continue;
     opnode.exec->op_ctx.is_train = is_train;
-    // LOG(INFO) << "Running " << nid;
+#if EXECUTOR_DEBUG
+    LOG(INFO) << "Running node " << nid;
+#endif
     if (opnode.exec->exec_type() == Operator::kCrossDeviceCopy) {
       CHECK_EQ(inode.inputs.size(), 1U);
       CHECK_EQ(opnode.exec->in_array.size(), 1U);
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index 55e1a84daeda..2b191b7759b6 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -237,7 +237,7 @@ void CopyFromTo(const NDArray &from, NDArray *to, int priority, bool alloc_outpu
     // skip to copy to itself
     return;
   }
-  CHECK(from.storage_type() == to->storage_type()) << "Copying with different storage type";
+  CHECK_EQ(from.storage_type(), to->storage_type()) << "Copying with different storage type";
   CHECK(from.shape() == to->shape())
       << "operands shape mismatch"
       << "from.shape = " << from.shape() << " to.shape=" << to->shape();
diff --git a/src/operator/elemwise_op_common.h b/src/operator/elemwise_op_common.h
index 8969bb768823..43acff015e51 100644
--- a/src/operator/elemwise_op_common.h
+++ b/src/operator/elemwise_op_common.h
@@ -114,7 +114,7 @@ inline bool ElemwiseStorageType(const nnvm::NodeAttrs& attrs,
                                 std::vector<int> *out_attrs) {
   CHECK_EQ(in_attrs->size(), static_cast<size_t>(n_in)) << " in operator " << attrs.name;
   CHECK_EQ(out_attrs->size(), static_cast<size_t>(n_out)) << " in operator " << attrs.name;
-  return ElemwiseStorageAttr(
+  return ElemwiseStorageAttr(
     attrs, in_attrs, out_attrs);
 }
 
@@ -123,7 +123,7 @@ inline bool IdentityAttrLikeRhsStorageType(const nnvm::NodeAttrs& attrs,
                                            std::vector<int> *out_attrs) {
   CHECK_EQ(in_attrs->size(), static_cast<size_t>(2)) << " in operator " << attrs.name;
   CHECK_EQ(out_attrs->size(), static_cast<size_t>(1)) << " in operator " << attrs.name;
-  return ElemwiseStorageAttr(
+  return ElemwiseStorageAttr(
     attrs, in_attrs, out_attrs);
 }
 
diff --git a/src/operator/operator_common.h b/src/operator/operator_common.h
index ca96eeec8eea..708bea0a342f 100644
--- a/src/operator/operator_common.h
+++ b/src/operator/operator_common.h
@@ -80,10 +80,6 @@ inline bool type_is_none(const int& x) {
   return x == -1;
 }
 
-/*! \brief check if storage type is none (-1) */
-inline bool storage_type_is_none(const int& x) {
-  return x == kUndefinedStorage;
-}
 /*!
  * \brief Assign x to y. Checks for compatiblity when y is not empty.
  * Allow missing dim in both x and y (as 0).
@@ -125,21 +121,6 @@ inline bool type_assign(int *y, const int& x) {
   return true;
 }
 
-/*!
- * \brief Assign x to y. Checks for compatiblity when y is not -1.
- * \param y target type.
- * \param x source type.
- * \return whether x and y are compatible.
- */
-inline bool storage_type_assign(int *y, const int& x) {
-  if (*y == kUndefinedStorage) {
-    *y = x;
-    return true;
-  } else if (*y != x && x != kUndefinedStorage) {
-    return false;
-  }
-  return true;
-}
 /*!
  * \brief macro assign shape to out if out is unknown otherwise check consistency
  *  Use macro so we can see the error file more clearly
@@ -306,7 +287,6 @@ inline void ParamParser(nnvm::NodeAttrs* attrs) {
   attrs->parsed = std::move(param);
 }
 
-// TODO move to op_util.h
 template
 void FComputeExFallback(const nnvm::NodeAttrs& attrs,
                         const OpContext& ctx,
diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc
index 7f60cb455c21..164c3de2b4ac 100644
--- a/src/operator/tensor/elemwise_binary_op_basic.cc
+++ b/src/operator/tensor/elemwise_binary_op_basic.cc
@@ -63,7 +63,6 @@ NNVM_REGISTER_OP(_backward_mul)
   [](const NodeAttrs& attrs){
     return std::vector<std::pair<int, int> >{{0, 1}};
   })
-//.set_attr("FInferStorageType", ElemwiseSameStorageType<1, 2>)
 .set_attr("FCompute", BinaryBackwardUseIn);
 
diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h
index be2d086702b5..6aecf17be6e7 100644
--- a/src/operator/tensor/elemwise_unary_op.h
+++ b/src/operator/tensor/elemwise_unary_op.h
@@ -206,7 +206,8 @@ void CastStorageComputeEx(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(inputs.size(), 1);
   CHECK_EQ(outputs.size(), 1);
   auto stype = inputs[0].storage_type();
-  if (stype == kRowSparseStorage) {
+  auto out_stype = outputs[0].storage_type();
+  if (stype == kRowSparseStorage && out_stype == kDefaultStorage) {
     CastStorageComputeRspDns(attrs, ctx, inputs, req, outputs);
   } else {
     LOG(FATAL) << "Not implemented";
diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py
index 0ce4d3aa0b7a..9be4b550d7d3 100644
--- a/tests/python/unittest/test_sparse_operator.py
+++ b/tests/python/unittest/test_sparse_operator.py
@@ -119,4 +119,3 @@ def test_cast_storage():
     test_elemwise_add_dense_sparse()
     test_elemwise_add_sparse_sparse()
    test_elemwise_add_multiple_stages()
-    test_cast_storage()