From 3317180ad9b8db970728bc4a18c2a7ae7e652000 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Fri, 19 Jun 2015 14:39:52 -0600 Subject: [PATCH 01/12] activation op --- src/operator/activation_op-inl.hpp | 50 ++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 src/operator/activation_op-inl.hpp diff --git a/src/operator/activation_op-inl.hpp b/src/operator/activation_op-inl.hpp new file mode 100644 index 000000000000..a3d5fe8df1ec --- /dev/null +++ b/src/operator/activation_op-inl.hpp @@ -0,0 +1,50 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file activation_op-inl.hpp + * \brief activation operator of mxnet + */ + +#ifndef ACTIVATION_OP_INL_HPP +#define ACTIVATION_OP_INL_HPP +#pragma once +#include + +namespace mxnet { +template +class ActivationOp : public Operator { +public: + virtual void InferShape(const std::vector &in_shape, + std::vector *out_shape) { + CHECK(in_shape.size() == 1) << "Activation Op: only 1 input is allowed"; + out_shape->resize(in_shape.size()); + out_shape->at(0) = in_shape[0]; + } + virtual void Forward(Option opt, + RunContext ctx, + const std::vector &in_data, + const std::vector &out_data) { + CHECK(out_data.size() == 1) << "Activation Op: only 1 output data is allowed"; + CHECK(in_data.size() == 1) << "Activation Op: only 1 input data is allowed"; + mshadow::Stream *stream = static_cast *>(ctx.stream); + mshadow::Tensor in = in_data[0].FlatTo2D(stream); + mshadow::Tensor out = out_data[0].FlatTo2D(stream); + out = mshadow::expr::F(in); + } + virtual void Backward(RunContext ctx, + const std::vector &grad_next, + const std::vector &in_data, + const std::vector &out_grad, + const std::vector req) { + CHECK(grad_next.size() == 1) << "Activation Op: only 1 input grad is allowed"; + CHECK(in_data.size() == 1) << "Activation Op: only 1 input data is allowed"; + CHECK(req.size() == 1) << "Activation Op: only 1 req is allowed"; + CHECK(req[0] == kWriteInplace) << "Activation Op: only support inplace mode"; + mshadow::Stream *stream = static_cast *>(ctx.stream); + mshadow::Tensor grad = grad_next[0].FlatTo2D(stream); + mshadow::Tensor data = in_data[0].FlatTo2D(stream); + data = mshadow::expr::F(data) * grad; + } +}; // class ActivationOp +} // namespace cxxnet + +#endif // ACTIVATION_OP_INL_HPP From e4e2178fc6e52087ad351a4488e18df77800c026 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Fri, 19 Jun 2015 14:58:56 -0600 Subject: [PATCH 02/12] chg --- src/operator/activation_op-inl.hpp | 39 +++++++++++++++++++----------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/src/operator/activation_op-inl.hpp b/src/operator/activation_op-inl.hpp index a3d5fe8df1ec..02ad1cd24951 100644 --- a/src/operator/activation_op-inl.hpp +++ b/src/operator/activation_op-inl.hpp @@ -8,24 +8,28 @@ #define ACTIVATION_OP_INL_HPP #pragma once #include +#include namespace mxnet { template class ActivationOp : public Operator { -public: + public: virtual void InferShape(const std::vector &in_shape, std::vector *out_shape) { CHECK(in_shape.size() == 1) << "Activation Op: only 1 input is allowed"; - out_shape->resize(in_shape.size()); - out_shape->at(0) = in_shape[0]; + TShape out = in_shape[0]; + out_shape->push_back(out); } virtual void Forward(Option opt, RunContext ctx, const std::vector &in_data, const std::vector &out_data) { - CHECK(out_data.size() == 1) << "Activation Op: only 1 output data is allowed"; - CHECK(in_data.size() == 1) << "Activation Op: only 1 input data is allowed"; - mshadow::Stream *stream = static_cast *>(ctx.stream); + 
CHECK(out_data.size() == 1) << \ + "Activation Op: only 1 output data is allowed"; + CHECK(in_data.size() == 1) << \ + "Activation Op: only 1 input data is allowed"; + mshadow::Stream *stream = \ + static_cast *>(ctx.stream); mshadow::Tensor in = in_data[0].FlatTo2D(stream); mshadow::Tensor out = out_data[0].FlatTo2D(stream); out = mshadow::expr::F(in); @@ -35,16 +39,23 @@ class ActivationOp : public Operator { const std::vector &in_data, const std::vector &out_grad, const std::vector req) { - CHECK(grad_next.size() == 1) << "Activation Op: only 1 input grad is allowed"; - CHECK(in_data.size() == 1) << "Activation Op: only 1 input data is allowed"; - CHECK(req.size() == 1) << "Activation Op: only 1 req is allowed"; - CHECK(req[0] == kWriteInplace) << "Activation Op: only support inplace mode"; - mshadow::Stream *stream = static_cast *>(ctx.stream); + CHECK(grad_next.size() == 1) << \ + "Activation Op: only 1 input grad is allowed"; + CHECK(in_data.size() == 1) << \ + "Activation Op: only 1 input data is allowed"; + CHECK(req.size() == 1) << \ + "Activation Op: only 1 req is allowed"; + CHECK(req[0] == kWriteInplace) << \ + "Activation Op: only support inplace mode"; + mshadow::Stream *stream = \ + static_cast *>(ctx.stream); mshadow::Tensor grad = grad_next[0].FlatTo2D(stream); mshadow::Tensor data = in_data[0].FlatTo2D(stream); data = mshadow::expr::F(data) * grad; } -}; // class ActivationOp -} // namespace cxxnet +}; // class ActivationOp +} // namespace mxnet + +#endif // ACTIVATION_OP_INL_HPP + -#endif // ACTIVATION_OP_INL_HPP From 3674aefb2cad80826489f1efa0b83118a8ab1fb3 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Fri, 19 Jun 2015 16:06:19 -0600 Subject: [PATCH 03/12] add op and extra opt --- include/mxnet/operator.h | 7 +- src/operator/activation_op-inl.hpp | 9 +-- src/operator/op.h | 109 +++++++++++++++++++++++++++++ 3 files changed, 119 insertions(+), 6 deletions(-) create mode 100644 src/operator/op.h diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h index fbe2e2c8f6af..da371dc9b326 100644 --- a/include/mxnet/operator.h +++ b/include/mxnet/operator.h @@ -26,6 +26,8 @@ class Operator { struct Option { /*! \brief whether it is training phase*/ int is_train; + /*! \brief whether propagate gradient to x in backprop */ + int prop_grad; }; /*! \briref gradient request type the request can have */ enum GradReqType { @@ -43,7 +45,7 @@ class Operator { * \param name parameter name * \param val string for configuration */ - virtual void SetParam(const char *name, const char *val) {} + virtual void SetParam(const char *name, const char *val) {} /*! 
* \brief inter the shape of output given the input data * \param in_shape the shape of input arguments of the operator @@ -73,7 +75,8 @@ class Operator { * \param req_types request types of the gradient saving operation * \sa GradReqType */ - virtual void Backward(RunContext ctx, + virtual void Backward(Option opt, + RunContext ctx, const std::vector &grad_next, const std::vector &in_data, const std::vector &out_grad, diff --git a/src/operator/activation_op-inl.hpp b/src/operator/activation_op-inl.hpp index 02ad1cd24951..7ac0ddc87a53 100644 --- a/src/operator/activation_op-inl.hpp +++ b/src/operator/activation_op-inl.hpp @@ -4,8 +4,8 @@ * \brief activation operator of mxnet */ -#ifndef ACTIVATION_OP_INL_HPP -#define ACTIVATION_OP_INL_HPP +#ifndef SRC_OPERATOR_ACTIVATION_OP_INL_HPP_ +#define SRC_OPERATOR_ACTIVATION_OP_INL_HPP_ #pragma once #include #include @@ -34,7 +34,8 @@ class ActivationOp : public Operator { mshadow::Tensor out = out_data[0].FlatTo2D(stream); out = mshadow::expr::F(in); } - virtual void Backward(RunContext ctx, + virtual void Backward(Option opt, + RunContext ctx, const std::vector &grad_next, const std::vector &in_data, const std::vector &out_grad, @@ -56,6 +57,6 @@ class ActivationOp : public Operator { }; // class ActivationOp } // namespace mxnet -#endif // ACTIVATION_OP_INL_HPP +#endif // SRC_OPERATOR_ACTIVATION_OP_INL_HPP_ diff --git a/src/operator/op.h b/src/operator/op.h new file mode 100644 index 000000000000..32b848846f70 --- /dev/null +++ b/src/operator/op.h @@ -0,0 +1,109 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file op.h + * \brief extra mshadow operation for mxnet + * \author Bing Xu + */ +#ifndef SRC_OPERATOR_OP_H_ +#define SRC_OPERATOR_OP_H_ +#pragma once + +#include + +namespace mxnet { +/*! \brief operations for ActivationLayer */ +namespace op { +struct identity { + MSHADOW_XINLINE static real_t Map(real_t a) { + return a; + } +}; +struct identity_grad { + MSHADOW_XINLINE static real_t Map(real_t a) { + return 1.0f; + } +}; + +/*! \brief sigmoid unit */ +struct sigmoid { + MSHADOW_XINLINE static real_t Map(real_t a) { + return 1.0f / (1.0f + expf(-a)); + } +}; +struct sigmoid_grad { + MSHADOW_XINLINE static real_t Map(real_t a) { + return a * (1.0f - a); + } +}; +/*! \brief Rectified Linear Operation */ +struct relu { + MSHADOW_XINLINE static real_t Map(real_t a) { + return std::max(a, 0.0f); + } +}; +struct relu_grad { + MSHADOW_XINLINE static real_t Map(real_t a) { + return a > 0.0f ? 1.0f : 0.0f; + } +}; + +/*! \brief Leaky ReLU Operation */ +struct xelu { + MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { + return a > 0 ? a : a / b; + } +}; + +struct xelu_grad { + MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { + return a > 0 ? 1 : 1.0f / b; + } +}; + +struct tanh { + MSHADOW_XINLINE static real_t Map(real_t a) { + return tanhf( a ); + } +}; + +struct tanh_grad { + MSHADOW_XINLINE static real_t Map(real_t a) { + return 1.0f - a * a; + } +}; + + +struct square { + MSHADOW_XINLINE static real_t Map(real_t a) { + return a * a; + } +}; + +/*! \brief used for generate Bernoulli mask */ +struct threshold { + MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { + return a < b ? 1.0f : 0.0f; + } +}; + +/*! 
\brief used for generate element of power */ +struct power { + MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { + return powf( a, b ); + } +}; + +/*!\ \brief used for generate element sqrt */ +struct square_root { + MSHADOW_XINLINE static real_t Map(real_t a) { + return sqrt(a); + } +}; + +} // namespace op +} // namespace mxnet + +#endif // SRC_OPERATOR_OP_H_ + + + From 062dba5c14ab2ebc34a861748172a6fe7a5a6a7d Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Sat, 20 Jun 2015 15:57:57 -0600 Subject: [PATCH 04/12] chg --- ...ivation_op-inl.hpp => activation_op-inl.h} | 0 src/operator/{op.h => mshadow_op.h} | 0 src/operator/operator.cc | 18 +++++ src/operator/operator.cu | 20 ++++++ src/operator/operator_helper.h | 47 ++++++++++++ src/operator/param.h | 71 +++++++++++++++++++ 6 files changed, 156 insertions(+) rename src/operator/{activation_op-inl.hpp => activation_op-inl.h} (100%) rename src/operator/{op.h => mshadow_op.h} (100%) create mode 100644 src/operator/operator.cc create mode 100644 src/operator/operator.cu create mode 100644 src/operator/operator_helper.h create mode 100644 src/operator/param.h diff --git a/src/operator/activation_op-inl.hpp b/src/operator/activation_op-inl.h similarity index 100% rename from src/operator/activation_op-inl.hpp rename to src/operator/activation_op-inl.h diff --git a/src/operator/op.h b/src/operator/mshadow_op.h similarity index 100% rename from src/operator/op.h rename to src/operator/mshadow_op.h diff --git a/src/operator/operator.cc b/src/operator/operator.cc new file mode 100644 index 000000000000..bee6238d8bce --- /dev/null +++ b/src/operator/operator.cc @@ -0,0 +1,18 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file operator.cc + * \brief + * \author Bing Xu +*/ +#include "operator_helper.h" + +namespace mxnet { +namespace op { + +Operator * CreateOperator(OpType type) { + return OperatorFactory(type); +} + +} // namespace op +} // namespace mxnet + diff --git a/src/operator/operator.cu b/src/operator/operator.cu new file mode 100644 index 000000000000..3cc1ada28e4b --- /dev/null +++ b/src/operator/operator.cu @@ -0,0 +1,20 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file operator.cu + * \brief + * \author Bing Xu +*/ + + +#include "operator_helper.h" + +namespace mxnet { +namespace op { + +Operator * CreateOperator(OpType type) { + return OperatorFactory(type); +} + +} // namespace op +} // namespace mxnet + diff --git a/src/operator/operator_helper.h b/src/operator/operator_helper.h new file mode 100644 index 000000000000..b2e2ec0b5050 --- /dev/null +++ b/src/operator/operator_helper.h @@ -0,0 +1,47 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file assign_helper.h + * \brief + * \author Bing Xu +*/ +#ifndef MXNET_OPERATOR_HELPER_H_ +#define MXNET_OPERATOR_HELPER_H_ +#include "activation_op-inl.h" +#include "mshadow_op.h" + +namespace mxnet { +namespace op { + +enum OpType { + kReLU = 0, +}; + + +template +inline void Assign(const Exp &exp, + const mshadow::Tensor &out, + const Operator::GradReqType &req) { + switch (req) { + case Operator::kNullOp: + break; + case Operator::kWriteTo: + case Operator::kWriteInplace: + break; + case Operator::kAddTo: + break; + } +} + +template +Operator *OperatorFactory(OpType type) { + switch (type) { + case kReLU: + return new ActivationOp(); + + }; + return NULL; +} + +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_HELPER_H_ diff --git a/src/operator/param.h b/src/operator/param.h new file mode 100644 index 000000000000..336c833165f8 --- /dev/null +++ b/src/operator/param.h @@ -0,0 +1,71 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file param.h + * \brief operator params + * \author Bing Xu +*/ +#ifndef MXNET_OPERATOR_PARAM_H_ +#define MXNET_OPERATOR_PARAM_H_ +#pragma once + +namespace mxnet { +namespace op { +struct Param { + /*! \brief number of hidden layers */ + int num_hidden; + /*! \brief number of output channel */ + int num_channel; + /*! \brief number of parallel group */ + int num_group; + /*! \brief kernel height */ + int kernel_y; + /*! \brief kernel width */ + int kernel_x; + /*! \brief stride in y dimension*/ + int stride_y; + /*! \brief stride in x dimension */ + int stride_x; + /*! \brief padding in y dimension */ + int pad_y; + /*! \brief padding in x dimension */ + int pad_x; + /*! \brief whether not include bias term */ + int no_bias; + /*! \brief maximum temp_col_size allowed in each layer */ + int temp_col_max; + /*! \brief number of input channels */ + int num_input_channel; + /*! \brief number of input hidden nodes, used by fullc */ + int num_input_node; + /*! 
\brief reserved fields, for future compatibility */ + int reserved[64]; + inline void SetParam(const char *name, const char* val) { + if (!strcmp(name, "nhidden")) num_hidden = atoi(val); + if (!strcmp(name, "nchannel")) num_channel = atoi(val); + if (!strcmp(name, "ngroup")) num_group = atoi(val); + if (!strcmp(name, "kernel_size")) { + kernel_y = kernel_x = atoi(val); + } + if (!strcmp(name, "kernel_height")) kernel_height = atoi(val); + if (!strcmp(name, "kernel_width")) kernel_width = atoi(val); + if (!strcmp(name, "stride")) { + stride_y = stride_x = atoi(val); + } + if (!strcmp(name, "stride_y")) stride_y = atoi(val); + if (!strcmp(name, "stride_x")) stride_x = atoi(val); + + if (!strcmp(name, "pad")) { + pad_y = pad_x = atoi(val); + } + if (!strcmp(name, "pad_y")) pad_y = atoi(val); + if (!strcmp(name, "pad_x")) pad_x = atoi(val); + if (!strcmp(name, "no_bias")) no_bias = atoi(val); + if (!strcmp(name, "temp_col_max")) temp_col_max = atoi(val) << 18; + } +}; // struct Param +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_PARAM_H_ + + From 12b1c97cf498926208e79ae61da563b5b5516e25 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Sat, 20 Jun 2015 16:00:08 -0600 Subject: [PATCH 05/12] compile --- Makefile | 11 +++++++---- include/mxnet/narray.h | 17 +++++++++-------- include/mxnet/operator.h | 2 +- src/dag_engine/simple_engine.cc | 7 +++---- src/narray/narray_op-inl.h | 2 +- src/operator/mshadow_op.h | 8 ++++---- 6 files changed, 25 insertions(+), 22 deletions(-) diff --git a/Makefile b/Makefile index ff8cd7fd4d02..418533577dec 100644 --- a/Makefile +++ b/Makefile @@ -40,6 +40,7 @@ endif ifneq ($(ADD_CFLAGS), NONE) CFLAGS += $(ADD_CFLAGS) + CFLAGS += -DDMLC_USE_CXX11=1 endif ifneq ($(ADD_LDFLAGS), NONE) @@ -47,8 +48,8 @@ ifneq ($(ADD_LDFLAGS), NONE) endif OBJ = storage.o narray_op_cpu.o -OBJCXX11 = engine.o narray.o -CUOBJ = narray_op_gpu.o +OBJCXX11 = engine.o narray.o operator.o +CUOBJ = narray_op_gpu.o operator_gpu.o LIB_DEP = $(DMLC_CORE)/libdmlc.a @@ -64,6 +65,8 @@ engine.o: src/dag_engine/simple_engine.cc narray.o: src/narray/narray.cc narray_op_cpu.o: src/narray/narray_op_cpu.cc src/narray/narray_op-inl.h narray_op_gpu.o: src/narray/narray_op_gpu.cu src/narray/narray_op-inl.h +operator.o: src/operator/operator.cc +operator_gpu.o: src/operator/operator.cu $(BIN) : $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS) @@ -72,13 +75,13 @@ $(OBJ) : $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) $(OBJCXX11) : - $(CXX) -std=c++0x -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) + $(CXX) -std=c++11 -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) $(SLIB) : $(CXX) $(CFLAGS) -shared -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS) $(CUOBJ) : - $(NVCC) -c -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" $(filter %.cu, $^) + $(NVCC) --std=c++11 -c -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" $(filter %.cu, $^) $(CUBIN) : $(NVCC) -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" -Xlinker "$(LDFLAGS)" $(filter %.cu %.cpp %.o, $^) diff --git a/include/mxnet/narray.h b/include/mxnet/narray.h index 8e3a398fd9ad..287d9761a736 100644 --- a/include/mxnet/narray.h +++ b/include/mxnet/narray.h @@ -7,6 +7,7 @@ #define MXNET_NARRAY_H_ #include #include +#include #include "./base.h" #include "./storage.h" #include "./tensor_blob.h" @@ -25,7 +26,7 @@ class NArray { /*! \brief default cosntructor */ NArray() {} /*! 
- * \brief constructing a new dynamic NArray + * \brief constructing a new dynamic NArray * \param shape the shape of array * \param ctx context of NArray */ @@ -34,16 +35,16 @@ class NArray { } /*! * \brief constructing a static NArray that shares data with TBlob - * Use with caution: allocate ONLY ONE NArray for each TBlob, + * Use with caution: allocate ONLY ONE NArray for each TBlob, * make sure the memory region is available through out the life of NArray * \param data the memory content of static data * \param dev_id the device id this tensor sits at - */ + */ NArray(const TBlob &data, int dev_id) : ptr_(new Chunk(data, dev_id)) { } /*! - * \return the shape of current NArray + * \return the shape of current NArray */ inline const TShape &shape() const { return ptr_->data.shape_; @@ -57,7 +58,7 @@ class NArray { /*! \return whether this narray is not initialized */ inline bool is_empty() const { return ptr_.get() == nullptr; - } + } private: /*! \brief the real data chunk that backs NArray */ @@ -79,7 +80,7 @@ class NArray { Chunk() : static_data(true), delay_alloc(false) { var = DAGEngine::Get()->NewVar(); } - /*! \brief construct from static data */ + /*! \brief construct from static data */ Chunk(const TBlob &data, int dev_id) : data(data), static_data(true), @@ -118,14 +119,14 @@ class NArray { /*! \brief internal data of NArray */ std::shared_ptr ptr_; /*! - * \brief constructing a new dynamic NArray + * \brief constructing a new dynamic NArray * \param shape the shape of array * \param ctx context of NArray * \param delay_alloc whether delay the allocation */ NArray(const TShape &shape, Context ctx, bool delay_alloc) : ptr_(new Chunk(shape, ctx, delay_alloc)) { - } + } // add friend to helper functions template friend NArray BinaryEWise(const NArray &lhs, const NArray &rhs); diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h index da371dc9b326..3107ce89f5c2 100644 --- a/include/mxnet/operator.h +++ b/include/mxnet/operator.h @@ -80,7 +80,7 @@ class Operator { const std::vector &grad_next, const std::vector &in_data, const std::vector &out_grad, - const std::vector req); + const std::vector &req); }; } // namespace mxnet #endif // MXNET_OPERATOR_H_ diff --git a/src/dag_engine/simple_engine.cc b/src/dag_engine/simple_engine.cc index 2e35b2ff57fc..9ea42e979735 100644 --- a/src/dag_engine/simple_engine.cc +++ b/src/dag_engine/simple_engine.cc @@ -1,19 +1,18 @@ #include #include - namespace mxnet { class SimpleEngine : public DAGEngine { public: virtual void Push(AsyncOp exec_fun, Context exec_ctx, - const std::vector &use_vars, + const std::vector &use_vars, const std::vector &mutate_vars) { // cannot schedule async using naive way because deps are not captured LOG(FATAL) << "cannot schedule async operations"; } virtual void Push(Op exec_fun, Context exec_ctx, - const std::vector &use_vars, + const std::vector &use_vars, const std::vector &mutate_vars) { exec_fun(RunContext()); } @@ -25,7 +24,7 @@ class SimpleEngine : public DAGEngine { // that have the info about the variable // use ptr directly instead of ID because this avoids an indirect mapping return NULL; - } + } }; // implements the singleton factory DAGEngine* DAGEngine::Get() { diff --git a/src/narray/narray_op-inl.h b/src/narray/narray_op-inl.h index 918149ff298b..9891d9a993d0 100644 --- a/src/narray/narray_op-inl.h +++ b/src/narray/narray_op-inl.h @@ -19,7 +19,7 @@ namespace mxnet { namespace narray { // true implementation template -inline void Eval_(const TBlob &lhs, const TBlob &rhs, TBlob ret, 
RunContext ctx) { +inline void Eval_(const TBlob &lhs, const TBlob &rhs, TBlob ret, RunContext ctx) { using namespace mshadow::expr; mshadow::Stream *s = static_cast*>(ctx.stream); ret.FlatTo2D(s) diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index 32b848846f70..ac035b395b21 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -4,8 +4,8 @@ * \brief extra mshadow operation for mxnet * \author Bing Xu */ -#ifndef SRC_OPERATOR_OP_H_ -#define SRC_OPERATOR_OP_H_ +#ifndef MXNET_MSHADOW_OPERATOR_OP_H_ +#define MXNET_MSHADOW_OPERATOR_OP_H_ #pragma once #include @@ -38,7 +38,7 @@ struct sigmoid_grad { /*! \brief Rectified Linear Operation */ struct relu { MSHADOW_XINLINE static real_t Map(real_t a) { - return std::max(a, 0.0f); + return a > 0.0f ? a : 0.0f; } }; struct relu_grad { @@ -103,7 +103,7 @@ struct square_root { } // namespace op } // namespace mxnet -#endif // SRC_OPERATOR_OP_H_ +#endif // MXNET_MSHADOW_OPERATOR_OP_H_ From 043bfff06fab6e8c50b5a9511cfcbae137d254df Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Sat, 20 Jun 2015 16:11:00 -0600 Subject: [PATCH 06/12] act --- src/operator/activation_op-inl.h | 40 ++++++++++++++------------------ 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/src/operator/activation_op-inl.h b/src/operator/activation_op-inl.h index 7ac0ddc87a53..a96ce813ea9b 100644 --- a/src/operator/activation_op-inl.h +++ b/src/operator/activation_op-inl.h @@ -4,13 +4,13 @@ * \brief activation operator of mxnet */ -#ifndef SRC_OPERATOR_ACTIVATION_OP_INL_HPP_ -#define SRC_OPERATOR_ACTIVATION_OP_INL_HPP_ -#pragma once +#ifndef MXNET_ACTIVATION_OP_INL_HPP_ +#define MXNET_ACTIVATION_OP_INL_HPP_ #include #include namespace mxnet { +namespace op { template class ActivationOp : public Operator { public: @@ -24,14 +24,12 @@ class ActivationOp : public Operator { RunContext ctx, const std::vector &in_data, const std::vector &out_data) { - CHECK(out_data.size() == 1) << \ - "Activation Op: only 1 output data is allowed"; - CHECK(in_data.size() == 1) << \ - "Activation Op: only 1 input data is allowed"; + CHECK(out_data.size() == 1); + CHECK(in_data.size() == 1); mshadow::Stream *stream = \ static_cast *>(ctx.stream); - mshadow::Tensor in = in_data[0].FlatTo2D(stream); - mshadow::Tensor out = out_data[0].FlatTo2D(stream); + mshadow::Tensor in = in_data[0].FlatTo2D(stream); + mshadow::Tensor out = out_data[0].FlatTo2D(stream); out = mshadow::expr::F(in); } virtual void Backward(Option opt, @@ -39,24 +37,22 @@ class ActivationOp : public Operator { const std::vector &grad_next, const std::vector &in_data, const std::vector &out_grad, - const std::vector req) { - CHECK(grad_next.size() == 1) << \ - "Activation Op: only 1 input grad is allowed"; - CHECK(in_data.size() == 1) << \ - "Activation Op: only 1 input data is allowed"; - CHECK(req.size() == 1) << \ - "Activation Op: only 1 req is allowed"; - CHECK(req[0] == kWriteInplace) << \ - "Activation Op: only support inplace mode"; + const std::vector &req) { + CHECK(grad_next.size() == 1); + CHECK(in_data.size() == 1); + CHECK(out_grad.size() == 1); + CHECK(req.size() == 1); + CHECK(req[0] == kWriteInplace); mshadow::Stream *stream = \ static_cast *>(ctx.stream); - mshadow::Tensor grad = grad_next[0].FlatTo2D(stream); - mshadow::Tensor data = in_data[0].FlatTo2D(stream); - data = mshadow::expr::F(data) * grad; + mshadow::Tensor grad = grad_next[0].FlatTo2D(stream); + mshadow::Tensor data = in_data[0].FlatTo2D(stream); + Assign(mshadow::expr::F(data) * grad, data, req[0]); } }; // class 
ActivationOp +} // namespace op } // namespace mxnet -#endif // SRC_OPERATOR_ACTIVATION_OP_INL_HPP_ +#endif // MXNET_ACTIVATION_OP_INL_HPP_ From e595baf3e02354ede6039728c9bda20730e9147d Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Sat, 20 Jun 2015 23:30:43 -0600 Subject: [PATCH 07/12] add fullc op --- include/mxnet/operator.h | 25 +++++-- src/operator/activation_op-inl.h | 15 ++-- src/operator/assign_helper.h | 29 ++++++++ src/operator/fully_connect_op-inl.h | 111 ++++++++++++++++++++++++++++ src/operator/operator.cc | 6 +- src/operator/operator_helper.h | 20 +---- src/operator/param.h | 6 +- 7 files changed, 181 insertions(+), 31 deletions(-) create mode 100644 src/operator/assign_helper.h create mode 100644 src/operator/fully_connect_op-inl.h diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h index 3107ce89f5c2..b5ddf919caac 100644 --- a/include/mxnet/operator.h +++ b/include/mxnet/operator.h @@ -26,8 +26,6 @@ class Operator { struct Option { /*! \brief whether it is training phase*/ int is_train; - /*! \brief whether propagate gradient to x in backprop */ - int prop_grad; }; /*! \briref gradient request type the request can have */ enum GradReqType { @@ -40,6 +38,19 @@ class Operator { /*! \brief add to the provided space */ kAddTo = 3 }; + /*! \brief argument request type the request can have */ + enum ArgReqType { + /*! \brief weight arg*/ + kWeightArg = 0, + /*! \brief bias arg*/ + kBiasArg = 1, + /*! \brief data args */ + kDataArg = 2, + }; + /*! \brief get request input arguments + * \param args empty vector of reqest argument type + */ + virtual void DescribeArgs(std::vector *args) = 0; /*! * \brief set param for the operator from string * \param name parameter name @@ -49,9 +60,14 @@ class Operator { /*! * \brief inter the shape of output given the input data * \param in_shape the shape of input arguments of the operator + * For unknown shape, left TShape size to 0, + * InferShape will try to fix a correct shape; + * For known shape, InferShape will check shape + * * \param out_shape the shape of outputs of the operator + * InferShape will modify the vector to fill output TShape */ - virtual void InferShape(const std::vector &in_shape, + virtual void InferShape(std::vector &in_shape, std::vector *out_shape) = 0; /*! 
* \brief perform a forward operation of operator, save the output to TBlob @@ -75,8 +91,7 @@ class Operator { * \param req_types request types of the gradient saving operation * \sa GradReqType */ - virtual void Backward(Option opt, - RunContext ctx, + virtual void Backward(RunContext ctx, const std::vector &grad_next, const std::vector &in_data, const std::vector &out_grad, diff --git a/src/operator/activation_op-inl.h b/src/operator/activation_op-inl.h index a96ce813ea9b..b46319e44c69 100644 --- a/src/operator/activation_op-inl.h +++ b/src/operator/activation_op-inl.h @@ -8,15 +8,21 @@ #define MXNET_ACTIVATION_OP_INL_HPP_ #include #include +#include "./assign_helper.h" namespace mxnet { namespace op { template class ActivationOp : public Operator { public: - virtual void InferShape(const std::vector &in_shape, + virtual void DescribeArgs(std::vector *args) { + args->clear(); + args->push_back(kDataArg); + } + virtual void InferShape(std::vector &in_shape, std::vector *out_shape) { - CHECK(in_shape.size() == 1) << "Activation Op: only 1 input is allowed"; + CHECK(in_shape.size() == 1) << "Only 1 input is allowed"; + CHECK(in_shape[0].Size() > 0) << "Must set input data shape"; TShape out = in_shape[0]; out_shape->push_back(out); } @@ -32,8 +38,7 @@ class ActivationOp : public Operator { mshadow::Tensor out = out_data[0].FlatTo2D(stream); out = mshadow::expr::F(in); } - virtual void Backward(Option opt, - RunContext ctx, + virtual void Backward(RunContext ctx, const std::vector &grad_next, const std::vector &in_data, const std::vector &out_grad, @@ -47,7 +52,7 @@ class ActivationOp : public Operator { static_cast *>(ctx.stream); mshadow::Tensor grad = grad_next[0].FlatTo2D(stream); mshadow::Tensor data = in_data[0].FlatTo2D(stream); - Assign(mshadow::expr::F(data) * grad, data, req[0]); + Assign(data, mshadow::expr::F(data) * grad, req[0]); } }; // class ActivationOp } // namespace op diff --git a/src/operator/assign_helper.h b/src/operator/assign_helper.h new file mode 100644 index 000000000000..8926add161d0 --- /dev/null +++ b/src/operator/assign_helper.h @@ -0,0 +1,29 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file assign_helper.h + * \brief assign gradient + * \author Bing Xu +*/ +#ifndef MXNET_OPERATOR_ASSIGN_HELPER_H_ +#define MXNET_OPERATOR_ASSIGN_HELPER_H_ +namespace mxnet { +namespace op { +template +inline void Assign(mshadow::Tensor &out, + const Exp &exp, + const Operator::GradReqType &req) { + switch (req) { + case Operator::kNullOp: + break; + case Operator::kWriteTo: + case Operator::kWriteInplace: + out = exp; + break; + case Operator::kAddTo: + out += exp; + break; + } +} +} //namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_ASSIGN_HELPER_H_ diff --git a/src/operator/fully_connect_op-inl.h b/src/operator/fully_connect_op-inl.h new file mode 100644 index 000000000000..e6e39010ec1e --- /dev/null +++ b/src/operator/fully_connect_op-inl.h @@ -0,0 +1,111 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file fully_connect_op-inl.hpp + * \brief fully connect operator + * \author Bing Xu +*/ + +#ifndef MXNET_FULLY_CONNECT_OP_INL_HPP_ +#define MXNET_FULLY_CONNECT_OP_INL_HPP_ + +#include +#include +#include "./assign_helper.h" +#include "./param.h" + +namespace mxnet { +namespace op { +template +class FullyConnectOp : public Operator { + public: + virtual void DescribeArgs(std::vector *args) { + args->clear(); + args->push_back(kDataArg); + args->push_back(kWeightArg); + args->push_back(kBiasArg); + } + virtual void SetParam(const char *name, const char *val) { + param_.SetParam(name, val); + } + virtual void InferShape(std::vector &in_shape, + std::vector *out_shape) { + CHECK(in_shape.size() == 3) << "Input:[data, weight, bias]"; + CHECK(param_.num_input_node > 0); + CHECK(param_.num_hidden > 0); + TShape &dshape = in_shape[0]; + TShape &wshape = in_shape[1]; + TShape &bshape = in_shape[2]; + if (wshape.Size() == 0) { + mshadow::Shape<2> ws = mshadow::Shape2(param_.num_hidden, + param_.num_input_node); + wshape = ws; + } else { + CHECK(wshape[0] == param_.num_hidden); + CHECK(wshape[1] == param_.num_input_node); + } + if (bshape.Size() == 0) { + mshadow::Shape<1> bs = mshadow::Shape1(param_.num_hidden); + bshape = bs; + } else { + CHECK(bshape[0] == param_.num_hidden); + } + CHECK(dshape.ndim() == 4 && dshape[3] == param_.num_input_node) << \ + "Input data should be 4D in batch-1-1-hidden"; + out_shape->clear(); + out_shape->push_back(dshape); + out_shape->at(0)[3] = param_.num_hidden; + } + virtual void Forward(Option opt, + RunContext ctx, + const std::vector &in_data, + const std::vector &out_data) { + CHECK(in_data.size() == 3) << "Input:[data, weight, bias]"; + CHECK(out_data.size() == 1); + mshadow::Stream *stream = \ + static_cast *>(ctx.stream); + mshadow::Tensor wmat = in_data[0].get(stream); + mshadow::Tensor bias = in_data[1].get(stream); + mshadow::Tensor data = in_data[2].FlatTo2D(stream); + mshadow::Tensor out = out_data[0].FlatTo2D(stream); + out = mshadow::expr::dot(data, wmat.T()); + if (!param_.no_bias) { + out += mshadow::expr::repmat(bias, data.size(0)); + } + } + virtual void Backward(RunContext ctx, + const std::vector &grad_next, + const std::vector &in_data, + const std::vector &out_grad, + const std::vector &req) { + CHECK(grad_next.size() == 1); + CHECK(in_data.size() == 3) << "Input: [data, weight, bias]"; + CHECK(out_grad.size() == 3) << "Output: [gdata, gweight, gbias]"; + CHECK(req.size() == 3); + mshadow::Stream *stream = \ + static_cast *>(ctx.stream); + mshadow::Tensor data = in_data[0].FlatTo2D(stream); + mshadow::Tensor wmat = in_data[1].get(stream); + mshadow::Tensor grad = grad_next[0].FlatTo2D(stream); + mshadow::Tensor gdata = out_grad[0].FlatTo2D(stream); + mshadow::Tensor gwmat = out_grad[1].get(stream); + mshadow::Tensor gbias = out_grad[2].get(stream); + // backprop + CHECK(req[0] != kWriteInplace); + Assign(gwmat, mshadow::expr::dot(grad.T(), data), req[0]); + if (!param_.no_bias) { + Assign(gbias, mshadow::expr::sum_rows(grad), req[1]); + } + if (req[0] != kNullOp) { + CHECK(req[0] != kWriteInplace); + Assign(gdata, mshadow::expr::dot(grad, wmat), req[2]); + } + } + private: + Param param_; +}; // class FullyConnectOp +} // namespace op +} // namespace mxnet + +#endif // MXNET_FULLY_CONNECT_OP_INL_HPP + + diff --git a/src/operator/operator.cc b/src/operator/operator.cc index bee6238d8bce..1cab17a62a67 100644 --- a/src/operator/operator.cc +++ b/src/operator/operator.cc @@ -9,10 +9,10 @@ namespace mxnet 
{ namespace op { -Operator * CreateOperator(OpType type) { +Operator *CreateOperator(OpType type) { return OperatorFactory(type); } -} // namespace op -} // namespace mxnet +} // namespace op +} // namespace mxnet diff --git a/src/operator/operator_helper.h b/src/operator/operator_helper.h index b2e2ec0b5050..9a673b12cedb 100644 --- a/src/operator/operator_helper.h +++ b/src/operator/operator_helper.h @@ -7,6 +7,7 @@ #ifndef MXNET_OPERATOR_HELPER_H_ #define MXNET_OPERATOR_HELPER_H_ #include "activation_op-inl.h" +#include "fully_connect_op-inl.h" #include "mshadow_op.h" namespace mxnet { @@ -14,29 +15,16 @@ namespace op { enum OpType { kReLU = 0, + kFullc = 1, }; - -template -inline void Assign(const Exp &exp, - const mshadow::Tensor &out, - const Operator::GradReqType &req) { - switch (req) { - case Operator::kNullOp: - break; - case Operator::kWriteTo: - case Operator::kWriteInplace: - break; - case Operator::kAddTo: - break; - } -} - template Operator *OperatorFactory(OpType type) { switch (type) { case kReLU: return new ActivationOp(); + case kFullc: + return new FullyConnectOp(); }; return NULL; diff --git a/src/operator/param.h b/src/operator/param.h index 336c833165f8..c17555898c0a 100644 --- a/src/operator/param.h +++ b/src/operator/param.h @@ -41,13 +41,15 @@ struct Param { int reserved[64]; inline void SetParam(const char *name, const char* val) { if (!strcmp(name, "nhidden")) num_hidden = atoi(val); + if (!strcmp(name, "num_input_node")) num_input_node = atoi(val); + if (!strcmp(name, "num_input_channel")) num_input_channel = atoi(val); if (!strcmp(name, "nchannel")) num_channel = atoi(val); if (!strcmp(name, "ngroup")) num_group = atoi(val); if (!strcmp(name, "kernel_size")) { kernel_y = kernel_x = atoi(val); } - if (!strcmp(name, "kernel_height")) kernel_height = atoi(val); - if (!strcmp(name, "kernel_width")) kernel_width = atoi(val); + if (!strcmp(name, "kernel_height")) kernel_y = atoi(val); + if (!strcmp(name, "kernel_width")) kernel_x = atoi(val); if (!strcmp(name, "stride")) { stride_y = stride_x = atoi(val); } From 3a76b8d720165db3d6b8a91b4f33cfe1d27e8111 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Sat, 20 Jun 2015 23:33:17 -0600 Subject: [PATCH 08/12] minor --- src/operator/activation_op-inl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/operator/activation_op-inl.h b/src/operator/activation_op-inl.h index b46319e44c69..e49ad2097b12 100644 --- a/src/operator/activation_op-inl.h +++ b/src/operator/activation_op-inl.h @@ -52,7 +52,8 @@ class ActivationOp : public Operator { static_cast *>(ctx.stream); mshadow::Tensor grad = grad_next[0].FlatTo2D(stream); mshadow::Tensor data = in_data[0].FlatTo2D(stream); - Assign(data, mshadow::expr::F(data) * grad, req[0]); + mshadow::Tensor out = in_data[0].FlatTo2D(stream); + Assign(out, mshadow::expr::F(data) * grad, req[0]); } }; // class ActivationOp } // namespace op From be5bf08d563ffbebe3f8b8fd6d2b1f57d97316cc Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Sun, 21 Jun 2015 11:00:56 -0600 Subject: [PATCH 09/12] minor --- Makefile | 9 ++- include/mxnet/operator.h | 53 +++++++----- src/operator/activation_op-inl.h | 32 ++++---- src/operator/assign_helper.h | 29 ------- src/operator/fully_connect_op-inl.h | 121 ++++++++++++++-------------- src/operator/mshadow_op.h | 7 +- src/operator/operator-inl.h | 35 ++++++++ src/operator/operator.cc | 43 +++++++--- src/operator/operator.cu | 20 ----- src/operator/operator_common.h | 67 +++++++++++++++ src/operator/operator_cpu.cc | 18 +++++ 
src/operator/operator_gpu.cu | 21 +++++ src/operator/operator_helper.h | 35 -------- src/operator/param.h | 2 +- 14 files changed, 288 insertions(+), 204 deletions(-) delete mode 100644 src/operator/assign_helper.h create mode 100644 src/operator/operator-inl.h delete mode 100644 src/operator/operator.cu create mode 100644 src/operator/operator_common.h create mode 100644 src/operator/operator_cpu.cc create mode 100644 src/operator/operator_gpu.cu delete mode 100644 src/operator/operator_helper.h diff --git a/Makefile b/Makefile index 418533577dec..b159e0bc9429 100644 --- a/Makefile +++ b/Makefile @@ -47,8 +47,8 @@ ifneq ($(ADD_LDFLAGS), NONE) LDFLAGS += $(ADD_LDFLAGS) endif -OBJ = storage.o narray_op_cpu.o -OBJCXX11 = engine.o narray.o operator.o +OBJ = storage.o narray_op_cpu.o operator.o operator_cpu.o +OBJCXX11 = engine.o narray.o CUOBJ = narray_op_gpu.o operator_gpu.o LIB_DEP = $(DMLC_CORE)/libdmlc.a @@ -66,7 +66,8 @@ narray.o: src/narray/narray.cc narray_op_cpu.o: src/narray/narray_op_cpu.cc src/narray/narray_op-inl.h narray_op_gpu.o: src/narray/narray_op_gpu.cu src/narray/narray_op-inl.h operator.o: src/operator/operator.cc -operator_gpu.o: src/operator/operator.cu +operator_cpu.o: src/operator/operator_cpu.cc +operator_gpu.o: src/operator/operator_gpu.cu $(BIN) : $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS) @@ -81,7 +82,7 @@ $(SLIB) : $(CXX) $(CFLAGS) -shared -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS) $(CUOBJ) : - $(NVCC) --std=c++11 -c -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" $(filter %.cu, $^) + $(NVCC) -c -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" $(filter %.cu, $^) $(CUBIN) : $(NVCC) -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" -Xlinker "$(LDFLAGS)" $(filter %.cu %.cpp %.o, $^) diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h index b5ddf919caac..c0179990058d 100644 --- a/include/mxnet/operator.h +++ b/include/mxnet/operator.h @@ -1,13 +1,13 @@ /*! * Copyright (c) 2015 by Contributors * \file operator.h - * \brief operator interface of mxnet + * \brief static operator interface of mxnet */ #ifndef MXNET_OPERATOR_H_ #define MXNET_OPERATOR_H_ +// this file will be seen by cuda, no c++11 for now #include #include "./base.h" -#include "./narray.h" #include "./tensor_blob.h" namespace mxnet { @@ -38,19 +38,24 @@ class Operator { /*! \brief add to the provided space */ kAddTo = 3 }; - /*! \brief argument request type the request can have */ - enum ArgReqType { - /*! \brief weight arg*/ - kWeightArg = 0, - /*! \brief bias arg*/ - kBiasArg = 1, - /*! \brief data args */ - kDataArg = 2, + /*! \brief input argument type of the operator have */ + enum ArgType { + /*! \brief data argument */ + kDataArg = 0, + /*! \brief weight argument */ + kWeightArg = 1, + /*! \brief bias argument */ + kBiasArg = 2 }; - /*! \brief get request input arguments - * \param args empty vector of reqest argument type + /*! + * \brief get types of input argument of this oeprator + * \return a vector corresponding to type of each argument + * this order is same as the order of inputs in Forward, InferShape and Backward */ - virtual void DescribeArgs(std::vector *args) = 0; + virtual std::vector DescribeArgs() const { + // default most of layers only have one data argument + return std::vector(1, kDataArg); + } /*! * \brief set param for the operator from string * \param name parameter name @@ -58,16 +63,19 @@ class Operator { */ virtual void SetParam(const char *name, const char *val) {} /*! 
- * \brief inter the shape of output given the input data + * \brief inter the shapes of outputs and unknown input arguments * \param in_shape the shape of input arguments of the operator - * For unknown shape, left TShape size to 0, - * InferShape will try to fix a correct shape; - * For known shape, InferShape will check shape + * this should be of same length as the vector returned by DescribeArgs + * in_shape allows unknown elements, which are checked by shape.ndim() == 0. + * For unknown shapes, InferShape will try to fill in the correct Shape in in_shape + * For known shapes, InferShape will check shape consistency + * + * common practice: set the shape of data input, and usually weight's shape can be infered * * \param out_shape the shape of outputs of the operator - * InferShape will modify the vector to fill output TShape + * InferShape will modify the vector to fill output TShape */ - virtual void InferShape(std::vector &in_shape, + virtual void InferShape(std::vector *in_shape, std::vector *out_shape) = 0; /*! * \brief perform a forward operation of operator, save the output to TBlob @@ -96,6 +104,13 @@ class Operator { const std::vector &in_data, const std::vector &out_grad, const std::vector &req); + + /*! + * \brief factory unction, create a new operator + * \param type the type of operator + * \param ctx the context device type of operator + */ + static Operator *Create(const char *type, Context ctx); }; } // namespace mxnet #endif // MXNET_OPERATOR_H_ diff --git a/src/operator/activation_op-inl.h b/src/operator/activation_op-inl.h index e49ad2097b12..0e87020ea1c6 100644 --- a/src/operator/activation_op-inl.h +++ b/src/operator/activation_op-inl.h @@ -1,30 +1,28 @@ /*! * Copyright (c) 2015 by Contributors - * \file activation_op-inl.hpp + * \file activation_op-inl.h * \brief activation operator of mxnet */ -#ifndef MXNET_ACTIVATION_OP_INL_HPP_ -#define MXNET_ACTIVATION_OP_INL_HPP_ -#include +#ifndef MXNET_OPERATOR_ACTIVATION_OP_INL_H_ +#define MXNET_OPERATOR_ACTIVATION_OP_INL_H_ + #include -#include "./assign_helper.h" +#include +#include +#include "./operator_common.h" namespace mxnet { namespace op { template class ActivationOp : public Operator { public: - virtual void DescribeArgs(std::vector *args) { - args->clear(); - args->push_back(kDataArg); - } - virtual void InferShape(std::vector &in_shape, + virtual void InferShape(std::vector *in_shape, std::vector *out_shape) { - CHECK(in_shape.size() == 1) << "Only 1 input is allowed"; - CHECK(in_shape[0].Size() > 0) << "Must set input data shape"; - TShape out = in_shape[0]; - out_shape->push_back(out); + CHECK(in_shape->size() == 1) << "Only 1 input is allowed"; + CHECK((*in_shape)[0].ndim() != 0 ) << "Require data shape to be known"; + out_shape->clear(); + out_shape->push_back((*in_shape)[0]); } virtual void Forward(Option opt, RunContext ctx, @@ -53,12 +51,10 @@ class ActivationOp : public Operator { mshadow::Tensor grad = grad_next[0].FlatTo2D(stream); mshadow::Tensor data = in_data[0].FlatTo2D(stream); mshadow::Tensor out = in_data[0].FlatTo2D(stream); - Assign(out, mshadow::expr::F(data) * grad, req[0]); + Assign(out, req[0], mshadow::expr::F(data) * grad); } }; // class ActivationOp } // namespace op } // namespace mxnet -#endif // MXNET_ACTIVATION_OP_INL_HPP_ - - +#endif // MXNET_OPERATOR_ACTIVATION_OP_INL_H_ diff --git a/src/operator/assign_helper.h b/src/operator/assign_helper.h deleted file mode 100644 index 8926add161d0..000000000000 --- a/src/operator/assign_helper.h +++ /dev/null @@ -1,29 +0,0 @@ -/*! 
- * Copyright (c) 2015 by Contributors - * \file assign_helper.h - * \brief assign gradient - * \author Bing Xu -*/ -#ifndef MXNET_OPERATOR_ASSIGN_HELPER_H_ -#define MXNET_OPERATOR_ASSIGN_HELPER_H_ -namespace mxnet { -namespace op { -template -inline void Assign(mshadow::Tensor &out, - const Exp &exp, - const Operator::GradReqType &req) { - switch (req) { - case Operator::kNullOp: - break; - case Operator::kWriteTo: - case Operator::kWriteInplace: - out = exp; - break; - case Operator::kAddTo: - out += exp; - break; - } -} -} //namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_ASSIGN_HELPER_H_ diff --git a/src/operator/fully_connect_op-inl.h b/src/operator/fully_connect_op-inl.h index e6e39010ec1e..a7f07601b374 100644 --- a/src/operator/fully_connect_op-inl.h +++ b/src/operator/fully_connect_op-inl.h @@ -1,16 +1,16 @@ /*! * Copyright (c) 2015 by Contributors - * \file fully_connect_op-inl.hpp + * \file fully_connect_op-inl.h * \brief fully connect operator * \author Bing Xu */ +#ifndef MXNET_OPERATOR_FULLY_CONNECT_OP_INL_H_ +#define MXNET_OPERATOR_FULLY_CONNECT_OP_INL_H_ -#ifndef MXNET_FULLY_CONNECT_OP_INL_HPP_ -#define MXNET_FULLY_CONNECT_OP_INL_HPP_ - -#include #include -#include "./assign_helper.h" +#include +#include +#include "./operator_common.h" #include "./param.h" namespace mxnet { @@ -18,58 +18,55 @@ namespace op { template class FullyConnectOp : public Operator { public: - virtual void DescribeArgs(std::vector *args) { - args->clear(); - args->push_back(kDataArg); - args->push_back(kWeightArg); - args->push_back(kBiasArg); + virtual std::vector DescribeArgs() const { + ArgType ret[] = {kDataArg, kWeightArg, kBiasArg}; + if (param_.no_bias == 0) { + return std::vector(ret, ret + 3); + } else { + return std::vector(ret, ret + 2); + } } virtual void SetParam(const char *name, const char *val) { param_.SetParam(name, val); } - virtual void InferShape(std::vector &in_shape, + virtual void InferShape(std::vector *in_shape, std::vector *out_shape) { - CHECK(in_shape.size() == 3) << "Input:[data, weight, bias]"; - CHECK(param_.num_input_node > 0); - CHECK(param_.num_hidden > 0); - TShape &dshape = in_shape[0]; - TShape &wshape = in_shape[1]; - TShape &bshape = in_shape[2]; - if (wshape.Size() == 0) { - mshadow::Shape<2> ws = mshadow::Shape2(param_.num_hidden, - param_.num_input_node); - wshape = ws; + using namespace mshadow; + if (param_.no_bias == 0) { + CHECK(in_shape->size() == 3) << "Input:[data, weight, bias]"; } else { - CHECK(wshape[0] == param_.num_hidden); - CHECK(wshape[1] == param_.num_input_node); + CHECK(in_shape->size() == 2) << "Input:[data, weight]"; } - if (bshape.Size() == 0) { - mshadow::Shape<1> bs = mshadow::Shape1(param_.num_hidden); - bshape = bs; - } else { - CHECK(bshape[0] == param_.num_hidden); + CHECK(param_.num_hidden > 0); + const TShape &dshape = (*in_shape)[0]; + CHECK(dshape.ndim() == 4) << \ + "Input data should be 4D in batch-1-1-hidden"; + CHECK(dshape.ndim() != 0) << "Require data shape to be known"; + ShapeAssignCheck((*in_shape)[1], Shape2(param_.num_hidden, dshape[3])); + if (param_.no_bias == 0) { + ShapeAssignCheck((*in_shape)[2], Shape1(param_.num_hidden)); } - CHECK(dshape.ndim() == 4 && dshape[3] == param_.num_input_node) << \ - "Input data should be 4D in batch-1-1-hidden"; out_shape->clear(); out_shape->push_back(dshape); - out_shape->at(0)[3] = param_.num_hidden; + (*out_shape)[0][3] = param_.num_hidden; } virtual void Forward(Option opt, RunContext ctx, const std::vector &in_data, const std::vector &out_data) { - 
CHECK(in_data.size() == 3) << "Input:[data, weight, bias]"; + using namespace mshadow; + using namespace mshadow::expr; + size_t expected = param_.no_bias == 0 ? 3 : 2; + CHECK(in_data.size() == expected); CHECK(out_data.size() == 1); - mshadow::Stream *stream = \ - static_cast *>(ctx.stream); - mshadow::Tensor wmat = in_data[0].get(stream); - mshadow::Tensor bias = in_data[1].get(stream); - mshadow::Tensor data = in_data[2].FlatTo2D(stream); - mshadow::Tensor out = out_data[0].FlatTo2D(stream); - out = mshadow::expr::dot(data, wmat.T()); - if (!param_.no_bias) { - out += mshadow::expr::repmat(bias, data.size(0)); + Stream *s = static_cast *>(ctx.stream); + Tensor data = in_data[0].FlatTo2D(s); + Tensor wmat = in_data[1].get(s); + Tensor out = out_data[0].FlatTo2D(s); + out = dot(data, wmat.T()); + if (param_.no_bias == 0) { + Tensor bias = in_data[2].get(s); + out += repmat(bias, data.size(0)); } } virtual void Backward(RunContext ctx, @@ -77,28 +74,29 @@ class FullyConnectOp : public Operator { const std::vector &in_data, const std::vector &out_grad, const std::vector &req) { + using namespace mshadow; + using namespace mshadow::expr; CHECK(grad_next.size() == 1); - CHECK(in_data.size() == 3) << "Input: [data, weight, bias]"; - CHECK(out_grad.size() == 3) << "Output: [gdata, gweight, gbias]"; + size_t expected = param_.no_bias == 0 ? 3 : 2; + CHECK(in_data.size() == expected && out_grad.size() == expected); CHECK(req.size() == 3); - mshadow::Stream *stream = \ - static_cast *>(ctx.stream); - mshadow::Tensor data = in_data[0].FlatTo2D(stream); - mshadow::Tensor wmat = in_data[1].get(stream); - mshadow::Tensor grad = grad_next[0].FlatTo2D(stream); - mshadow::Tensor gdata = out_grad[0].FlatTo2D(stream); - mshadow::Tensor gwmat = out_grad[1].get(stream); - mshadow::Tensor gbias = out_grad[2].get(stream); + Stream *s = static_cast *>(ctx.stream); + Tensor data = in_data[0].FlatTo2D(s); + Tensor wmat = in_data[1].get(s); + Tensor grad = grad_next[0].FlatTo2D(s); // backprop - CHECK(req[0] != kWriteInplace); - Assign(gwmat, mshadow::expr::dot(grad.T(), data), req[0]); - if (!param_.no_bias) { - Assign(gbias, mshadow::expr::sum_rows(grad), req[1]); - } - if (req[0] != kNullOp) { - CHECK(req[0] != kWriteInplace); - Assign(gdata, mshadow::expr::dot(grad, wmat), req[2]); + CHECK(req[1] != kWriteInplace) << "cannot write weight inplace"; + // gradient of weight + Tensor gwmat = out_grad[1].get(s); + Assign(gwmat, req[1], dot(grad.T(), data)); + // gradient of bias + if (param_.no_bias == 0) { + Tensor gbias = out_grad[2].get(s); + Assign(gbias, req[2], sum_rows(grad)); } + // gradient of data + Tensor gdata = out_grad[0].FlatTo2D(s); + Assign(gdata, req[0], dot(grad, wmat)); } private: Param param_; @@ -106,6 +104,5 @@ class FullyConnectOp : public Operator { } // namespace op } // namespace mxnet -#endif // MXNET_FULLY_CONNECT_OP_INL_HPP - +#endif // MXNET_OPERATOR_FULLY_CONNECT_OP_INL_H_ diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index ac035b395b21..7c2f0c7b6a76 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -1,13 +1,12 @@ /*! 
* Copyright (c) 2015 by Contributors - * \file op.h + * \file mshadow_op.h * \brief extra mshadow operation for mxnet * \author Bing Xu */ #ifndef MXNET_MSHADOW_OPERATOR_OP_H_ #define MXNET_MSHADOW_OPERATOR_OP_H_ -#pragma once - +#include #include namespace mxnet { @@ -105,5 +104,3 @@ struct square_root { #endif // MXNET_MSHADOW_OPERATOR_OP_H_ - - diff --git a/src/operator/operator-inl.h b/src/operator/operator-inl.h new file mode 100644 index 000000000000..7bdd0a1b96d1 --- /dev/null +++ b/src/operator/operator-inl.h @@ -0,0 +1,35 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file operator-inl.h + * \brief device invarient code to create operators + * \author Bing Xu +*/ +#ifndef MXNET_OPERATOR_INL_H_ +#define MXNET_OPERATOR_INL_H_ +#include +#include +#include "./mshadow_op.h" +#include "./activation_op-inl.h" +#include "./fully_connect_op-inl.h" + +namespace mxnet { +namespace op { +/*! + * \brief device invariant function to create operators + * \param type the type of operator + * \tparam xpu the device type we are at + */ +template +inline Operator *CreateOperator_(OpType type) { + switch (type) { + case kReLU: + return new ActivationOp(); + case kFullc: + return new FullyConnectOp(); + default: LOG(FATAL) << "unknown OpType"; + } + return NULL; +} +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_INL_H_ diff --git a/src/operator/operator.cc b/src/operator/operator.cc index 1cab17a62a67..e56d6049eca9 100644 --- a/src/operator/operator.cc +++ b/src/operator/operator.cc @@ -1,18 +1,39 @@ -/*! - * Copyright (c) 2015 by Contributors - * \file operator.cc - * \brief - * \author Bing Xu -*/ -#include "operator_helper.h" +#include +#include +#include +#include +#include "./operator_common.h" namespace mxnet { namespace op { +// declare the operator +template +Operator *CreateOperator(OpType type); -Operator *CreateOperator(OpType type) { - return OperatorFactory(type); + +OpType GetOpTpe(const char *type) { + if (!strcmp(type, "relu")) return kReLU; + if (!strcmp(type, "fullc")) return kFullc; + LOG(FATAL) << "unknown operator type " << type; + return kReLU; +} } -} // namespace op -} // namespace mxnet +// implementing the context +Operator *Operator::Create(const char *type, + Context ctx) { + op::OpType otype = op::GetOpTpe(type); + if (ctx.dev_mask == cpu::kDevMask) { + return op::CreateOperator(otype); + } + if (ctx.dev_mask == gpu::kDevMask) { +#if MXNET_USE_CUDA + return op::CreateOperator(otype); +#else + LOG(FATAL) << "GPU is not enabled"; +#endif + } + return NULL; +} +} // namespace mxnet diff --git a/src/operator/operator.cu b/src/operator/operator.cu deleted file mode 100644 index 3cc1ada28e4b..000000000000 --- a/src/operator/operator.cu +++ /dev/null @@ -1,20 +0,0 @@ -/*! - * Copyright (c) 2015 by Contributors - * \file operator.cu - * \brief - * \author Bing Xu -*/ - - -#include "operator_helper.h" - -namespace mxnet { -namespace op { - -Operator * CreateOperator(OpType type) { - return OperatorFactory(type); -} - -} // namespace op -} // namespace mxnet - diff --git a/src/operator/operator_common.h b/src/operator/operator_common.h new file mode 100644 index 000000000000..8fb1066333b3 --- /dev/null +++ b/src/operator/operator_common.h @@ -0,0 +1,67 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file operator_common.h + * \brief common internal header of most operators + * this header includes utility functions operator can use + * common type definitions + * \author Bing Xu +*/ +#ifndef MXNET_OPERATOR_OPERATOR_COMMON_H_ +#define MXNET_OPERATOR_OPERATOR_COMMON_H_ + +#include +#include + +namespace mxnet { +namespace op { +/*! + * \brief assign the expression to out according to request + * \param out the data to be assigned + * \param req the assignment request + * \param exp the expression + * \tparam OType output type + * \tparam Exp expression type + */ +template +inline void Assign(OType &out, + Operator::GradReqType req, + const Exp &exp) { + switch (req) { + case Operator::kNullOp: break; + case Operator::kWriteTo: + case Operator::kWriteInplace: out = exp; break; + case Operator::kAddTo: out += exp; break; + default: LOG(FATAL) << "not reached"; + } +} +/*! + * \brief assign shape to out if out is unknown + * otherwise check consistency + * \param out the output shape to be stored + * \param shape the infered shape + */ +template +inline void ShapeAssignCheck(TShape &out, const TS &shape) { + if (out.ndim() == 0) { + out = shape; + } else { + CHECK(out == shape) << "InferShape:: shape inconsistent"; + } +} + +/*! \brief type of operators */ +enum OpType { + kReLU = 0, + kFullc = 1 +}; + +/*! + * \brief device invariant function to create operators + * \param type the type of operator + * \tparam xpu the device type we are at + */ +template +Operator *CreateOperator(OpType type); +} //namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_COMMON_H_ diff --git a/src/operator/operator_cpu.cc b/src/operator/operator_cpu.cc new file mode 100644 index 000000000000..3d5e7c5f3248 --- /dev/null +++ b/src/operator/operator_cpu.cc @@ -0,0 +1,18 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file operator_cpu.cc + * \brief CPU specialization of operator codes + * \author Bing Xu +*/ +#include "./operator-inl.h" + +namespace mxnet { +namespace op { + +template<> +Operator *CreateOperator(OpType type) { + return CreateOperator_(type); +} + +} // namespace op +} // namespace mxnet diff --git a/src/operator/operator_gpu.cu b/src/operator/operator_gpu.cu new file mode 100644 index 000000000000..8fb3b2751f13 --- /dev/null +++ b/src/operator/operator_gpu.cu @@ -0,0 +1,21 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file operator_gpu.cu + * \brief GPU specialization of operator code + * \author Bing Xu +*/ +#include +#include +#include "operator-inl.h" + +namespace mxnet { +namespace op { + +template<> +Operator *CreateOperator(OpType type) { + return CreateOperator_(type); +} + +} // namespace op +} // namespace mxnet + diff --git a/src/operator/operator_helper.h b/src/operator/operator_helper.h deleted file mode 100644 index 9a673b12cedb..000000000000 --- a/src/operator/operator_helper.h +++ /dev/null @@ -1,35 +0,0 @@ -/*! 
- * Copyright (c) 2015 by Contributors - * \file assign_helper.h - * \brief - * \author Bing Xu -*/ -#ifndef MXNET_OPERATOR_HELPER_H_ -#define MXNET_OPERATOR_HELPER_H_ -#include "activation_op-inl.h" -#include "fully_connect_op-inl.h" -#include "mshadow_op.h" - -namespace mxnet { -namespace op { - -enum OpType { - kReLU = 0, - kFullc = 1, -}; - -template -Operator *OperatorFactory(OpType type) { - switch (type) { - case kReLU: - return new ActivationOp(); - case kFullc: - return new FullyConnectOp(); - - }; - return NULL; -} - -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_HELPER_H_ diff --git a/src/operator/param.h b/src/operator/param.h index c17555898c0a..0d8016983c5a 100644 --- a/src/operator/param.h +++ b/src/operator/param.h @@ -6,10 +6,10 @@ */ #ifndef MXNET_OPERATOR_PARAM_H_ #define MXNET_OPERATOR_PARAM_H_ -#pragma once namespace mxnet { namespace op { +/*! \brief possible parameter for each operator */ struct Param { /*! \brief number of hidden layers */ int num_hidden; From 9c214c844720ed164b19b99b974c7cd7c7747f6b Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Sun, 21 Jun 2015 12:07:42 -0600 Subject: [PATCH 10/12] minor in act --- include/mxnet/operator.h | 7 ++++--- src/operator/activation_op-inl.h | 5 +++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h index c0179990058d..0dd6eb935b82 100644 --- a/include/mxnet/operator.h +++ b/include/mxnet/operator.h @@ -69,7 +69,7 @@ class Operator { * in_shape allows unknown elements, which are checked by shape.ndim() == 0. * For unknown shapes, InferShape will try to fill in the correct Shape in in_shape * For known shapes, InferShape will check shape consistency - * + * * common practice: set the shape of data input, and usually weight's shape can be infered * * \param out_shape the shape of outputs of the operator @@ -81,7 +81,7 @@ class Operator { * \brief perform a forward operation of operator, save the output to TBlob * \param opt option on Forward such as whether this is training phase * \param ctx runtime context - * \param in_data array of input data + * \param in_data array of input data, it is const * \param out_data array of output data, * the space of TBlob in out_data must be pre-allocated with InferShape */ @@ -97,6 +97,7 @@ class Operator { * \param out_grad array of output gradient, there could be three possible TBlob * in the each element in the array * \param req_types request types of the gradient saving operation + * only inplace will change input data * \sa GradReqType */ virtual void Backward(RunContext ctx, @@ -104,7 +105,7 @@ class Operator { const std::vector &in_data, const std::vector &out_grad, const std::vector &req); - + /*! 
* \brief factory unction, create a new operator * \param type the type of operator diff --git a/src/operator/activation_op-inl.h b/src/operator/activation_op-inl.h index 0e87020ea1c6..e02da0109aef 100644 --- a/src/operator/activation_op-inl.h +++ b/src/operator/activation_op-inl.h @@ -50,8 +50,9 @@ class ActivationOp : public Operator { static_cast *>(ctx.stream); mshadow::Tensor grad = grad_next[0].FlatTo2D(stream); mshadow::Tensor data = in_data[0].FlatTo2D(stream); - mshadow::Tensor out = in_data[0].FlatTo2D(stream); - Assign(out, req[0], mshadow::expr::F(data) * grad); + mshadow::Tensor out = out_grad[0].FlatTo2D(stream); + Assign(out, req[0], mshadow::expr::F( + mshadow::expr::F(data)) * grad); } }; // class ActivationOp } // namespace op From 7a1296d5273cb7d013ead7e615ce0ffa552958ad Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Sun, 21 Jun 2015 12:12:30 -0600 Subject: [PATCH 11/12] remove inplace check --- src/operator/activation_op-inl.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/operator/activation_op-inl.h b/src/operator/activation_op-inl.h index e02da0109aef..2a412ef3b2e1 100644 --- a/src/operator/activation_op-inl.h +++ b/src/operator/activation_op-inl.h @@ -45,7 +45,6 @@ class ActivationOp : public Operator { CHECK(in_data.size() == 1); CHECK(out_grad.size() == 1); CHECK(req.size() == 1); - CHECK(req[0] == kWriteInplace); mshadow::Stream *stream = \ static_cast *>(ctx.stream); mshadow::Tensor grad = grad_next[0].FlatTo2D(stream); From 4d175e439e03634e1c8b4847ec39171abd468a82 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Sun, 21 Jun 2015 12:59:49 -0600 Subject: [PATCH 12/12] add property --- include/mxnet/operator.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h index 0dd6eb935b82..a9b3c9f2b3ae 100644 --- a/include/mxnet/operator.h +++ b/include/mxnet/operator.h @@ -47,6 +47,12 @@ class Operator { /*! \brief bias argument */ kBiasArg = 2 }; + enum Property { + /*! \brief Op contains interanl state, won't influence engine schedule */ + kContainInteralState = 1, + /*! \brief Op forward require random number, will influence engine schedule */ + kForwardRequireRnd = 2, + }; /*! * \brief get types of input argument of this oeprator * \return a vector corresponding to type of each argument @@ -56,6 +62,14 @@ class Operator { // default most of layers only have one data argument return std::vector(1, kDataArg); } + /*! + * \brief describe property of op + * \return a bit map in int + */ + virtual int DescribeProperty() const { + // default most of layer only conatin internal state + return kContainInteralState; + } /*! * \brief set param for the operator from string * \param name parameter name
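
The DescribeProperty() contract added in the last patch returns a bit map, so multiple Property flags can be combined with bitwise OR and queried with bitwise AND. Below is a minimal standalone sketch of that convention; it is illustrative only, uses hypothetical names, compiles without any mxnet/mshadow dependency, and is not part of the patches above.

#include <cstdio>

// Mirror of the Property flags declared in include/mxnet/operator.h above.
enum Property {
  kContainInteralState = 1,  // op keeps internal state; does not influence engine schedule
  kForwardRequireRnd = 2     // op needs random numbers in Forward; influences engine schedule
};

// A hypothetical dropout-like operator would report both flags at once.
// (Illustrative helper, not an mxnet function.)
inline int DescribeDropoutLikeProperty() {
  return kContainInteralState | kForwardRequireRnd;
}

int main() {
  int prop = DescribeDropoutLikeProperty();
  if (prop & kForwardRequireRnd) {
    // an engine could use this bit to attach a random-number resource to Forward
    std::printf("operator requests a random number generator\n");
  }
  if (prop & kContainInteralState) {
    std::printf("operator carries internal state\n");
  }
  return 0;
}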