From 88e8861ef198432b811d9eac1e2ceccf4842ec39 Mon Sep 17 00:00:00 2001
From: Seanlinx <515364970@qq.com>
Date: Sat, 21 May 2016 21:26:56 +0800
Subject: [PATCH 1/2] modify smooth_l1 and softmax_output

---
 src/operator/smooth_l1_unary-inl.h | 10 +++++-----
 src/operator/softmax_output-inl.h  |  8 ++++++--
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/operator/smooth_l1_unary-inl.h b/src/operator/smooth_l1_unary-inl.h
index 2b81c765f62e..8acdf026d0eb 100644
--- a/src/operator/smooth_l1_unary-inl.h
+++ b/src/operator/smooth_l1_unary-inl.h
@@ -72,8 +72,8 @@ void SmoothL1Forward_(const TBlob& src,
     << "Unary function only support input/output with the same type";
   real_t sigma2 = env.scalar * env.scalar;
   MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, {
-    mshadow::Tensor<xpu, 2, DType> out = ret->get<xpu, 2, DType>(s);
-    mshadow::Tensor<xpu, 2, DType> in = src.get<xpu, 2, DType>(s);
+    mshadow::Tensor<xpu, 4, DType> out = ret->get<xpu, 4, DType>(s);
+    mshadow::Tensor<xpu, 4, DType> in = src.get<xpu, 4, DType>(s);
     ASSIGN_DISPATCH(out, req,
                     F<mshadow_op::smooth_l1_loss>(in, ScalarExp<DType>(sigma2)));
   });
@@ -95,9 +95,9 @@ void SmoothL1BackwardUseIn_(const OutputGrad& out_grad,
     << "Unary function only support input/output with the same type";
   real_t sigma2 = env.scalar * env.scalar;
   MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, {
-    mshadow::Tensor<xpu, 2, DType> src = in_data0.data.get<xpu, 2, DType>(s);
-    mshadow::Tensor<xpu, 2, DType> ograd = out_grad.data.get<xpu, 2, DType>(s);
-    mshadow::Tensor<xpu, 2, DType> igrad = in_grad->get<xpu, 2, DType>(s);
+    mshadow::Tensor<xpu, 4, DType> src = in_data0.data.get<xpu, 4, DType>(s);
+    mshadow::Tensor<xpu, 4, DType> ograd = out_grad.data.get<xpu, 4, DType>(s);
+    mshadow::Tensor<xpu, 4, DType> igrad = in_grad->get<xpu, 4, DType>(s);
     ASSIGN_DISPATCH(igrad, req,
                     ograd * F<mshadow_op::smooth_l1_gradient>(src, ScalarExp<DType>(sigma2)));
   });
diff --git a/src/operator/softmax_output-inl.h b/src/operator/softmax_output-inl.h
index 2d1d8f6d12b6..141b54e35659 100644
--- a/src/operator/softmax_output-inl.h
+++ b/src/operator/softmax_output-inl.h
@@ -98,6 +98,8 @@ class SoftmaxOutputOp : public Operator {
       Tensor<xpu, 2, DType> label = in_data[softmaxout_enum::kLabel].FlatTo2D<xpu, DType>(s);
       Tensor<xpu, 3, DType> out =
           out_data[softmaxout_enum::kOut].get_with_shape<xpu, 3, DType>(s3, s);
+      Tensor<xpu, 3, DType> o_grad =
+          out_grad[softmaxout_enum::kOut].get_with_shape<xpu, 3, DType>(s3, s);
       Tensor<xpu, 3, DType> grad =
           in_grad[softmaxout_enum::kData].get_with_shape<xpu, 3, DType>(s3, s);
       if (param_.use_ignore) {
@@ -105,7 +107,9 @@ class SoftmaxOutputOp : public Operator {
       } else {
         SoftmaxGrad(grad, out, label);
       }
-      grad *= DType(param_.grad_scale/s3[2]);
+//      grad *= DType(param_.grad_scale/s3[2]);
+      grad *= DType(param_.grad_scale);
+      grad *= o_grad;
     } else {
       const TShape& label_shape = in_data[softmaxout_enum::kLabel].shape_;
       Tensor<xpu, 1, DType> label = in_data[softmaxout_enum::kLabel].get_with_shape<xpu, 1, DType>(
@@ -199,7 +203,7 @@ class SoftmaxOutputProp : public OperatorProperty {
     const std::vector<int> &out_grad,
     const std::vector<int> &in_data,
     const std::vector<int> &out_data) const override {
-    return {in_data[softmaxout_enum::kLabel], out_data[softmaxout_enum::kOut]};
+    return {out_grad[softmaxout_enum::kOut], in_data[softmaxout_enum::kLabel], out_data[softmaxout_enum::kOut]};
   }
 
   std::vector<std::pair<int, void*> > BackwardInplaceOption(

From ba52895a38525baaf5393931add2a2fe09edbbe9 Mon Sep 17 00:00:00 2001
From: Seanlinx <515364970@qq.com>
Date: Sun, 22 May 2016 00:06:05 +0800
Subject: [PATCH 2/2] modify softmax_output and smooth_l1 operator

---
 src/operator/smooth_l1_unary-inl.h | 47 +++++++++++++++++++++---------
 src/operator/softmax_output-inl.h  | 25 +++++++++++-----
 2 files changed, 52 insertions(+), 20 deletions(-)

diff --git a/src/operator/smooth_l1_unary-inl.h b/src/operator/smooth_l1_unary-inl.h
index 8acdf026d0eb..0094abba0013 100644
--- a/src/operator/smooth_l1_unary-inl.h
+++ b/src/operator/smooth_l1_unary-inl.h
@@ -71,12 +71,22 @@ void SmoothL1Forward_(const TBlob& src,
   CHECK_EQ(ret->type_flag_, src.type_flag_)
     << "Unary function only support input/output with the same type";
   real_t sigma2 = env.scalar * env.scalar;
-  MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, {
-    mshadow::Tensor<xpu, 4, DType> out = ret->get<xpu, 4, DType>(s);
-    mshadow::Tensor<xpu, 4, DType> in = src.get<xpu, 4, DType>(s);
-    ASSIGN_DISPATCH(out, req,
-                    F<mshadow_op::smooth_l1_loss>(in, ScalarExp<DType>(sigma2)));
-  });
+  const int ndim = ret[0].shape_.ndim();
+  if (ndim == 4) {
+    MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, {
+      mshadow::Tensor<xpu, 4, DType> out = ret->get<xpu, 4, DType>(s);
+      mshadow::Tensor<xpu, 4, DType> in = src.get<xpu, 4, DType>(s);
+      ASSIGN_DISPATCH(out, req,
+                      F<mshadow_op::smooth_l1_loss>(in, ScalarExp<DType>(sigma2)));
+    });
+  } else if (ndim == 2) {
+    MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, {
+      mshadow::Tensor<xpu, 2, DType> out = ret->get<xpu, 2, DType>(s);
+      mshadow::Tensor<xpu, 2, DType> in = src.get<xpu, 2, DType>(s);
+      ASSIGN_DISPATCH(out, req,
+                      F<mshadow_op::smooth_l1_loss>(in, ScalarExp<DType>(sigma2)));
+    });
+  }
 }
 
 template<typename xpu>
@@ -94,13 +104,24 @@ void SmoothL1BackwardUseIn_(const OutputGrad& out_grad,
   CHECK_EQ(in_grad->type_flag_, in_data0.data.type_flag_)
     << "Unary function only support input/output with the same type";
   real_t sigma2 = env.scalar * env.scalar;
-  MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, {
-    mshadow::Tensor<xpu, 4, DType> src = in_data0.data.get<xpu, 4, DType>(s);
-    mshadow::Tensor<xpu, 4, DType> ograd = out_grad.data.get<xpu, 4, DType>(s);
-    mshadow::Tensor<xpu, 4, DType> igrad = in_grad->get<xpu, 4, DType>(s);
-    ASSIGN_DISPATCH(igrad, req,
-                    ograd * F<mshadow_op::smooth_l1_gradient>(src, ScalarExp<DType>(sigma2)));
-  });
+  const int ndim = in_grad[0].shape_.ndim();
+  if (ndim == 4) {
+    MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, {
+      mshadow::Tensor<xpu, 4, DType> src = in_data0.data.get<xpu, 4, DType>(s);
+      mshadow::Tensor<xpu, 4, DType> ograd = out_grad.data.get<xpu, 4, DType>(s);
+      mshadow::Tensor<xpu, 4, DType> igrad = in_grad->get<xpu, 4, DType>(s);
+      ASSIGN_DISPATCH(igrad, req,
+                      ograd * F<mshadow_op::smooth_l1_gradient>(src, ScalarExp<DType>(sigma2)));
+    });
+  } else if (ndim == 2) {
+    MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, {
+      mshadow::Tensor<xpu, 2, DType> src = in_data0.data.get<xpu, 2, DType>(s);
+      mshadow::Tensor<xpu, 2, DType> ograd = out_grad.data.get<xpu, 2, DType>(s);
+      mshadow::Tensor<xpu, 2, DType> igrad = in_grad->get<xpu, 2, DType>(s);
+      ASSIGN_DISPATCH(igrad, req,
+                      ograd * F<mshadow_op::smooth_l1_gradient>(src, ScalarExp<DType>(sigma2)));
+    });
+  }
 }
 
 MXNET_REGISTER_SIMPLE_OP(smooth_l1, XPU)
diff --git a/src/operator/softmax_output-inl.h b/src/operator/softmax_output-inl.h
index 141b54e35659..6bac3049b2a9 100644
--- a/src/operator/softmax_output-inl.h
+++ b/src/operator/softmax_output-inl.h
@@ -30,6 +30,7 @@ struct SoftmaxOutputParam : public dmlc::Parameter<SoftmaxOutputParam> {
   float ignore_label;
   bool multi_output;
   bool use_ignore;
+  bool is_hidden_layer;
   DMLC_DECLARE_PARAMETER(SoftmaxOutputParam) {
     DMLC_DECLARE_FIELD(grad_scale).set_default(1.0f)
     .describe("Scale the gradient by a float factor");
@@ -43,6 +44,8 @@ struct SoftmaxOutputParam : public dmlc::Parameter<SoftmaxOutputParam> {
     DMLC_DECLARE_FIELD(use_ignore).set_default(false)
     .describe("If set to true, the ignore_label value will not contribute "
               "to the backward gradient");
+    DMLC_DECLARE_FIELD(is_hidden_layer).set_default(false)
+    .describe("If set to true, out_grad is needed in backward");
   };
 };
 
@@ -98,8 +101,6 @@ class SoftmaxOutputOp : public Operator {
       Tensor<xpu, 2, DType> label = in_data[softmaxout_enum::kLabel].FlatTo2D<xpu, DType>(s);
       Tensor<xpu, 3, DType> out =
           out_data[softmaxout_enum::kOut].get_with_shape<xpu, 3, DType>(s3, s);
-      Tensor<xpu, 3, DType> o_grad =
-          out_grad[softmaxout_enum::kOut].get_with_shape<xpu, 3, DType>(s3, s);
       Tensor<xpu, 3, DType> grad =
           in_grad[softmaxout_enum::kData].get_with_shape<xpu, 3, DType>(s3, s);
       if (param_.use_ignore) {
@@ -107,9 +108,15 @@ class SoftmaxOutputOp : public Operator {
       } else {
         SoftmaxGrad(grad, out, label);
       }
-//      grad *= DType(param_.grad_scale/s3[2]);
-      grad *= DType(param_.grad_scale);
-      grad *= o_grad;
+      if (!param_.is_hidden_layer) {
+        grad *= DType(param_.grad_scale/s3[2]);
+      }
+      else {
+        Tensor<xpu, 3, DType> o_grad =
+            out_grad[softmaxout_enum::kOut].get_with_shape<xpu, 3, DType>(s3, s);
+        grad *= DType(param_.grad_scale);
+        grad *= o_grad;
+      }
     } else {
       const TShape& label_shape = in_data[softmaxout_enum::kLabel].shape_;
       Tensor<xpu, 1, DType> label = in_data[softmaxout_enum::kLabel].get_with_shape<xpu, 1, DType>(
@@ -203,9 +210,13 @@ class SoftmaxOutputProp : public OperatorProperty {
     const std::vector<int> &out_grad,
     const std::vector<int> &in_data,
     const std::vector<int> &out_data) const override {
-    return {out_grad[softmaxout_enum::kOut], in_data[softmaxout_enum::kLabel], out_data[softmaxout_enum::kOut]};
+    if (param_.is_hidden_layer) {
+      return {out_grad[softmaxout_enum::kOut], in_data[softmaxout_enum::kLabel], out_data[softmaxout_enum::kOut]};
+    }
+    else {
+      return {in_data[softmaxout_enum::kLabel], out_data[softmaxout_enum::kOut]};
+    }
   }
-
   std::vector<std::pair<int, void*> > BackwardInplaceOption(
     const std::vector<int> &out_grad,
     const std::vector<int> &in_data,
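For reference, a minimal Python sketch of how the patched operators might be wired together from a build that includes the two patches above (e.g. an RPN-style head). The symbol names, shapes, and the `scalar`/`grad_scale` values are illustrative assumptions, not taken from the patches; only `is_hidden_layer` and the 2-D/4-D handling of `smooth_l1` come from the changes shown here.

```python
import mxnet as mx

# Hypothetical inputs; shapes are only indicative (A = anchors per location).
cls_score   = mx.symbol.Variable('cls_score')    # (batch, 2*A, H, W)
bbox_pred   = mx.symbol.Variable('bbox_pred')    # (batch, 4*A, H, W)
label       = mx.symbol.Variable('label')
bbox_target = mx.symbol.Variable('bbox_target')

# With the patch, is_hidden_layer=True makes backward request out_grad and
# compute grad * grad_scale * out_grad instead of grad * grad_scale / s3[2],
# so the softmax output can feed further layers rather than act as a pure loss.
cls_prob = mx.symbol.SoftmaxOutput(data=cls_score, label=label,
                                   multi_output=True, use_ignore=True,
                                   ignore_label=-1, is_hidden_layer=True,
                                   name='cls_prob')

# smooth_l1 is the simple op registered by MXNET_REGISTER_SIMPLE_OP; after
# PATCH 2/2 it dispatches on ndim, so the 4-D regression tensor can be fed
# directly (scalar is sigma of the smooth L1 transition point).
bbox_loss = mx.symbol.smooth_l1(data=(bbox_pred - bbox_target),
                                scalar=3.0, name='bbox_loss_')
bbox_loss = mx.symbol.MakeLoss(data=bbox_loss, grad_scale=1.0,
                               name='bbox_loss')

group = mx.symbol.Group([cls_prob, bbox_loss])
```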