diff --git a/src/operator/smooth_l1_unary-inl.h b/src/operator/smooth_l1_unary-inl.h
index 2b81c765f62e..0094abba0013 100644
--- a/src/operator/smooth_l1_unary-inl.h
+++ b/src/operator/smooth_l1_unary-inl.h
@@ -71,12 +71,22 @@ void SmoothL1Forward_(const TBlob& src,
   CHECK_EQ(ret->type_flag_, src.type_flag_)
     << "Unary function only support input/output with the same type";
   real_t sigma2 = env.scalar * env.scalar;
-  MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, {
-    mshadow::Tensor<xpu, 2, DType> out = ret->get<xpu, 2, DType>(s);
-    mshadow::Tensor<xpu, 2, DType> in = src.get<xpu, 2, DType>(s);
-    ASSIGN_DISPATCH(out, req,
-                    F<mshadow_op::smooth_l1_loss>(in, ScalarExp<DType>(sigma2)));
-  });
+  const int ndim = ret[0].shape_.ndim();
+  if (ndim == 4) {
+    MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, {
+      mshadow::Tensor<xpu, 4, DType> out = ret->get<xpu, 4, DType>(s);
+      mshadow::Tensor<xpu, 4, DType> in = src.get<xpu, 4, DType>(s);
+      ASSIGN_DISPATCH(out, req,
+                      F<mshadow_op::smooth_l1_loss>(in, ScalarExp<DType>(sigma2)));
+    });
+  } else if (ndim == 2) {
+    MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, {
+      mshadow::Tensor<xpu, 2, DType> out = ret->get<xpu, 2, DType>(s);
+      mshadow::Tensor<xpu, 2, DType> in = src.get<xpu, 2, DType>(s);
+      ASSIGN_DISPATCH(out, req,
+                      F<mshadow_op::smooth_l1_loss>(in, ScalarExp<DType>(sigma2)));
+    });
+  }
 }
 
 template<typename xpu>
@@ -94,13 +104,24 @@ void SmoothL1BackwardUseIn_(const OutputGrad& out_grad,
   CHECK_EQ(in_grad->type_flag_, in_data0.data.type_flag_)
     << "Unary function only support input/output with the same type";
   real_t sigma2 = env.scalar * env.scalar;
-  MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, {
-    mshadow::Tensor<xpu, 2, DType> src = in_data0.data.get<xpu, 2, DType>(s);
-    mshadow::Tensor<xpu, 2, DType> ograd = out_grad.data.get<xpu, 2, DType>(s);
-    mshadow::Tensor<xpu, 2, DType> igrad = in_grad->get<xpu, 2, DType>(s);
-    ASSIGN_DISPATCH(igrad, req,
-                    ograd * F<mshadow_op::smooth_l1_gradient>(src, ScalarExp<DType>(sigma2)));
-  });
+  const int ndim = in_grad[0].shape_.ndim();
+  if (ndim == 4) {
+    MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, {
+      mshadow::Tensor<xpu, 4, DType> src = in_data0.data.get<xpu, 4, DType>(s);
+      mshadow::Tensor<xpu, 4, DType> ograd = out_grad.data.get<xpu, 4, DType>(s);
+      mshadow::Tensor<xpu, 4, DType> igrad = in_grad->get<xpu, 4, DType>(s);
+      ASSIGN_DISPATCH(igrad, req,
+                      ograd * F<mshadow_op::smooth_l1_gradient>(src, ScalarExp<DType>(sigma2)));
+    });
+  } else if (ndim == 2) {
+    MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, {
+      mshadow::Tensor<xpu, 2, DType> src = in_data0.data.get<xpu, 2, DType>(s);
+      mshadow::Tensor<xpu, 2, DType> ograd = out_grad.data.get<xpu, 2, DType>(s);
+      mshadow::Tensor<xpu, 2, DType> igrad = in_grad->get<xpu, 2, DType>(s);
+      ASSIGN_DISPATCH(igrad, req,
+                      ograd * F<mshadow_op::smooth_l1_gradient>(src, ScalarExp<DType>(sigma2)));
+    });
+  }
 }
 
 MXNET_REGISTER_SIMPLE_OP(smooth_l1, XPU)
diff --git a/src/operator/softmax_output-inl.h b/src/operator/softmax_output-inl.h
index 2d1d8f6d12b6..6bac3049b2a9 100644
--- a/src/operator/softmax_output-inl.h
+++ b/src/operator/softmax_output-inl.h
@@ -30,6 +30,7 @@ struct SoftmaxOutputParam : public dmlc::Parameter<SoftmaxOutputParam> {
   float ignore_label;
   bool multi_output;
   bool use_ignore;
+  bool is_hidden_layer;
   DMLC_DECLARE_PARAMETER(SoftmaxOutputParam) {
     DMLC_DECLARE_FIELD(grad_scale).set_default(1.0f)
     .describe("Scale the gradient by a float factor");
@@ -43,6 +44,8 @@ struct SoftmaxOutputParam : public dmlc::Parameter<SoftmaxOutputParam> {
     DMLC_DECLARE_FIELD(use_ignore).set_default(false)
     .describe("If set to true, the ignore_label value will not contribute "
               "to the backward gradient");
+    DMLC_DECLARE_FIELD(is_hidden_layer).set_default(false)
+    .describe("If set to true, out_grad is needed in backward");
   };
 };
 
@@ -105,7 +108,15 @@ class SoftmaxOutputOp : public Operator {
       } else {
         SoftmaxGrad(grad, out, label);
       }
-      grad *= DType(param_.grad_scale/s3[2]);
+      if (!param_.is_hidden_layer) {
+        grad *= DType(param_.grad_scale/s3[2]);
+      }
+      else {
+        Tensor<xpu, 3, DType> o_grad =
+          out_grad[softmaxout_enum::kOut].get_with_shape<xpu, 3, DType>(s3, s);
+        grad *= DType(param_.grad_scale);
+        grad *= o_grad;
+      }
     } else {
       const TShape& label_shape = in_data[softmaxout_enum::kLabel].shape_;
       Tensor<xpu, 1, DType> label = in_data[softmaxout_enum::kLabel].get_with_shape<xpu, 1, DType>(
@@ -199,9 +210,13 @@ class SoftmaxOutputProp : public OperatorProperty {
                                            const std::vector<int> &out_grad,
                                            const std::vector<int> &in_data,
                                            const std::vector<int> &out_data) const override {
-    return {in_data[softmaxout_enum::kLabel], out_data[softmaxout_enum::kOut]};
+    if (param_.is_hidden_layer) {
+      return {out_grad[softmaxout_enum::kOut], in_data[softmaxout_enum::kLabel], out_data[softmaxout_enum::kOut]};
+    }
+    else {
+      return {in_data[softmaxout_enum::kLabel], out_data[softmaxout_enum::kOut]};
+    }
   }
-
   std::vector<std::pair<int, void*> > BackwardInplaceOption(
     const std::vector<int> &out_grad,
     const std::vector<int> &in_data,
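
Note on the SoftmaxOutput change above: with is_hidden_layer=false the backward pass keeps the existing behaviour (the SoftmaxGrad result is scaled by grad_scale/s3[2]); with is_hidden_layer=true it instead scales the result by grad_scale and multiplies it elementwise by the incoming out_grad, which DeclareBackwardDependency now requests. A minimal standalone sketch of that arithmetic for one sample follows (plain C++, not MXNet code; the array values are illustrative assumptions):

// sketch_is_hidden_layer_backward.cc -- illustrates only the new backward branch.
#include <cstdio>
#include <vector>

int main() {
  const float grad_scale = 1.0f;
  // Assumed values: softmax output p, one-hot label y, incoming gradient o_grad.
  std::vector<float> p      = {0.7f, 0.2f, 0.1f};
  std::vector<float> y      = {1.0f, 0.0f, 0.0f};
  std::vector<float> o_grad = {0.5f, -0.3f, 0.1f};

  // SoftmaxGrad(grad, out, label) leaves grad = p - one_hot(label).
  std::vector<float> grad(p.size());
  for (size_t i = 0; i < p.size(); ++i) grad[i] = p[i] - y[i];

  // is_hidden_layer == true branch: grad *= grad_scale; grad *= o_grad (elementwise).
  for (size_t i = 0; i < grad.size(); ++i) grad[i] = grad[i] * grad_scale * o_grad[i];

  for (float g : grad) std::printf("%f\n", g);
  return 0;
}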