diff --git a/src/common/cuda_utils.cc b/src/common/cuda_utils.cc
new file mode 100644
index 000000000000..f38b2f8b5490
--- /dev/null
+++ b/src/common/cuda_utils.cc
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2017 by Contributors
+ * \file cuda_utils.cc
+ * \brief Common CUDA utilities.
+ */
+
+#include <mxnet/base.h>
+#include <mshadow/base.h>
+
+#include <algorithm>
+
+#include "cuda_utils.h"
+
+#if MXNET_USE_CUDA
+
+namespace mxnet {
+namespace common {
+namespace cuda {
+
+namespace {
+  bool IsPower2(size_t N) {
+    return ((N & (N - 1)) == 0) && N != 0;
+  }
+
+  size_t RoundToPower2(size_t N) {
+    size_t ret = 1;
+    size_t copyN = N;
+    while (N >= 2) {
+      ret *= 2;
+      N /= 2;
+    }
+    if (ret < copyN) {
+      ret *= 2;
+    }
+    return ret;
+  }
+}  // namespace
+
+int get_load_type(size_t N) {
+  using namespace mshadow;
+  if (N % 8 == 0) {
+    return kFloat64;
+  } else if (N % 4 == 0) {
+    return kFloat32;
+  } else if (N % 2 == 0) {
+    return kFloat16;
+  } else {
+    return kUint8;
+  }
+}
+
+int get_rows_per_block(size_t row_size, int num_threads_per_block) {
+  const int warp_size = 32;
+  CHECK(IsPower2(num_threads_per_block))
+    << "Number of threads in a block must be a power of 2 to use get_rows_per_block";
+  // Minimum number of read instructions a single thread should perform
+  const int read_instructions = 2;
+  const int desired_num_threads_per_row = (row_size + read_instructions - 1) / read_instructions;
+  int desired_num_warps_per_row = (desired_num_threads_per_row + warp_size - 1) / warp_size;
+  int actual_num_warps_per_row = std::min(desired_num_warps_per_row,
+                                          num_threads_per_block / warp_size);
+  // The actual number of warps per row needs to be a power of 2; round the
+  // clamped value (rounding the desired count would make the min above a no-op)
+  actual_num_warps_per_row = RoundToPower2(actual_num_warps_per_row);
+  return num_threads_per_block / (warp_size * actual_num_warps_per_row);
+}
+
+}  // namespace cuda
+}  // namespace common
+}  // namespace mxnet
+
+#endif  // MXNET_USE_CUDA
diff --git a/src/common/cuda_utils.h b/src/common/cuda_utils.h
index 3bb6b81a6de2..dd98a0e6c966 100644
--- a/src/common/cuda_utils.h
+++ b/src/common/cuda_utils.h
@@ -20,7 +20,7 @@
 /*!
  * Copyright (c) 2015 by Contributors
  * \file cuda_utils.h
- * \brief CUDA debugging utilities.
+ * \brief Common CUDA utilities.
  */
 #ifndef MXNET_COMMON_CUDA_UTILS_H_
 #define MXNET_COMMON_CUDA_UTILS_H_
@@ -327,6 +327,28 @@ class DeviceStore {
   bool restore_;
 };
 
+/*!
+ * \brief Get the largest datatype suitable for reading
+ *        the requested number of bytes.
+ *
+ * \param N Number of bytes to be read
+ * \return mshadow representation of the type that could
+ *         be used for reading
+ */
+int get_load_type(size_t N);
+
+/*!
+ * \brief Determine how many rows of a 2D matrix a block
+ *        of threads should handle, based on the row size and the number
+ *        of threads in a block.
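+ *        For example (illustrative numbers, not taken from the patch): with
+ *        num_threads_per_block = 512 and a row_size of 64 reads, a single
+ *        32-thread warp serves each row, so one block handles 16 rows.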
+ * \param row_size Size of the row expressed in the number of reads required to fully + * load it. For example, if the row has N elements, but each thread + * reads 2 elements with a single read, row_size should be N / 2. + * \param num_threads_per_block Number of threads in a block. + * \return the number of rows that should be handled by a single block. + */ +int get_rows_per_block(size_t row_size, int num_threads_per_block); + } // namespace cuda } // namespace common } // namespace mxnet @@ -542,7 +564,7 @@ static inline __device__ void atomicAdd(double *address, double val) { // Overload atomicAdd for half precision // Taken from: // https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh -#if defined(__CUDA_ARCH__) +#ifdef __CUDACC__ static inline __device__ void atomicAdd(mshadow::half::half_t *address, mshadow::half::half_t val) { unsigned int *address_as_ui = @@ -607,6 +629,28 @@ __device__ inline DType ldg(const DType* address) { return *address; #endif } -#endif + +template +__device__ inline T warp_reduce(T value, OP redfun) { + value = redfun(value, __shfl_down_sync(0xffffffff, value, 16)); + value = redfun(value, __shfl_down_sync(0xffffffff, value, 8)); + value = redfun(value, __shfl_down_sync(0xffffffff, value, 4)); + value = redfun(value, __shfl_down_sync(0xffffffff, value, 2)); + value = redfun(value, __shfl_down_sync(0xffffffff, value, 1)); + return value; +} + +template +__device__ inline mshadow::half::half_t warp_reduce(mshadow::half::half_t value, OP redfun) { + float v = static_cast(value); + v = redfun(v, __shfl_down_sync(0xffffffff, v, 16)); + v = redfun(v, __shfl_down_sync(0xffffffff, v, 8)); + v = redfun(v, __shfl_down_sync(0xffffffff, v, 4)); + v = redfun(v, __shfl_down_sync(0xffffffff, v, 2)); + v = redfun(v, __shfl_down_sync(0xffffffff, v, 1)); + return mshadow::half::half_t(v); +} + +#endif // __CUDACC__ #endif // MXNET_COMMON_CUDA_UTILS_H_ diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index 52788f697f11..ae98952bb7bf 100644 --- a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -414,6 +414,36 @@ inline int get_num_threads(const int N) { LOG(FATAL) << "Unknown type enum " << type; \ } +#define MXNET_LOAD_TYPE_SWITCH(type, DType, ...) \ + switch (type) { \ + case mshadow::kFloat32: \ + { \ + typedef float DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kFloat64: \ + { \ + typedef double DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kFloat16: \ + { \ + typedef mshadow::half::half_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kUint8: \ + { \ + typedef uint8_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + default: \ + LOG(FATAL) << "Invalid loading enum type " << type; \ + } + /*! * \brief assign the val to out according * to request in Kernel::Launch diff --git a/src/operator/nn/log_softmax.cc b/src/operator/nn/log_softmax.cc new file mode 100644 index 000000000000..73db44805c8e --- /dev/null +++ b/src/operator/nn/log_softmax.cc @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2017 by Contributors + * \file log_softmax.cc + * \brief CPU Implementation of log_softmax + */ +#include "./softmax-inl.h" +#include "../tensor/elemwise_unary_op.h" +#include "../tensor/elemwise_binary_op.h" +#include "../operator_common.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(log_softmax) +.add_alias("_npx_log_softmax") +.describe(R"code(Computes the log softmax of the input. +This is equivalent to computing softmax followed by log. + +Examples:: + + >>> x = mx.nd.array([1, 2, .1]) + >>> mx.nd.log_softmax(x).asnumpy() + array([-1.41702998, -0.41702995, -2.31702995], dtype=float32) + + >>> x = mx.nd.array( [[1, 2, .1],[.1, 2, 1]] ) + >>> mx.nd.log_softmax(x, axis=0).asnumpy() + array([[-0.34115392, -0.69314718, -1.24115396], + [-1.24115396, -0.69314718, -0.34115392]], dtype=float32) + + +)code") +.set_attr_parser(ParamParser) +.set_attr("FCompute", SoftmaxCompute) +.set_attr("FGradient", SoftmaxFGradient{"_backward_log_softmax"}) +.set_attr("FInferType", SoftmaxOpType) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FInferShape", ElemwiseShape<1, 1>) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs){ + return std::vector >{{0, 0}}; + }) +.add_argument("data", "NDArray-or-Symbol", "The input array.") +.add_arguments(SoftmaxParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_log_softmax) +.set_num_inputs(SoftmaxGradOpNumInputs) +.set_num_outputs(1) +.set_attr("FListInputNames", SoftmaxGradOpInputNames) +.set_attr("FInferShape", SoftmaxGradOpShape) +.set_attr("FInferType", SoftmaxGradOpType) +.set_attr("FInplaceOption", SoftmaxGradOpInplaceOption) +.add_argument("args", "NDArray-or-Symbol[]", "Positional input arguments") +.set_attr_parser(ParamParser) +.set_attr("FCompute", SoftmaxGradCompute); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/nn/log_softmax.cu b/src/operator/nn/log_softmax.cu new file mode 100644 index 000000000000..8bff2777f7b9 --- /dev/null +++ b/src/operator/nn/log_softmax.cu @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2017 by Contributors + * \file log_softmax.cu + * \brief GPU Implementation of log_softmax + */ +#include "./softmax-inl.h" +#include "../tensor/elemwise_unary_op.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(log_softmax) +.set_attr("FCompute", SoftmaxCompute); + +NNVM_REGISTER_OP(_backward_log_softmax) +.set_attr("FCompute", SoftmaxGradCompute); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/nn/softmax-inl.h b/src/operator/nn/softmax-inl.h index 2c82d839e5ed..73e0ccce4328 100644 --- a/src/operator/nn/softmax-inl.h +++ b/src/operator/nn/softmax-inl.h @@ -29,10 +29,12 @@ #include #include #include +#include #include "../mxnet_op.h" #include "../operator_common.h" #include "../tensor/broadcast_reduce_op.h" +#include "../../common/cuda_utils.h" namespace mxnet { namespace op { @@ -64,8 +66,9 @@ struct log_softmax_fwd { }; -template -inline void Softmax(Stream *s, DType *in, OType *out, +template +inline void Softmax(Stream *s, DType *in, OType *out, IType *length, Shape shape, int axis, const DType temperature) { index_t M = shape[axis]; index_t N = shape.Size()/M; @@ -74,100 +77,89 @@ inline void Softmax(Stream *s, DType *in, OType *out, sshape[axis] = 1; index_t sa = stride[axis]; - #pragma omp parallel for - for (index_t i = 0; i < N; ++i) { - index_t base = unravel_dot(i, sshape, stride); - - DType mmax = negate ? -in[base] : in[base]; - DType val; - for (index_t j = 1; j < M; ++j) { - val = negate ? -in[base + j*sa] : in[base + j*sa]; - if (mmax < val) mmax = val; - } - - AType sum = AType(0); - DType in_val; - // By default temperature is 1.0. - // Adding a branch here to save the CPU 'divide-by-1' computation at runtime - if (temperature == 1.0) { - for (index_t j = 0; j < M; ++j) { - in_val = negate ? -in[base + j*sa] : in[base + j*sa]; - sum += std::exp(in_val - mmax); - } - - for (index_t j = 0; j < M; ++j) { - in_val = negate ? -in[base + j*sa] : in[base + j*sa]; - out[base + j*sa] = OP::Map(in_val - mmax, sum); - } - } else { - for (index_t j = 0; j < M; ++j) { - in_val = negate ? -in[base + j*sa] : in[base + j*sa]; - sum += std::exp((in_val - mmax)/temperature); - } + if (length == nullptr) { + #pragma omp parallel for + for (index_t i = 0; i < N; ++i) { + index_t base = unravel_dot(i, sshape, stride); - for (index_t j = 0; j < M; ++j) { - in_val = negate ? -in[base + j*sa] : in[base + j*sa]; - out[base + j*sa] = OP::Map((in_val - mmax)/temperature, sum); + DType mmax = negate ? -in[base] : in[base]; + DType val; + for (index_t j = 1; j < M; ++j) { + val = negate ? -in[base + j*sa] : in[base + j*sa]; + if (mmax < val) mmax = val; } - } - } -} - -template -inline void SoftmaxWithLength(Stream *s, DType *in, OType *out, IType *length, - Shape shape, int axis, const DType temperature) { - index_t M = shape[axis]; - index_t N = shape.Size()/M; - Shape stride = calc_stride(shape); - Shape sshape = shape; - sshape[axis] = 1; - index_t sa = stride[axis]; - #pragma omp parallel for - for (index_t i = 0; i < N; ++i) { - index_t len = static_cast(length[i]); - index_t base = unravel_dot(i, sshape, stride); + AType sum = AType(0); + DType in_val; + // By default temperature is 1.0. + // Adding a branch here to save the CPU 'divide-by-1' computation at runtime + if (temperature == 1.0) { + for (index_t j = 0; j < M; ++j) { + in_val = negate ? -in[base + j*sa] : in[base + j*sa]; + sum += std::exp(in_val - mmax); + } - DType mmax = negate ? -in[base] : in[base]; - DType val; - for (index_t j = 1; j < len; ++j) { - val = negate ? 
-in[base + j*sa] : in[base + j*sa]; - if (mmax < val) mmax = val; - } - for (index_t j = len; j < M; ++j) { - out[base + j*sa] = OType(0.0f); - } + for (index_t j = 0; j < M; ++j) { + in_val = negate ? -in[base + j*sa] : in[base + j*sa]; + out[base + j*sa] = OP::Map(in_val - mmax, sum); + } + } else { + for (index_t j = 0; j < M; ++j) { + in_val = negate ? -in[base + j*sa] : in[base + j*sa]; + sum += std::exp((in_val - mmax)/temperature); + } - AType sum = AType(0); - DType in_val; - // By default temperature is 1.0. - // Adding a branch here to save the CPU 'divide-by-1' computation at runtime - if (temperature == 1.0) { - for (index_t j = 0; j < len; ++j) { - in_val = negate ? -in[base + j*sa] : in[base + j*sa]; - sum += std::exp(in_val - mmax); + for (index_t j = 0; j < M; ++j) { + in_val = negate ? -in[base + j*sa] : in[base + j*sa]; + out[base + j*sa] = OP::Map((in_val - mmax)/temperature, sum); + } } - - for (index_t j = 0; j < len; ++j) { - in_val = negate ? -in[base + j*sa] : in[base + j*sa]; - out[base + j*sa] = OP::Map(in_val - mmax, sum); + } + } else { + #pragma omp parallel for + for (index_t i = 0; i < N; ++i) { + index_t len = static_cast(length[i]); + index_t base = unravel_dot(i, sshape, stride); + + DType mmax = negate ? -in[base] : in[base]; + DType val; + for (index_t j = 1; j < len; ++j) { + val = negate ? -in[base + j*sa] : in[base + j*sa]; + if (mmax < val) mmax = val; } - } else { - for (index_t j = 0; j < len; ++j) { - in_val = negate ? -in[base + j*sa] : in[base + j*sa]; - sum += std::exp((in_val - mmax)/temperature); + for (index_t j = len; j < M; ++j) { + out[base + j*sa] = OType(0.0f); } - for (index_t j = 0; j < len; ++j) { - in_val = negate ? -in[base + j*sa] : in[base + j*sa]; - out[base + j*sa] = OP::Map((in_val - mmax)/temperature, sum); + AType sum = AType(0); + DType in_val; + // By default temperature is 1.0. + // Adding a branch here to save the CPU 'divide-by-1' computation at runtime + if (temperature == 1.0) { + for (index_t j = 0; j < len; ++j) { + in_val = negate ? -in[base + j*sa] : in[base + j*sa]; + sum += std::exp(in_val - mmax); + } + + for (index_t j = 0; j < len; ++j) { + in_val = negate ? -in[base + j*sa] : in[base + j*sa]; + out[base + j*sa] = OP::Map(in_val - mmax, sum); + } + } else { + for (index_t j = 0; j < len; ++j) { + in_val = negate ? -in[base + j*sa] : in[base + j*sa]; + sum += std::exp((in_val - mmax)/temperature); + } + + for (index_t j = 0; j < len; ++j) { + in_val = negate ? 
-in[base + j*sa] : in[base + j*sa]; + out[base + j*sa] = OP::Map((in_val - mmax)/temperature, sum); + } } } } } - struct softmax_bwd { template MSHADOW_XINLINE static AType Map(DType ograd, DType out, AType sum) { @@ -188,12 +180,11 @@ struct log_softmax_bwd { } }; - template + typename AType, typename DType, typename OType, typename IType, int ndim> inline void SoftmaxGrad(Stream *s, OType *out, OType *ograd, - DType *igrad, Shape shape, int axis, - const DType temperature) { + DType *igrad, IType *length, Shape shape, + int axis, const DType temperature) { index_t M = shape[axis]; index_t N = shape.Size()/M; Shape stride = calc_stride(shape); @@ -201,76 +192,65 @@ inline void SoftmaxGrad(Stream *s, OType *out, OType *ograd, sshape[axis] = 1; index_t sa = stride[axis]; - #pragma omp parallel for - for (index_t i = 0; i < N; ++i) { - index_t base = unravel_dot(i, sshape, stride); - - AType sum = AType(0); - for (index_t j = 0; j < M; ++j) { - sum += OP1::Map(ograd[base + j*sa], out[base + j*sa]); - } + if (length != nullptr) { + #pragma omp parallel for + for (index_t i = 0; i < N; ++i) { + index_t base = unravel_dot(i, sshape, stride); + index_t len = static_cast(length[i]); - // By default temperature is 1.0. - // Adding a branch here to save the CPU 'divide-by-1' computation at runtime - DType final_result; - if (temperature == 1.0) { - for (index_t j = 0; j < M; ++j) { - final_result = negate ? - -OP2::Map(ograd[base + j*sa], out[base + j*sa], sum) : - OP2::Map(ograd[base + j*sa], out[base + j*sa], sum); - KERNEL_ASSIGN(igrad[base + j*sa], Req, final_result); - } - } else { - for (index_t j = 0; j < M; ++j) { - final_result = negate ? - -OP2::Map(ograd[base + j*sa], out[base + j*sa], sum) / temperature : - OP2::Map(ograd[base + j*sa], out[base + j*sa], sum) / temperature; - KERNEL_ASSIGN(igrad[base + j*sa], Req, final_result); + AType sum = AType(0); + for (index_t j = 0; j < len; ++j) { + sum += OP1::Map(ograd[base + j*sa], out[base + j*sa]); } - } - } -} - -template -inline void SoftmaxWithLengthGrad(Stream *s, OType *out, OType *ograd, - DType *igrad, IType *length, Shape shape, - int axis, const DType temperature) { - index_t M = shape[axis]; - index_t N = shape.Size()/M; - Shape stride = calc_stride(shape); - Shape sshape = shape; - sshape[axis] = 1; - index_t sa = stride[axis]; - - #pragma omp parallel for - for (index_t i = 0; i < N; ++i) { - index_t base = unravel_dot(i, sshape, stride); - index_t len = static_cast(length[i]); - AType sum = AType(0); - for (index_t j = 0; j < len; ++j) { - sum += OP1::Map(ograd[base + j*sa], out[base + j*sa]); + // By default temperature is 1.0. + // Adding a branch here to save the CPU 'divide-by-1' computation at runtime + DType final_result; + if (temperature == 1.0) { + for (index_t j = 0; j < M; ++j) { + final_result = negate ? + -OP2::Map(ograd[base + j*sa], out[base + j*sa], sum) : + OP2::Map(ograd[base + j*sa], out[base + j*sa], sum); + final_result = (j < len) ? final_result : DType(0.0f); + KERNEL_ASSIGN(igrad[base + j*sa], Req, final_result); + } + } else { + for (index_t j = 0; j < M; ++j) { + final_result = negate ? + -OP2::Map(ograd[base + j*sa], out[base + j*sa], sum) / temperature : + OP2::Map(ograd[base + j*sa], out[base + j*sa], sum) / temperature; + final_result = (j < len) ? final_result : DType(0.0f); + KERNEL_ASSIGN(igrad[base + j*sa], Req, final_result); + } + } } + } else { + #pragma omp parallel for + for (index_t i = 0; i < N; ++i) { + index_t base = unravel_dot(i, sshape, stride); - // By default temperature is 1.0. 
- // Adding a branch here to save the CPU 'divide-by-1' computation at runtime - DType final_result; - if (temperature == 1.0) { + AType sum = AType(0); for (index_t j = 0; j < M; ++j) { - final_result = negate ? - -OP2::Map(ograd[base + j*sa], out[base + j*sa], sum) : - OP2::Map(ograd[base + j*sa], out[base + j*sa], sum); - final_result = (j < len) ? final_result : DType(0.0f); - KERNEL_ASSIGN(igrad[base + j*sa], Req, final_result); + sum += OP1::Map(ograd[base + j*sa], out[base + j*sa]); } - } else { - for (index_t j = 0; j < M; ++j) { - final_result = negate ? - -OP2::Map(ograd[base + j*sa], out[base + j*sa], sum) / temperature : - OP2::Map(ograd[base + j*sa], out[base + j*sa], sum) / temperature; - final_result = (j < len) ? final_result : DType(0.0f); - KERNEL_ASSIGN(igrad[base + j*sa], Req, final_result); + + // By default temperature is 1.0. + // Adding a branch here to save the CPU 'divide-by-1' computation at runtime + DType final_result; + if (temperature == 1.0) { + for (index_t j = 0; j < M; ++j) { + final_result = negate ? + -OP2::Map(ograd[base + j*sa], out[base + j*sa], sum) : + OP2::Map(ograd[base + j*sa], out[base + j*sa], sum); + KERNEL_ASSIGN(igrad[base + j*sa], Req, final_result); + } + } else { + for (index_t j = 0; j < M; ++j) { + final_result = negate ? + -OP2::Map(ograd[base + j*sa], out[base + j*sa], sum) / temperature : + OP2::Map(ograd[base + j*sa], out[base + j*sa], sum) / temperature; + KERNEL_ASSIGN(igrad[base + j*sa], Req, final_result); + } } } } @@ -279,18 +259,19 @@ inline void SoftmaxWithLengthGrad(Stream *s, OType *out, OType *ograd, #ifdef __CUDACC__ template -__global__ void softmax_compute_kernel(DType *in, OType *out, index_t M, int axis, - Shape sshape, Shape stride, - const double temperature) { + typename DType, typename OType, typename IType> +__global__ void softmax_compute_kernel(DType *in, OType *out, IType *length, + index_t M, int axis, Shape sshape, + Shape stride, const double temperature) { const unsigned x_size = 1 << x_bits; __shared__ AType smem[x_size]; index_t sa = stride[axis]; index_t base = unravel_dot(blockIdx.x, sshape, stride); index_t x = threadIdx.x; + const index_t len = length == nullptr ? M : static_cast(length[blockIdx.x]); red::maximum::SetInitValue(smem[x]); - for (index_t i = x; i < M; i += x_size) { + for (index_t i = x; i < len; i += x_size) { smem[x] = ::max(smem[x], negate ? -in[base + i*sa] : in[base + i*sa]); } __syncthreads(); @@ -301,7 +282,7 @@ __global__ void softmax_compute_kernel(DType *in, OType *out, index_t M, int axi red::sum::SetInitValue(smem[x]); DType val; - for (index_t i = x; i < M; i += x_size) { + for (index_t i = x; i < len; i += x_size) { val = negate ? -in[base + i*sa]:in[base + i*sa]; smem[x] += static_cast(expf((val - smax) / static_cast(temperature))); } @@ -313,71 +294,110 @@ __global__ void softmax_compute_kernel(DType *in, OType *out, index_t M, int axi for (index_t i = x; i < M; i += x_size) { val = negate ? -in[base + i*sa] : in[base + i*sa]; - out[base + i*sa] = OP::Map((val - smax)/static_cast(temperature), ssum); + out[base + i*sa] = + (i < len) ? 
OType(OP::Map((val - smax)/static_cast(temperature), ssum)) : OType(0.0f); } } -template -inline void Softmax(Stream *s, DType *in, OType *out, - Shape shape, int axis, const double temperature) { - const int x_bits = 7; - const int x_size = 1 << x_bits; - index_t M = shape[axis]; - index_t N = shape.Size()/M; - Shape stride = calc_stride(shape); - Shape sshape = shape; - sshape[axis] = 1; - - softmax_compute_kernel - <<::GetStream(s)>>>( - in, out, M, axis, sshape, stride, temperature); - MSHADOW_CUDA_POST_KERNEL_CHECK(softmax_compute_kernel); -} +const int softmax_threads_per_block = 512; + +template +__global__ void softmax_stride1_compute_kernel(const DType *in, OType *out, IType *length, + const index_t M, const double temperature, + const int rows_per_block, const index_t total_rows) { + __shared__ AType scratch[softmax_threads_per_block]; + __shared__ LType persistent_storage[20 * 1024 / sizeof(LType)]; + const int warp_size = 32; + const int threads_per_row = softmax_threads_per_block / rows_per_block; + const int my_local_row = threadIdx.x / threads_per_row; + const int my_row = blockIdx.x * rows_per_block + my_local_row; + if (my_row >= total_rows) return; + const int my_id = threadIdx.x % threads_per_row; + const int entries_per_load = sizeof(LType)/sizeof(DType); + const index_t len = length == nullptr ? M : static_cast(length[my_row]); + // Due to usage of MSHADOW_TYPE_SWITCH macro we are generating + // kernels where sizeof(LType) may be less than sizeof(DType), + // resulting in entries_per_load being 0. + // This is not a valid combination and is being checked against + // in the launcher code. This switch here is just to silence + // the division by zero warning generated for such invalid cases. + const int row_length = entries_per_load > 0 ? M / entries_per_load : 0; + + const LType* in_aligned = reinterpret_cast(in); + size_t base = my_row * row_length; + + for (index_t i = my_id; i < row_length; i += threads_per_row) { + persistent_storage[my_local_row * row_length + i] = in_aligned[base + i]; + } + DType * row = reinterpret_cast(persistent_storage + my_local_row * row_length); + __syncthreads(); -template -__global__ void softmax_with_length_kernel(DType *in, OType *out, IType *length, - index_t M, int axis, Shape sshape, - Shape stride, const double temperature) { - const unsigned x_size = 1 << x_bits; - __shared__ AType smem[x_size]; - index_t sa = stride[axis]; - index_t base = unravel_dot(blockIdx.x, sshape, stride); - index_t x = threadIdx.x; - index_t len = static_cast(length[blockIdx.x]); + DType my_max_value; + red::maximum::SetInitValue(my_max_value); - red::maximum::SetInitValue(smem[x]); - for (index_t i = x; i < len; i += x_size) { - smem[x] = ::max(smem[x], negate ? -in[base + i*sa] : in[base + i*sa]); + for (index_t i = my_id; i < len; i += threads_per_row) { + my_max_value = ::max(my_max_value, negate ? 
-row[i] : row[i]); } + scratch[threadIdx.x] = my_max_value; __syncthreads(); - cuda::Reduce1D(smem); + for (int size = threads_per_row / 2; size >= warp_size; size /= 2) { + if (my_id < size) { + scratch[threadIdx.x] = ::max(scratch[threadIdx.x], scratch[threadIdx.x + size]); + } + __syncthreads(); + } + if (my_id < warp_size) { + AType my_value = warp_reduce(scratch[threadIdx.x], + [](AType x, AType y) { return ::max(x, y); }); + scratch[threadIdx.x] = my_value; + } __syncthreads(); - DType smax = smem[0]; + DType smax = scratch[threadIdx.x - threadIdx.x % threads_per_row]; __syncthreads(); - red::sum::SetInitValue(smem[x]); - DType val; - for (index_t i = x; i < len; i += x_size) { - val = negate ? -in[base + i*sa]:in[base + i*sa]; - smem[x] += static_cast(expf((val - smax) / static_cast(temperature))); + AType my_sum; + red::sum::SetInitValue(my_sum); + + for (index_t i = my_id; i < len; i += threads_per_row) { + const DType val = negate ? -row[i] : row[i]; + my_sum += static_cast(expf((val - smax) / static_cast(temperature))); } + scratch[threadIdx.x] = my_sum; __syncthreads(); - cuda::Reduce1D(smem); + for (int size = threads_per_row / 2; size >= warp_size; size /= 2) { + if (my_id < size) { + scratch[threadIdx.x] += scratch[threadIdx.x + size]; + } + __syncthreads(); + } + if (my_id < warp_size) { + AType my_value = warp_reduce(scratch[threadIdx.x], + [](AType x, AType y) { return x + y;}); + scratch[threadIdx.x] = my_value; + } __syncthreads(); - AType ssum = smem[0]; + + AType ssum = scratch[threadIdx.x - threadIdx.x % threads_per_row]; __syncthreads(); - for (index_t i = x; i < M; i += x_size) { - val = negate ? -in[base + i*sa] : in[base + i*sa]; - out[base + i*sa] = - (i < len) ? OType(OP::Map((val - smax)/static_cast(temperature), ssum)) : OType(0.0f); + for (index_t i = my_id; i < M; i += threads_per_row) { + const DType val = negate ? -row[i] : row[i]; + row[i] = (i < len) ? 
DType(OP::Map((val - smax)/static_cast(temperature), ssum)) : + DType(0.0f); + } + __syncthreads(); + + LType* out_aligned = reinterpret_cast(out); + + for (index_t i = my_id; i < row_length; i += threads_per_row) { + out_aligned[base + i] = persistent_storage[my_local_row * row_length + i]; } } template -inline void SoftmaxWithLength(Stream *s, DType *in, OType *out, IType *length, +inline void Softmax(Stream *s, DType *in, OType *out, IType *length, Shape shape, int axis, const double temperature) { const int x_bits = 7; const int x_size = 1 << x_bits; @@ -387,76 +407,124 @@ inline void SoftmaxWithLength(Stream *s, DType *in, OType *out, IType *leng Shape sshape = shape; sshape[axis] = 1; - softmax_with_length_kernel - <<::GetStream(s)>>>( - in, out, length, M, axis, sshape, stride, temperature); - MSHADOW_CUDA_POST_KERNEL_CHECK(softmax_compute_kernel); + const size_t DSize = sizeof(DType); + // Using 20 kB of shared memory for persistent storage in the optimized case + const size_t max_opt_M = 20 * 1024 / DSize; + if (stride[axis] == 1 && + static_cast(M) <= max_opt_M && + std::is_same::value) { + int ltype = mxnet::common::cuda::get_load_type(M * sizeof(DType)); + MXNET_LOAD_TYPE_SWITCH(ltype, LType, { + int rows_per_block = mxnet::common::cuda::get_rows_per_block(M * + sizeof(DType) / sizeof(LType), + softmax_threads_per_block); + int nblocks = (N + rows_per_block - 1) / rows_per_block; + CHECK_LE(sizeof(DType), sizeof(LType)); + softmax_stride1_compute_kernel + <<::GetStream(s)>>>( + in, out, length, M, temperature, rows_per_block, N); + }); + MSHADOW_CUDA_POST_KERNEL_CHECK(softmax_stride1_compute_kernel); + } else { + softmax_compute_kernel + <<::GetStream(s)>>>( + in, out, length, M, axis, sshape, stride, temperature); + MSHADOW_CUDA_POST_KERNEL_CHECK(softmax_compute_kernel); + } } +template +__global__ void softmax_stride1_grad_kernel(const OType *out, const OType *ograd, + DType *igrad, const IType *length, + const index_t M, + const double temperature, + const int rows_per_block, + const index_t total_rows) { + __shared__ AType scratch[softmax_threads_per_block]; + __shared__ LType persistent_storage[20 * 1024 / sizeof(LType)]; + const int warp_size = 32; + const int threads_per_row = softmax_threads_per_block / rows_per_block; + const int my_local_row = threadIdx.x / threads_per_row; + const int my_row = blockIdx.x * rows_per_block + my_local_row; + if (my_row >= total_rows) return; + const int my_id = threadIdx.x % threads_per_row; + const int entries_per_load = sizeof(LType)/sizeof(DType); + const index_t len = length == nullptr ? M : static_cast(length[my_row]); + // Due to usage of MSHADOW_TYPE_SWITCH macro we are generating + // kernels where sizeof(LType) may be less than sizeof(DType), + // resulting in entries_per_load being 0. + // This is not a valid combination and is being checked against + // in the launcher code. This switch here is just to silence + // the division by zero warning generated for such invalid cases. + const int row_length = entries_per_load > 0 ? 
M / entries_per_load : 0; + + const LType* out_aligned = reinterpret_cast(out); + const LType* ograd_aligned = reinterpret_cast(ograd); + size_t base = my_row * row_length; + + for (index_t i = my_id; i < row_length; i += threads_per_row) { + persistent_storage[my_local_row * row_length * 2 + i] = out_aligned[base + i]; + persistent_storage[my_local_row * row_length * 2 + row_length + i] = ograd_aligned[base + i]; + } + DType * row = reinterpret_cast(persistent_storage + my_local_row * row_length * 2); + __syncthreads(); -template -__global__ void softmax_gradient_kernel(OType *out, OType *ograd, DType *igrad, - index_t M, int axis, Shape sshape, - Shape stride, const double temperature) { - const unsigned x_size = 1 << x_bits; - __shared__ AType smem[x_size]; - index_t sa = stride[axis]; - index_t base = unravel_dot(blockIdx.x, sshape, stride); - index_t x = threadIdx.x; + AType my_sum_value; + red::sum::SetInitValue(my_sum_value); - red::sum::SetInitValue(smem[x]); - for (index_t i = x; i < M; i += x_size) { - smem[x] += OP1::Map(ograd[base + i*sa], out[base + i*sa]); + for (index_t i = my_id; i < len; i += threads_per_row) { + my_sum_value += OP1::Map(row[i + M], row[i]); } + scratch[threadIdx.x] = my_sum_value; __syncthreads(); - cuda::Reduce1D(smem); + for (int size = threads_per_row / 2; size >= warp_size; size /= 2) { + if (my_id < size) { + scratch[threadIdx.x] = scratch[threadIdx.x] + scratch[threadIdx.x + size]; + } + __syncthreads(); + } + if (my_id < warp_size) { + AType my_value = warp_reduce(scratch[threadIdx.x], + [](AType x, AType y) { return x + y; }); + scratch[threadIdx.x] = my_value; + } __syncthreads(); - AType ssum = smem[0]; + AType ssum = scratch[threadIdx.x - threadIdx.x % threads_per_row]; __syncthreads(); - DType final_result; - for (index_t i = x; i < M; i += x_size) { - final_result = + for (index_t i = my_id; i < M; i += threads_per_row) { + const DType val = negate ? - -OP2::Map(ograd[base + i*sa], out[base + i*sa], ssum) : - OP2::Map(ograd[base + i*sa], out[base + i*sa], ssum); - KERNEL_ASSIGN(igrad[base + i*sa], Req, final_result / static_cast(temperature)); + -OP2::Map(row[i + M], row[i], ssum) : + OP2::Map(row[i + M], row[i], ssum); + row[i] = (i < len) ? 
DType(val / static_cast(temperature)) : + DType(0.0f); + if (Req == kAddTo) { + row[i] += igrad[my_row * M + i]; + } } -} - + __syncthreads(); -template -inline void SoftmaxGrad(Stream *s, OType *out, OType *ograd, - DType *igrad, Shape shape, int axis, - const double temperature) { - const int x_bits = 7; - const int x_size = 1 << x_bits; - index_t M = shape[axis]; - index_t N = shape.Size()/M; - Shape stride = calc_stride(shape); - Shape sshape = shape; - sshape[axis] = 1; + LType* igrad_aligned = reinterpret_cast(igrad); - softmax_gradient_kernel - <<::GetStream(s)>>>( - out, ograd, igrad, M, axis, sshape, stride, temperature); - MSHADOW_CUDA_POST_KERNEL_CHECK(softmax_gradient_kernel); + for (index_t i = my_id; i < row_length; i += threads_per_row) { + igrad_aligned[base + i] = persistent_storage[my_local_row * row_length * 2 + i]; + } } template -__global__ void softmax_with_length_grad_kernel(OType *out, OType *ograd, DType *igrad, - IType *length, index_t M, int axis, - Shape sshape, Shape stride, - const double temperature) { +__global__ void softmax_grad_kernel(OType *out, OType *ograd, DType *igrad, + const IType *length, index_t M, int axis, + Shape sshape, Shape stride, + const double temperature) { const unsigned x_size = 1 << x_bits; __shared__ AType smem[x_size]; index_t sa = stride[axis]; index_t base = unravel_dot(blockIdx.x, sshape, stride); index_t x = threadIdx.x; - index_t len = static_cast(length[blockIdx.x]); + index_t len = length != nullptr ? static_cast(length[blockIdx.x]) : M; red::sum::SetInitValue(smem[x]); for (index_t i = x; i < len; i += x_size) { @@ -479,12 +547,11 @@ __global__ void softmax_with_length_grad_kernel(OType *out, OType *ograd, DType } } - template -inline void SoftmaxWithLengthGrad(Stream *s, OType *out, OType *ograd, - DType *igrad, IType *length, Shape shape, int axis, - const double temperature) { +inline void SoftmaxGrad(Stream *s, OType *out, OType *ograd, + DType *igrad, IType *length, Shape shape, int axis, + const double temperature) { const int x_bits = 7; const int x_size = 1 << x_bits; index_t M = shape[axis]; @@ -493,10 +560,32 @@ inline void SoftmaxWithLengthGrad(Stream *s, OType *out, OType *ograd, Shape sshape = shape; sshape[axis] = 1; - softmax_with_length_grad_kernel - <<::GetStream(s)>>>( - out, ograd, igrad, length, M, axis, sshape, stride, temperature); - MSHADOW_CUDA_POST_KERNEL_CHECK(softmax_with_length_grad_kernel); + const size_t DSize = sizeof(DType); + // Using 20 kB of shared memory for persistent storage in the optimized case + // Need to store both out and ograd, so M can be only half compared to + // forward pass. 
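+  // (For float data, DSize = 4: the forward pass allows rows of up to
+  // 20 * 1024 / 4 = 5120 elements, while here the limit is 2560.)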
+ const size_t max_opt_M = 20 * 1024 / DSize / 2; + if (stride[axis] == 1 && + static_cast(M) <= max_opt_M && + std::is_same::value) { + int ltype = mxnet::common::cuda::get_load_type(M * sizeof(DType)); + MXNET_LOAD_TYPE_SWITCH(ltype, LType, { + int rows_per_block = mxnet::common::cuda::get_rows_per_block(M * + sizeof(DType) / sizeof(LType), + softmax_threads_per_block); + int nblocks = (N + rows_per_block - 1) / rows_per_block; + CHECK_LE(sizeof(DType), sizeof(LType)); + softmax_stride1_grad_kernel + <<::GetStream(s)>>>( + out, ograd, igrad, length, M, temperature, rows_per_block, N); + }); + MSHADOW_CUDA_POST_KERNEL_CHECK(softmax_stride1_grad_kernel); + } else { + softmax_grad_kernel + <<::GetStream(s)>>>( + out, ograd, igrad, length, M, axis, sshape, stride, temperature); + MSHADOW_CUDA_POST_KERNEL_CHECK(softmax_grad_kernel); + } } #endif @@ -564,8 +653,9 @@ static inline bool SoftmaxOpShape(const nnvm::NodeAttrs& attrs, mxnet::TShape& dshape = in_attrs->at(0); mxnet::TShape tmp_shape((dshape.ndim() == 1) ? 1U : dshape.ndim() - 1, 1); int j = 0; + int axis = param.axis != -1 ? param.axis : dshape.ndim() - 1; for (int i = 0; i < dshape.ndim(); ++i) { - if (i != param.axis) { + if (i != axis) { tmp_shape[j++] = dshape[i]; } } @@ -694,47 +784,43 @@ void SoftmaxCompute(const nnvm::NodeAttrs& attrs, MXNET_REAL_ACC_TYPE_SWITCH(inputs[0].type_flag_, DType, AType, { MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, OType, { - if (!param.use_length.value()) { - if (safe_acc) { - if (shape.ndim() == 2) { - Softmax( - ctx.get_stream(), inputs[0].dptr(), - outputs[0].dptr(), shape.get<2>(), axis, - static_cast(temperature)); - } else { - Softmax( - ctx.get_stream(), inputs[0].dptr(), - outputs[0].dptr(), shape.get<3>(), axis, - static_cast(temperature)); - } - } else { - if (shape.ndim() == 2) { - Softmax( - ctx.get_stream(), inputs[0].dptr(), - outputs[0].dptr(), shape.get<2>(), axis, - static_cast(temperature)); - } else { - Softmax( - ctx.get_stream(), inputs[0].dptr(), - outputs[0].dptr(), shape.get<3>(), axis, - static_cast(temperature)); + int type = kInt32; + if (param.use_length.value()) { + CHECK(inputs.size() > 1) + << "Mask needs to be provided when using softmax with use_length=True."; + type = inputs[1].type_flag_; + } + MXNET_INT_TYPE_SWITCH(type, IType, { + IType* mask_ptr = nullptr; + if (param.use_length.value()) { + mask_ptr = inputs[1].dptr(); } - } - } else { - MXNET_INT_TYPE_SWITCH(inputs[1].type_flag_, IType, { - if (shape.ndim() == 2) { - SoftmaxWithLength( - ctx.get_stream(), inputs[0].dptr(), - outputs[0].dptr(), inputs[1].dptr(), - shape.get<2>(), axis, static_cast(temperature)); + if (safe_acc) { + if (shape.ndim() == 2) { + Softmax( + ctx.get_stream(), inputs[0].dptr(), + outputs[0].dptr(), mask_ptr, shape.get<2>(), + axis, static_cast(temperature)); + } else { + Softmax( + ctx.get_stream(), inputs[0].dptr(), + outputs[0].dptr(), mask_ptr, shape.get<3>(), + axis, static_cast(temperature)); + } } else { - SoftmaxWithLength( - ctx.get_stream(), inputs[0].dptr(), - outputs[0].dptr(), inputs[1].dptr(), - shape.get<3>(), axis, static_cast(temperature)); + if (shape.ndim() == 2) { + Softmax( + ctx.get_stream(), inputs[0].dptr(), + outputs[0].dptr(), mask_ptr, shape.get<2>(), + axis, static_cast(temperature)); + } else { + Softmax( + ctx.get_stream(), inputs[0].dptr(), + outputs[0].dptr(), mask_ptr, shape.get<3>(), + axis, static_cast(temperature)); + } } - }); - } + }); }); }); } @@ -747,7 +833,16 @@ void SoftmaxGradCompute(const nnvm::NodeAttrs& attrs, const std::vector& req, 
const std::vector& outputs) { using namespace mxnet_op; + if (softmax_use_length(attrs)) { + MXNET_INT_TYPE_SWITCH(inputs[2].type_flag_, IType, { + if (req[1] != kNullOp) { + mxnet_op::Kernel::Launch( + ctx.get_stream(), outputs[1].Size(), outputs[1].dptr()); + } + }); + } if (req[0] == kNullOp) return; + const int itype = softmax_use_length(attrs) ? inputs[2].type_flag_ : kInt32; const SoftmaxParam& param = nnvm::get(attrs.parsed); int axis = CheckAxis(param.axis, inputs[0].ndim()); const double temperature = param.temperature.has_value() ? @@ -761,51 +856,41 @@ void SoftmaxGradCompute(const nnvm::NodeAttrs& attrs, MXNET_REAL_ACC_TYPE_SWITCH(inputs[0].type_flag_, OType, AType, { MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - if (!softmax_use_length(attrs)) { + MXNET_INT_TYPE_SWITCH(itype, IType, { + IType * length_ptr = nullptr; + if (softmax_use_length(attrs)) { + length_ptr = inputs[2].dptr(); + } if (safe_acc) { if (shape.ndim() == 2) { SoftmaxGrad( ctx.get_stream(), inputs[out_idx].dptr(), inputs[0].dptr(), outputs[0].dptr(), - shape.get<2>(), axis, static_cast(temperature)); + length_ptr, shape.get<2>(), axis, + static_cast(temperature)); } else { SoftmaxGrad( ctx.get_stream(), inputs[out_idx].dptr(), inputs[0].dptr(), outputs[0].dptr(), - shape.get<3>(), axis, static_cast(temperature)); + length_ptr, shape.get<3>(), axis, + static_cast(temperature)); } } else { if (shape.ndim() == 2) { SoftmaxGrad( ctx.get_stream(), inputs[out_idx].dptr(), inputs[0].dptr(), outputs[0].dptr(), - shape.get<2>(), axis, static_cast(temperature)); + length_ptr, shape.get<2>(), axis, + static_cast(temperature)); } else { SoftmaxGrad( ctx.get_stream(), inputs[out_idx].dptr(), inputs[0].dptr(), outputs[0].dptr(), - shape.get<3>(), axis, static_cast(temperature)); + length_ptr, shape.get<3>(), axis, + static_cast(temperature)); } } - } else { - MXNET_INT_TYPE_SWITCH(inputs[2].type_flag_, IType, { - if (req[1] != kNullOp) { - mxnet_op::Kernel::Launch( - ctx.get_stream(), outputs[1].Size(), outputs[1].dptr()); - } - if (shape.ndim() == 2) { - SoftmaxWithLengthGrad( - ctx.get_stream(), inputs[out_idx].dptr(), - inputs[0].dptr(), outputs[0].dptr(), - inputs[2].dptr(), shape.get<2>(), axis, static_cast(temperature)); - } else { - SoftmaxWithLengthGrad( - ctx.get_stream(), inputs[out_idx].dptr(), - inputs[0].dptr(), outputs[0].dptr(), - inputs[2].dptr(), shape.get<3>(), axis, static_cast(temperature)); - } - }); - } + }); }); }); }); diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc index b9a31ccc9313..ce19dda23d87 100644 --- a/src/operator/nn/softmax.cc +++ b/src/operator/nn/softmax.cc @@ -150,104 +150,5 @@ NNVM_REGISTER_OP(_backward_softmax) .set_attr("FCompute", SoftmaxGradCompute); -NNVM_REGISTER_OP(softmin) -.describe(R"code(Applies the softmin function. - -The resulting array contains elements in the range (0,1) and the elements along the given axis sum -up to 1. - -.. math:: - softmin(\mathbf{z/t})_j = \frac{e^{-z_j/t}}{\sum_{k=1}^K e^{-z_k/t}} - -for :math:`j = 1, ..., K` - -t is the temperature parameter in softmax function. By default, t equals 1.0 - -Example:: - - x = [[ 1. 2. 3.] - [ 3. 2. 
1.]] - - softmin(x,axis=0) = [[ 0.88079703, 0.5, 0.11920292], - [ 0.11920292, 0.5, 0.88079703]] - - softmin(x,axis=1) = [[ 0.66524094, 0.24472848, 0.09003057], - [ 0.09003057, 0.24472848, 0.66524094]] - -)code" ADD_FILELINE) -.set_attr_parser(ParamParser) -.set_attr("FListOutputNames", - [](const NodeAttrs& attrs) { - return std::vector{"output"}; -}) -.set_attr("FCompute", SoftmaxCompute) -.set_attr("FGradient", SoftmaxFGradient{"_backward_softmin"}) -.set_attr("FInferType", SoftmaxOpType) -.set_num_inputs(1) -.set_num_outputs(1) -.set_attr("FInferShape", ElemwiseShape<1, 1>) -.set_attr("FInplaceOption", - [](const NodeAttrs& attrs){ - return std::vector >{{0, 0}}; - }) -.add_argument("data", "NDArray-or-Symbol", "The input array.") -.add_arguments(SoftmaxParam::__FIELDS__()); - -NNVM_REGISTER_OP(_backward_softmin) -.set_num_inputs(SoftmaxGradOpNumInputs) -.set_num_outputs(1) -.set_attr("FListInputNames", SoftmaxGradOpInputNames) -.set_attr("FInferShape", SoftmaxGradOpShape) -.set_attr("FInferType", SoftmaxGradOpType) -.set_attr("FInplaceOption", SoftmaxGradOpInplaceOption) -.add_argument("args", "NDArray-or-Symbol[]", "Positional input arguments") -.set_attr_parser(ParamParser) -.set_attr("FCompute", SoftmaxGradCompute); - -NNVM_REGISTER_OP(log_softmax) -.add_alias("_npx_log_softmax") -.describe(R"code(Computes the log softmax of the input. -This is equivalent to computing softmax followed by log. - -Examples:: - - >>> x = mx.nd.array([1, 2, .1]) - >>> mx.nd.log_softmax(x).asnumpy() - array([-1.41702998, -0.41702995, -2.31702995], dtype=float32) - - >>> x = mx.nd.array( [[1, 2, .1],[.1, 2, 1]] ) - >>> mx.nd.log_softmax(x, axis=0).asnumpy() - array([[-0.34115392, -0.69314718, -1.24115396], - [-1.24115396, -0.69314718, -0.34115392]], dtype=float32) - - -)code") -.set_attr_parser(ParamParser) -.set_attr("FCompute", SoftmaxCompute) -.set_attr("FGradient", SoftmaxFGradient{"_backward_log_softmax"}) -.set_attr("FInferType", SoftmaxOpType) -.set_num_inputs(1) -.set_num_outputs(1) -.set_attr("FInferShape", ElemwiseShape<1, 1>) -.set_attr("FInplaceOption", - [](const NodeAttrs& attrs){ - return std::vector >{{0, 0}}; - }) -.add_argument("data", "NDArray-or-Symbol", "The input array.") -.add_arguments(SoftmaxParam::__FIELDS__()); - -NNVM_REGISTER_OP(_backward_log_softmax) -.set_num_inputs(SoftmaxGradOpNumInputs) -.set_num_outputs(1) -.set_attr("FListInputNames", SoftmaxGradOpInputNames) -.set_attr("FInferShape", SoftmaxGradOpShape) -.set_attr("FInferType", SoftmaxGradOpType) -.set_attr("FInplaceOption", SoftmaxGradOpInplaceOption) -.add_argument("args", "NDArray-or-Symbol[]", "Positional input arguments") -.set_attr_parser(ParamParser) -.set_attr("FCompute", SoftmaxGradCompute); - } // namespace op } // namespace mxnet diff --git a/src/operator/nn/softmax.cu b/src/operator/nn/softmax.cu index 254e726d5e26..d5762cf0217f 100644 --- a/src/operator/nn/softmax.cu +++ b/src/operator/nn/softmax.cu @@ -19,8 +19,8 @@ /*! 
 * Copyright (c) 2017 by Contributors
- * \file softmax.cc
- * \brief CPU Implementation of softmax
+ * \file softmax.cu
+ * \brief GPU Implementation of softmax
  */
 #include "./softmax-inl.h"
 #include "../tensor/elemwise_unary_op.h"
@@ -35,19 +35,5 @@ NNVM_REGISTER_OP(_backward_softmax)
 .set_attr("FCompute", SoftmaxGradCompute);
 
-NNVM_REGISTER_OP(softmin)
-.set_attr("FCompute", SoftmaxCompute);
-
-NNVM_REGISTER_OP(_backward_softmin)
-.set_attr("FCompute", SoftmaxGradCompute);
-
-NNVM_REGISTER_OP(log_softmax)
-.set_attr("FCompute", SoftmaxCompute);
-
-NNVM_REGISTER_OP(_backward_log_softmax)
-.set_attr("FCompute", SoftmaxGradCompute);
-
 } // namespace op
 } // namespace mxnet
diff --git a/src/operator/nn/softmin.cc b/src/operator/nn/softmin.cc
new file mode 100644
index 000000000000..0522c8c9b120
--- /dev/null
+++ b/src/operator/nn/softmin.cc
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2017 by Contributors
+ * \file softmin.cc
+ * \brief CPU Implementation of softmin
+ */
+#include "./softmax-inl.h"
+#include "../tensor/elemwise_unary_op.h"
+#include "../tensor/elemwise_binary_op.h"
+#include "../operator_common.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(softmin)
+.describe(R"code(Applies the softmin function.
+
+The resulting array contains elements in the range (0,1) and the elements along the
+given axis sum up to 1.
+
+.. math::
+   softmin(\mathbf{z}/t)_j = \frac{e^{-z_j/t}}{\sum_{k=1}^K e^{-z_k/t}}
+
+for :math:`j = 1, ..., K`
+
+t is the temperature parameter in the softmin function. By default, t equals 1.0.
+
+Example::
+
+  x = [[ 1. 2. 3.]
+       [ 3. 2.
1.]] + + softmin(x,axis=0) = [[ 0.88079703, 0.5, 0.11920292], + [ 0.11920292, 0.5, 0.88079703]] + + softmin(x,axis=1) = [[ 0.66524094, 0.24472848, 0.09003057], + [ 0.09003057, 0.24472848, 0.66524094]] + +)code" ADD_FILELINE) +.set_attr_parser(ParamParser) +.set_attr("FListOutputNames", + [](const NodeAttrs& attrs) { + return std::vector{"output"}; +}) +.set_attr("FCompute", SoftmaxCompute) +.set_attr("FGradient", SoftmaxFGradient{"_backward_softmin"}) +.set_attr("FInferType", SoftmaxOpType) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FInferShape", ElemwiseShape<1, 1>) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs){ + return std::vector >{{0, 0}}; + }) +.add_argument("data", "NDArray-or-Symbol", "The input array.") +.add_arguments(SoftmaxParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_softmin) +.set_num_inputs(SoftmaxGradOpNumInputs) +.set_num_outputs(1) +.set_attr("FListInputNames", SoftmaxGradOpInputNames) +.set_attr("FInferShape", SoftmaxGradOpShape) +.set_attr("FInferType", SoftmaxGradOpType) +.set_attr("FInplaceOption", SoftmaxGradOpInplaceOption) +.add_argument("args", "NDArray-or-Symbol[]", "Positional input arguments") +.set_attr_parser(ParamParser) +.set_attr("FCompute", SoftmaxGradCompute); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/nn/softmin.cu b/src/operator/nn/softmin.cu new file mode 100644 index 000000000000..d00d0bdad231 --- /dev/null +++ b/src/operator/nn/softmin.cu @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2017 by Contributors + * \file softmin.cu + * \brief GPU Implementation of softmin + */ +#include "./softmax-inl.h" +#include "../tensor/elemwise_unary_op.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(softmin) +.set_attr("FCompute", SoftmaxCompute); + +NNVM_REGISTER_OP(_backward_softmin) +.set_attr("FCompute", SoftmaxGradCompute); + +} // namespace op +} // namespace mxnet
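To see how the two new helpers in src/common/cuda_utils.cc compose, consider a minimal host-side sketch. This is a hypothetical harness, not part of the patch; it assumes a CUDA-enabled MXNet build (so MXNET_USE_CUDA is defined) and that the include path below resolves within the source tree.

#include <cstddef>
#include <cstdio>

#include "common/cuda_utils.h"  // assumed path relative to MXNet's src/ directory

int main() {
  // A float row of 128 elements spans 512 bytes. Since 512 % 8 == 0,
  // get_load_type picks the widest 8-byte type (mshadow::kFloat64),
  // so every load instruction moves two floats at once.
  const size_t row_bytes = 128 * sizeof(float);
  const int ltype = mxnet::common::cuda::get_load_type(row_bytes);

  // Expressed in 8-byte reads the row takes 64 loads; with 512 threads per
  // block, one 32-thread warp covers each row and the block handles
  // 512 / 32 = 16 rows.
  const int rows = mxnet::common::cuda::get_rows_per_block(row_bytes / 8, 512);

  printf("load type enum: %d, rows per block: %d\n", ltype, rows);
  return 0;
}

Rounding the per-row warp count to a power of 2 keeps the tree reductions in the stride-1 kernels simple: every halving step of the shared-memory reduction stays warp-aligned until warp_reduce takes over with shuffle instructions.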