diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake
index 93cbda213158..76984367dc0a 100644
--- a/cmake/Cuda.cmake
+++ b/cmake/Cuda.cmake
@@ -189,7 +189,7 @@ function(detect_cuDNN)
             DOC "Path to cuDNN include directory." )
 
   get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)
-  find_library(CUDNN_LIBRARY NAMES libcudnn.so # libcudnn_static.a
+  find_library(CUDNN_LIBRARY NAMES libcudnn.so cudnn.lib # libcudnn_static.a
                              PATHS ${CUDNN_ROOT} $ENV{CUDNN_ROOT} ${CUDNN_INCLUDE} ${__libpath_hist}
                              DOC "Path to cuDNN library.")
 
diff --git a/guide/basic.cpp b/guide/basic.cpp
index 3ea3c776e9c8..6ad74ec3ef06 100644
--- a/guide/basic.cpp
+++ b/guide/basic.cpp
@@ -12,7 +12,7 @@ int main(void) {
   float data[20];
   // create a 2 x 5 x 2 tensor, from existing space
   Tensor<cpu, 3> ts(data, Shape3(2,5,2));
-    // take first subscript of the tensor
+  // take first subscript of the tensor
   Tensor<cpu, 2> mat = ts[0];
   // Tensor object is only a handle, assignment means they have same data content
   // we can specify content type of a Tensor, if not specified, it is float bydefault
@@ -69,6 +69,16 @@ int main(void) {
   }
   printf("\n");
 
+  TensorContainer<cpu, 2> recover_lhs(Shape2(2, 3)), small_mat(Shape2(2, 3));
+  small_mat = -100.0f;
+  recover_lhs = mat_fill_row_element(small_mat, choosed, index);
+  for (index_t i = 0; i < recover_lhs.size(0); ++i) {
+    for (index_t j = 0; j < recover_lhs.size(1); ++j) {
+      printf("%.2f ", recover_lhs[i][j] - lhs[i][j]);
+    }
+  }
+  printf("\n");
+
   rhs = one_hot_encode(index, 3);
 
   for (index_t i = 0; i < lhs.size(0); ++i) {
diff --git a/mshadow/random.h b/mshadow/random.h
index 2b28305017c4..f4ab2384cde1 100644
--- a/mshadow/random.h
+++ b/mshadow/random.h
@@ -317,22 +317,30 @@ class Random<gpu, DType> {
   inline void GenGaussian(float *dptr, size_t size, float mu, float sigma) {
     curandStatus_t status;
     status = curandGenerateNormal(gen_, dptr, size, mu, sigma);
-    CHECK_EQ(status, CURAND_STATUS_SUCCESS) << "CURAND Gen Uniform failed";
+    CHECK_EQ(status, CURAND_STATUS_SUCCESS) << "CURAND Gen Normal float failed."
+                                            << " size = " << size
+                                            << ",mu = " << mu
+                                            << ",sigma = " << sigma;
   }
   inline void GenGaussian(double *dptr, size_t size, double mu, double sigma) {
     curandStatus_t status;
     status = curandGenerateNormalDouble(gen_, dptr, size, mu, sigma);
-    CHECK_EQ(status, CURAND_STATUS_SUCCESS) << "CURAND Gen Uniform failed";
+    CHECK_EQ(status, CURAND_STATUS_SUCCESS) << "CURAND Gen Normal double failed."
+                                            << " size = " << size
+                                            << ",mu = " << mu
+                                            << ",sigma = " << sigma;
   }
   inline void GenUniform(float *dptr, size_t size) {
     curandStatus_t status;
     status = curandGenerateUniform(gen_, dptr, size);
-    CHECK_EQ(status, CURAND_STATUS_SUCCESS) << "CURAND Gen Uniform failed";
+    CHECK_EQ(status, CURAND_STATUS_SUCCESS) << "CURAND Gen Uniform float failed."
+                                            << " size = " << size;
   }
   inline void GenUniform(double *dptr, size_t size) {
     curandStatus_t status;
     status = curandGenerateUniformDouble(gen_, dptr, size);
-    CHECK_EQ(status, CURAND_STATUS_SUCCESS) << "CURAND Gen Uniform failed";
+    CHECK_EQ(status, CURAND_STATUS_SUCCESS) << "CURAND Gen Uniform double failed."
+                                            << " size = " << size;
   }
   /*! \brief random numbeer generator */
   curandGenerator_t gen_;
@@ -361,7 +369,9 @@ template<typename DType>
 template<int dim>
 inline void Random<gpu, DType>::SampleGaussian(
     Tensor<gpu, dim, DType> *dst, DType mu, DType sigma) {
-  if (dst->CheckContiguous()) {
+  // cuRAND can only generate normally distributed values for an even number of elements,
+  // so use the direct path only when the total size is even; otherwise use gaussian() below.
+  if (dst->CheckContiguous() && (dst->shape_.Size() % 2 == 0)) {
     this->GenGaussian(dst->dptr_, dst->shape_.Size(), mu, sigma);
   } else {
     *dst = this->gaussian(dst->shape_, mu, sigma);
diff --git a/mshadow/tensor_blob.h b/mshadow/tensor_blob.h
index d7fd8798d690..98c83f81f27c 100644
--- a/mshadow/tensor_blob.h
+++ b/mshadow/tensor_blob.h
@@ -317,7 +317,7 @@ struct TShape {
 inline std::ostream &operator<<(std::ostream &os, const TShape &shape) {
   os << '(';
   for (index_t i = 0; i < shape.ndim(); ++i) {
-    if (i != 0) os << ", ";
+    if (i != 0) os << ',';
     os << shape[i];
   }
   // python style tuple
diff --git a/mshadow/tensor_cpu-inl.h b/mshadow/tensor_cpu-inl.h
index 4f8b472a9f57..cd6d3ba3ac01 100644
--- a/mshadow/tensor_cpu-inl.h
+++ b/mshadow/tensor_cpu-inl.h
@@ -35,12 +35,14 @@ inline void DeleteStream<cpu>(Stream<cpu> *stream) {
 
 template<int ndim>
 inline std::ostream &operator<<(std::ostream &os, const Shape<ndim> &shape) { // NOLINT(*)
-  os << "(";
+  os << '(';
   for (int i = 0; i < ndim; ++i) {
-    if (i != 0) os << ",";
+    if (i != 0) os << ',';
     os << shape[i];
   }
-  os << ")";
+  // Python-style tuple: a 1-D shape is printed with a trailing comma, e.g. "(3,)".
+  if (ndim == 1) os << ',';
+  os << ')';
   return os;
 }