diff --git a/ci/docker/install/centos7_python.sh b/ci/docker/install/centos7_python.sh
index 8521cde1acca..686cf14e2e3b 100755
--- a/ci/docker/install/centos7_python.sh
+++ b/ci/docker/install/centos7_python.sh
@@ -31,5 +31,5 @@ curl "https://bootstrap.pypa.io/get-pip.py" -o "get-pip.py"
 python2.7 get-pip.py
 python3.6 get-pip.py
 
-pip2 install nose pylint numpy nose-timer requests h5py scipy==1.0.1
-pip3 install nose pylint numpy nose-timer requests h5py scipy==1.0.1
+pip2 install nose pylint numpy nose-timer requests h5py scipy==1.2.1 decorator==4.4.0
+pip3 install nose pylint numpy nose-timer requests h5py scipy==1.2.1 decorator==4.4.0
diff --git a/ci/docker/install/centos7_scala.sh b/ci/docker/install/centos7_scala.sh
index 5a1c4163a031..df0d7a152bb1 100755
--- a/ci/docker/install/centos7_scala.sh
+++ b/ci/docker/install/centos7_scala.sh
@@ -27,8 +27,8 @@ export JAVA_HOME=/usr/lib/jvm/jre-1.8.0-openjdk
 export PATH=$JAVA_HOME/bin:$PATH
 
 # Build from source with Maven
-curl -o apache-maven-3.3.9-bin.tar.gz http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz \
-    || curl -o apache-maven-3.3.9-bin.tar.gz https://search.maven.org/remotecontent?filepath=org/apache/maven/apache-maven/3.3.9/apache-maven-3.3.9-bin.tar.gz
+curl -o apache-maven-3.3.9-bin.tar.gz -L http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz \
+    || curl -o apache-maven-3.3.9-bin.tar.gz -L https://search.maven.org/remotecontent?filepath=org/apache/maven/apache-maven/3.3.9/apache-maven-3.3.9-bin.tar.gz
 
 tar xzf apache-maven-3.3.9-bin.tar.gz
 mkdir /usr/local/maven
diff --git a/ci/docker/install/requirements b/ci/docker/install/requirements
index fd716f5fa815..61c9ef870504 100644
--- a/ci/docker/install/requirements
+++ b/ci/docker/install/requirements
@@ -21,6 +21,7 @@
 boto3==1.9.229
 cpplint==1.3.0
 Cython==0.29.7
+CommonMark==0.5.4
 decorator==4.4.0
 h5py==2.8.0rc1
 mock==2.0.0
diff --git a/ci/docker/install/ubuntu_onnx.sh b/ci/docker/install/ubuntu_onnx.sh
index 307028925b6e..1a220c7149b2 100755
--- a/ci/docker/install/ubuntu_onnx.sh
+++ b/ci/docker/install/ubuntu_onnx.sh
@@ -31,5 +31,5 @@ apt-get update || true
 apt-get install -y libprotobuf-dev protobuf-compiler
 
 echo "Installing pytest, pytest-cov, protobuf, Pillow, ONNX and tabulate ..."
-pip2 install pytest==3.6.3 pytest-cov==2.5.1 protobuf==3.5.2 onnx==1.3.0 Pillow==5.0.0 tabulate==0.7.5
-pip3 install pytest==3.6.3 pytest-cov==2.5.1 protobuf==3.5.2 onnx==1.3.0 Pillow==5.0.0 tabulate==0.7.5
+pip2 install pytest==3.6.3 pytest-cov==2.5.1 protobuf==3.5.2 onnx==1.3.0 Pillow==5.0.0 tabulate==0.7.5 attrs==19.1.0
+pip3 install pytest==3.6.3 pytest-cov==2.5.1 protobuf==3.5.2 onnx==1.3.0 Pillow==5.0.0 tabulate==0.7.5 attrs==19.1.0
diff --git a/ci/docker/install/ubuntu_publish.sh b/ci/docker/install/ubuntu_publish.sh
index 65982eead389..c3517e25d50d 100755
--- a/ci/docker/install/ubuntu_publish.sh
+++ b/ci/docker/install/ubuntu_publish.sh
@@ -48,8 +48,8 @@ apt-get install -y git \
     pkg-config \
     openjdk-8-jdk
 
-curl -o apache-maven-3.3.9-bin.tar.gz http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz \
-    || curl -o apache-maven-3.3.9-bin.tar.gz https://search.maven.org/remotecontent?filepath=org/apache/maven/apache-maven/3.3.9/apache-maven-3.3.9-bin.tar.gz
+curl -o apache-maven-3.3.9-bin.tar.gz -L http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz \
+    || curl -o apache-maven-3.3.9-bin.tar.gz -L https://search.maven.org/remotecontent?filepath=org/apache/maven/apache-maven/3.3.9/apache-maven-3.3.9-bin.tar.gz
 
 tar xzf apache-maven-3.3.9-bin.tar.gz
 mkdir /usr/local/maven
@@ -57,14 +57,13 @@ mv apache-maven-3.3.9/ /usr/local/maven/
 update-alternatives --install /usr/bin/mvn mvn /usr/local/maven/apache-maven-3.3.9/bin/mvn 1
 update-ca-certificates -f
 
-apt-get install -y python python3
+apt-get install -y python python-pip python3 python3-pip
 
 # the version of the pip shipped with ubuntu may be too lower, install a recent version here
-wget -nv https://bootstrap.pypa.io/get-pip.py
-python3 get-pip.py
-python2 get-pip.py
+# Restrict pip version to <19 due to use of Python 3.4 on Ubuntu 14.04
+python2 -m pip install --upgrade 'pip<19'
+python3 -m pip install --upgrade 'pip<19'
 
-apt-get remove -y python3-urllib3
-
-pip2 install nose cpplint==1.3.0 'numpy>1.16.0,<2.0.0' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3
-pip3 install nose cpplint==1.3.0 pylint==2.3.1 'numpy>1.16.0,<2.0.0' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3
+# Restrict numpy version to <1.18 due to use of Python 3.4 on Ubuntu 14.04
+python2 -m pip install --upgrade --ignore-installed nose cpplint==1.3.0 'numpy>1.16.0,<1.17' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3
+python3 -m pip install --upgrade --ignore-installed nose cpplint==1.3.0 pylint==2.3.1 'numpy>1.16.0,<1.18' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3
diff --git a/ci/docker/install/ubuntu_scala.sh b/ci/docker/install/ubuntu_scala.sh
index 9115bbc8a90c..d223b8e173ae 100755
--- a/ci/docker/install/ubuntu_scala.sh
+++ b/ci/docker/install/ubuntu_scala.sh
@@ -40,8 +40,8 @@ apt-get install -y \
 
 # Ubuntu 14.04
 if [[ $(lsb_release -r | grep 14.04) ]]; then
-    curl -o apache-maven-3.3.9-bin.tar.gz http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz \
-        || curl -o apache-maven-3.3.9-bin.tar.gz https://search.maven.org/remotecontent?filepath=org/apache/maven/apache-maven/3.3.9/apache-maven-3.3.9-bin.tar.gz
+    curl -o apache-maven-3.3.9-bin.tar.gz -L http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz \
+        || curl -o apache-maven-3.3.9-bin.tar.gz -L https://search.maven.org/remotecontent?filepath=org/apache/maven/apache-maven/3.3.9/apache-maven-3.3.9-bin.tar.gz
 
     tar xzf apache-maven-3.3.9-bin.tar.gz
     mkdir /usr/local/maven
diff --git a/docs/python_docs/environment.yml b/docs/python_docs/environment.yml
index 5f66d7715af9..7d9d42d3d662 100644
--- a/docs/python_docs/environment.yml
+++ b/docs/python_docs/environment.yml
@@ -23,7 +23,7 @@ dependencies:
 - pip
 - python
 - jupyter
-- sphinx
+- sphinx==2.2.2
 - matplotlib
 - notebook
 - pip:
diff --git a/julia/docs/Makefile b/julia/docs/Makefile
index e42b8cdccb93..66f36df9e0e0 100644
--- a/julia/docs/Makefile
+++ b/julia/docs/Makefile
@@ -20,8 +20,17 @@ all:
 	  'using Pkg; \
 	   Pkg.develop(PackageSpec(name="MXNet", path = joinpath(pwd(), "..")))'
 	julia --color=yes --project=./ ./make.jl
-	pip install --user pygments mkdocs mkdocs-material python-markdown-math
-	~/.local/bin/mkdocs build
+	pip install --user Markdown==3.1 \
+    mkdocs==1.0.4 \
+		mkdocs-material==4.6.0 \
+	  pygments==2.5.2 \
+	  pymdown-extensions==6.2.1 \
+	  python-markdown-math==0.6
+	export LC_ALL="C.UTF-8"
+	export LANG="C.UTF-8"
+	echo $(LC_ALL)
+	echo $(LANG)
+	LC_ALL="C.UTF-8" ~/.local/bin/mkdocs build
 
 clean:
 	rm -rvf venv
diff --git a/src/operator/contrib/transformer.cu b/src/operator/contrib/transformer.cu
index e152669478dd..59029eae65c2 100644
--- a/src/operator/contrib/transformer.cu
+++ b/src/operator/contrib/transformer.cu
@@ -50,18 +50,65 @@ void CublasStridedBatchedGemm(mshadow::Stream<gpu>* s, bool transA, bool transB,
       << "Must init CuBLAS handle in stream";
 
   cublasHandle_t blas_handle = mshadow::Stream<gpu>::GetBlasHandle(s);
-  auto err = CUBLAS_STATUS_SUCCESS;
-  // TODO(cfujitsang): handle computation_precision
-  err = cublasGemmStridedBatchedEx(
-      blas_handle, CublasTransposeOp(transA), CublasTransposeOp(transB),
-      static_cast<int>(m), static_cast<int>(n), static_cast<int>(k),
-      reinterpret_cast<void*>(&alpha),
-      a, CublasType<DType>::kCudaFlag, static_cast<int>(lda), strideA,
-      b, CublasType<DType>::kCudaFlag, static_cast<int>(ldb), strideB,
-      reinterpret_cast<void*>(&beta),
-      c, CublasType<DType>::kCudaFlag, static_cast<int>(ldc), strideC,
-      static_cast<int>(batchCount), CUDA_R_32F, algo);
-  CHECK_EQ(err, CUBLAS_STATUS_SUCCESS) << "Cublas gemmEx fail.";
+  // cublasGemmStridedBatchedEx is only supported for GPU with architecture
+  // capabilities equal or greater than 5.0. Fall back to
+  // cublasSgemmStridedBatched, which doesn't support implicit conversion
+  // to half-precision to use TensorCores
+  auto cc_major = (s->prop).major;
+  if (cc_major >= 5) {
+    CUBLAS_CALL(cublasGemmStridedBatchedEx(
+        blas_handle, CublasTransposeOp(transA), CublasTransposeOp(transB),
+        static_cast<int>(m), static_cast<int>(n), static_cast<int>(k),
+        reinterpret_cast<void*>(&alpha),
+        a, CublasType<DType>::kCudaFlag, static_cast<int>(lda), strideA,
+        b, CublasType<DType>::kCudaFlag, static_cast<int>(ldb), strideB,
+        reinterpret_cast<void*>(&beta),
+        c, CublasType<DType>::kCudaFlag, static_cast<int>(ldc), strideC,
+        static_cast<int>(batchCount), CUDA_R_32F, algo));
+  } else {
+    if (std::is_same<DType, float>::value) {
+      CUBLAS_CALL(cublasSgemmStridedBatched(
+          blas_handle, CublasTransposeOp(transA), CublasTransposeOp(transB),
+          static_cast<int>(m), static_cast<int>(n), static_cast<int>(k),
+          reinterpret_cast<float*>(&alpha),
+          reinterpret_cast<const float*>(a),
+          static_cast<int>(lda), strideA,
+          reinterpret_cast<const float*>(b),
+          static_cast<int>(ldb), strideB,
+          reinterpret_cast<float*>(&beta),
+          reinterpret_cast<float*>(c),
+          static_cast<int>(ldc), strideC,
+          static_cast<int>(batchCount)));
+    } else if (std::is_same<DType, double>::value) {
+      CUBLAS_CALL(cublasDgemmStridedBatched(
+          blas_handle, CublasTransposeOp(transA), CublasTransposeOp(transB),
+          static_cast<int>(m), static_cast<int>(n), static_cast<int>(k),
+          reinterpret_cast<double*>(&alpha),
+          reinterpret_cast<const double*>(a),
+          static_cast<int>(lda), strideA,
+          reinterpret_cast<const double*>(b),
+          static_cast<int>(ldb), strideB,
+          reinterpret_cast<double*>(&beta),
+          reinterpret_cast<double*>(c),
+          static_cast<int>(ldc), strideC,
+          static_cast<int>(batchCount)));
+    } else if (std::is_same<DType, mshadow::half::half_t>::value) {
+      CUBLAS_CALL(cublasHgemmStridedBatched(
+          blas_handle, CublasTransposeOp(transA), CublasTransposeOp(transB),
+          static_cast<int>(m), static_cast<int>(n), static_cast<int>(k),
+          reinterpret_cast<__half*>(&alpha),
+          reinterpret_cast<const __half*>(a),
+          static_cast<int>(lda), strideA,
+          reinterpret_cast<const __half*>(b),
+          static_cast<int>(ldb), strideB,
+          reinterpret_cast<__half*>(&beta),
+          reinterpret_cast<__half*>(c),
+          static_cast<int>(ldc), strideC,
+          static_cast<int>(batchCount)));
+    } else {
+      LOG(FATAL) << "Unsupported DType in CublasStridedBatchedGemm.";
+    }
+  }
 #else
   LOG(FATAL) << "Not implemented with CUDA < 9.1";
 #endif
diff --git a/tools/caffe_converter/caffe_proto_utils.py b/tools/caffe_converter/caffe_proto_utils.py
index 8d6183457637..54cd952d4443 100644
--- a/tools/caffe_converter/caffe_proto_utils.py
+++ b/tools/caffe_converter/caffe_proto_utils.py
@@ -196,7 +196,7 @@ def read_caffe_mean(caffe_mean_file):
         mean_blob.ParseFromString(f.read())
 
     img_mean_np = np.array(mean_blob.data)
-    img_mean_np = img_mean_np.reshape(mean_blob.channels, mean_blob.height, mean_blob.width)
+    img_mean_np = img_mean_np.reshape(mean_blob.channels, mean_blob.height, mean_blob.width)  # pylint: disable=too-many-function-args
 
     # swap channels from Caffe BGR to RGB
     img_mean_np[[0, 2], :, :] = img_mean_np[[2, 0], :, :]
diff --git a/tools/caffe_converter/convert_mean.py b/tools/caffe_converter/convert_mean.py
index 1a3df71d3e54..1debfe5a2ef3 100644
--- a/tools/caffe_converter/convert_mean.py
+++ b/tools/caffe_converter/convert_mean.py
@@ -42,7 +42,7 @@ def convert_mean(binaryproto_fname, output=None):
         mean_blob.ParseFromString(f.read())
 
     img_mean_np = np.array(mean_blob.data)
-    img_mean_np = img_mean_np.reshape(
+    img_mean_np = img_mean_np.reshape(  # pylint: disable=too-many-function-args
         mean_blob.channels, mean_blob.height, mean_blob.width
     )
     # swap channels from Caffe BGR to RGB