Merge remote-tracking branch 'offical/master' into int8_dataloader
ZhennanQin committed Feb 13, 2019
2 parents cb91ee4 + ce031da commit 2305e22
Showing 40 changed files with 788 additions and 482 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -294,8 +294,8 @@ else()
add_definitions(-DMXNET_USE_NCCL=0)
endif()

include(cmake/ChooseBlas.cmake)
if(USE_CUDA AND FIRST_CUDA)
include(cmake/ChooseBlas.cmake)
include(3rdparty/mshadow/cmake/Utils.cmake)
include(cmake/FirstClassLangCuda.cmake)
include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
1 change: 1 addition & 0 deletions ci/docker/install/ubuntu_clojure.sh
@@ -27,3 +27,4 @@ echo 'Installing Clojure...'
wget https://raw.githubusercontent.com/technomancy/leiningen/stable/bin/lein
chmod 775 lein
sudo cp lein /usr/local/bin
echo "Y" | sudo lein downgrade 2.8.3
1 change: 1 addition & 0 deletions ci/docker/install/ubuntu_core.sh
@@ -35,6 +35,7 @@ apt-get install -y \
libatlas-base-dev \
libcurl4-openssl-dev \
libjemalloc-dev \
libhdf5-dev \
liblapack-dev \
libopenblas-dev \
libopencv-dev \
1 change: 1 addition & 0 deletions ci/docker/runtime_functions.sh
@@ -953,6 +953,7 @@ unittest_ubuntu_cpu_julia() {
export PATH="$1/bin:$PATH"
export MXNET_HOME='/work/mxnet'
export JULIA_DEPOT_PATH='/work/julia-depot'
export INTEGRATION_TEST=1

julia -e 'using InteractiveUtils; versioninfo()'

4 changes: 2 additions & 2 deletions ci/jenkins/Jenkins_steps.groovy
@@ -1028,7 +1028,7 @@ def test_unix_r_gpu() {
def test_unix_julia07_cpu() {
return ['Julia 0.7: CPU': {
node(NODE_LINUX_CPU) {
ws('workspace/ut-julia07-cpu') {
ws('workspace/ut-it-julia07-cpu') {
timeout(time: max_time, unit: 'MINUTES') {
utils.unpack_and_init('cpu', mx_lib)
utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_julia07', false)
@@ -1041,7 +1041,7 @@ def test_unix_julia07_cpu() {
def test_unix_julia10_cpu() {
return ['Julia 1.0: CPU': {
node(NODE_LINUX_CPU) {
ws('workspace/ut-julia10-cpu') {
ws('workspace/ut-it-julia10-cpu') {
timeout(time: max_time, unit: 'MINUTES') {
utils.unpack_and_init('cpu', mx_lib)
utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_julia10', false)
1 change: 1 addition & 0 deletions docs/build_version_doc/setup_docs_ubuntu.sh
@@ -64,6 +64,7 @@ echo "Installing Clojure dependencies..."
wget https://raw.githubusercontent.com/technomancy/leiningen/stable/bin/lein
chmod 775 lein
sudo cp lein /usr/local/bin
echo "Y" | sudo lein downgrade 2.8.3


echo "Installing R dependencies..."
24 changes: 15 additions & 9 deletions example/bayesian-methods/algos.py
@@ -14,13 +14,13 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

"""Create implementation of algorithms of HMC, stepHMC, SGD, SGLD and DistilledSGLD"""
from __future__ import print_function
import time
import numpy
import mxnet as mx
import mxnet.ndarray as nd
import time
import logging
from utils import *
from utils import copy_param, get_executor, sample_test_regression, sample_test_acc


def calc_potential(exe, params, label_name, noise_precision, prior_precision):
@@ -35,6 +35,7 @@ def calc_potential(exe, params, label_name, noise_precision, prior_precision):


def calc_grad(exe, exe_grads, params, X, Y, label_name=None, outgrad_f=None):
"""Calculate gradient"""
exe.copy_params_from(params)
exe.arg_dict['data'][:] = X
if outgrad_f is None:
@@ -48,8 +48,8 @@ def calc_grad(exe, exe_grads, params, X, Y, label_name=None, outgrad_f=None):
v.wait_to_read()


def step_HMC(exe, exe_params, exe_grads, label_key, noise_precision, prior_precision, L=10,
eps=1E-6):
def step_HMC(exe, exe_params, exe_grads, label_key, noise_precision, prior_precision, L=10, eps=1E-6):
"""Generate the implementation of step HMC"""
init_params = {k: v.copyto(v.context) for k, v in exe_params.items()}
end_params = {k: v.copyto(v.context) for k, v in exe_params.items()}
init_momentums = {k: mx.random.normal(0, 1, v.shape) for k, v in init_params.items()}
@@ -102,6 +103,7 @@ def step_HMC(exe, exe_params, exe_grads, label_key, noise_precision, prior_preci
def HMC(sym, data_inputs, X, Y, X_test, Y_test, sample_num,
initializer=None, noise_precision=1 / 9.0, prior_precision=0.1,
learning_rate=1E-6, L=10, dev=mx.gpu()):
"""Generate the implementation of HMC"""
label_key = list(set(data_inputs.keys()) - set(['data']))[0]
exe, exe_params, exe_grads, _ = get_executor(sym, dev, data_inputs, initializer)
exe.arg_dict['data'][:] = X
@@ -134,6 +136,7 @@ def SGD(sym, data_inputs, X, Y, X_test, Y_test, total_iter_num,
out_grad_f=None,
initializer=None,
minibatch_size=100, dev=mx.gpu()):
"""Generate the implementation of SGD"""
if out_grad_f is None:
label_key = list(set(data_inputs.keys()) - set(['data']))[0]
exe, params, params_grad, _ = get_executor(sym, dev, data_inputs, initializer)
@@ -173,6 +176,7 @@ def SGLD(sym, X, Y, X_test, Y_test, total_iter_num,
initializer=None,
minibatch_size=100, thin_interval=100, burn_in_iter_num=1000, task='classification',
dev=mx.gpu()):
"""Generate the implementation of SGLD"""
if out_grad_f is None:
label_key = list(set(data_inputs.keys()) - set(['data']))[0]
exe, params, params_grad, _ = get_executor(sym, dev, data_inputs, initializer)
@@ -200,7 +204,7 @@ def SGLD(sym, X, Y, X_test, Y_test, total_iter_num,
if i < burn_in_iter_num:
continue
else:
if 0 == (i - burn_in_iter_num) % thin_interval:
if (i - burn_in_iter_num) % thin_interval == 0:
if optimizer.lr_scheduler is not None:
lr = optimizer.lr_scheduler(optimizer.num_update)
else:
@@ -238,6 +242,7 @@ def DistilledSGLD(teacher_sym, student_sym,
minibatch_size=100,
task='classification',
dev=mx.gpu()):
"""Generate the implementation of DistilledSGLD"""
teacher_exe, teacher_params, teacher_params_grad, _ = \
get_executor(teacher_sym, dev, teacher_data_inputs, teacher_initializer)
student_exe, student_params, student_params_grad, _ = \
@@ -323,13 +328,14 @@ def DistilledSGLD(teacher_sym, student_sym,
sample_test_acc(teacher_exe, X=X, Y=Y, label_num=10,
minibatch_size=minibatch_size)
print("Student: Test ACC %d/%d=%f, Train ACC %d/%d=%f" % (test_correct, test_total,
test_acc, train_correct, train_total, train_acc))
test_acc, train_correct,
train_total, train_acc))
print("Teacher: Test ACC %d/%d=%f, Train ACC %d/%d=%f" \
% (teacher_test_correct, teacher_test_total, teacher_test_acc,
teacher_train_correct, teacher_train_total, teacher_train_acc))
else:
print("Current Iter Num: %d" % (i + 1), "Time Spent: %f" % (end - start), "MSE:",
sample_test_regression(exe=student_exe, X=X_test, Y=Y_test,
sample_test_regression(exe=student_exe, X=X_test, Y=Y_test,
minibatch_size=minibatch_size,
save_path='regression_DSGLD.txt'))
start = time.time()
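Note on the thinning change above: the SGLD hunk replaces the Yoda-style test 0 == (i - burn_in_iter_num) % thin_interval with the conventional ordering. For readers new to the pattern, here is a minimal, self-contained sketch of the burn-in plus thinning loop in isolation; collect_samples and draw_sample are illustrative stand-ins, not functions from this commit.

    def collect_samples(draw_sample, total_iter_num=50000,
                        burn_in_iter_num=1000, thin_interval=10):
        """Discard the first burn_in_iter_num draws, then keep every
        thin_interval-th draw to reduce autocorrelation between samples."""
        kept = []
        for i in range(total_iter_num):
            sample = draw_sample(i)          # one SGLD/HMC update step
            if i < burn_in_iter_num:
                continue                     # still inside the burn-in phase
            if (i - burn_in_iter_num) % thin_interval == 0:
                kept.append(sample)          # same condition as in SGLD() above
        return kept
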
101 changes: 57 additions & 44 deletions example/bayesian-methods/bdk_demo.py
@@ -14,21 +14,21 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

"""Run Stochastic Gradient Langevin Dynamics (SGLD) and Bayesian Dark Knowledge (BDK)"""
from __future__ import print_function
import mxnet as mx
import mxnet.ndarray as nd
import argparse
import time
import numpy
import logging
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import argparse
from algos import *
from data_loader import *
from utils import *
import mxnet as mx
import mxnet.ndarray as nd
from algos import HMC, SGD, SGLD, DistilledSGLD
from data_loader import load_mnist, load_toy, load_synthetic
from utils import BiasXavier, SGLDScheduler


class CrossEntropySoftmax(mx.operator.NumpyOp):
"""Calculate CrossEntropy softmax function"""
def __init__(self):
super(CrossEntropySoftmax, self).__init__(False)

@@ -58,6 +58,7 @@ def backward(self, out_grad, in_data, out_data, in_grad):


class LogSoftmax(mx.operator.NumpyOp):
"""Generate helper functions to evaluate softmax loss function"""
def __init__(self):
super(LogSoftmax, self).__init__(False)

@@ -103,6 +104,7 @@ def regression_student_grad(student_outputs, teacher_pred, teacher_noise_precisi


def get_mnist_sym(output_op=None, num_hidden=400):
"""Get symbol of mnist"""
net = mx.symbol.Variable('data')
net = mx.symbol.FullyConnected(data=net, name='mnist_fc1', num_hidden=num_hidden)
net = mx.symbol.Activation(data=net, name='mnist_relu1', act_type="relu")
@@ -117,6 +119,7 @@ def get_mnist_sym(output_op=None, num_hidden=400):


def synthetic_grad(X, theta, sigma1, sigma2, sigmax, rescale_grad=1.0, grad=None):
"""Get synthetic gradient value"""
if grad is None:
grad = nd.empty(theta.shape, theta.context)
theta1 = theta.asnumpy()[0]
@@ -128,17 +131,16 @@ def synthetic_grad(X, theta, sigma1, sigma2, sigmax, rescale_grad=1.0, grad=None
-(X - theta1 - theta2) ** 2 / (2 * vx))
grad_npy = numpy.zeros(theta.shape)
grad_npy[0] = -rescale_grad * ((numpy.exp(-(X - theta1) ** 2 / (2 * vx)) * (X - theta1) / vx
+ numpy.exp(-(X - theta1 - theta2) ** 2 / (2 * vx)) * (
X - theta1 - theta2) / vx) / denominator).sum() \
+ theta1 / v1
grad_npy[1] = -rescale_grad * ((numpy.exp(-(X - theta1 - theta2) ** 2 / (2 * vx)) * (
X - theta1 - theta2) / vx) / denominator).sum() \
+ theta2 / v2
+ numpy.exp(-(X - theta1 - theta2) ** 2 / (2 * vx)) *
(X - theta1 - theta2) / vx) / denominator).sum() + theta1 / v1
grad_npy[1] = -rescale_grad * ((numpy.exp(-(X - theta1 - theta2) ** 2 / (2 * vx)) *
(X - theta1 - theta2) / vx) / denominator).sum() + theta2 / v2
grad[:] = grad_npy
return grad


def get_toy_sym(teacher=True, teacher_noise_precision=None):
"""Get toy symbol"""
if teacher:
net = mx.symbol.Variable('data')
net = mx.symbol.FullyConnected(data=net, name='teacher_fc1', num_hidden=100)
@@ -160,8 +162,9 @@ def dev(gpu_id=None):
return mx.gpu(gpu_id) if gpu_id else mx.cpu()


def run_mnist_SGD(training_num=50000, gpu_id=None):
X, Y, X_test, Y_test = load_mnist(training_num)

def run_mnist_SGD(num_training=50000, gpu_id=None):
X, Y, X_test, Y_test = load_mnist(num_training)
minibatch_size = 100
net = get_mnist_sym()
data_shape = (minibatch_size,) + X.shape[1::]
@@ -175,8 +178,8 @@ def run_mnist_SGD(training_num=50000, gpu_id=None):
lr=5E-6, prior_precision=1.0, minibatch_size=100)


def run_mnist_SGLD(training_num=50000, gpu_id=None):
X, Y, X_test, Y_test = load_mnist(training_num)
def run_mnist_SGLD(num_training=50000, gpu_id=None):
X, Y, X_test, Y_test = load_mnist(num_training)
minibatch_size = 100
net = get_mnist_sym()
data_shape = (minibatch_size,) + X.shape[1::]
@@ -191,10 +194,11 @@ def run_mnist_SGLD(training_num=50000, gpu_id=None):
thin_interval=100, burn_in_iter_num=1000)


def run_mnist_DistilledSGLD(training_num=50000, gpu_id=None):
X, Y, X_test, Y_test = load_mnist(training_num)
def run_mnist_DistilledSGLD(num_training=50000, gpu_id=None):
"""Run DistilledSGLD on mnist dataset"""
X, Y, X_test, Y_test = load_mnist(num_training)
minibatch_size = 100
if training_num >= 10000:
if num_training >= 10000:
num_hidden = 800
total_iter_num = 1000000
teacher_learning_rate = 1E-6
@@ -235,6 +239,7 @@ def run_mnist_DistilledSGLD(training_num=50000, gpu_id=None):


def run_toy_SGLD(gpu_id=None):
"""Run SGLD on toy dataset"""
X, Y, X_test, Y_test = load_toy()
minibatch_size = 1
teacher_noise_precision = 1.0 / 9.0
@@ -243,20 +248,26 @@ def run_toy_SGLD(gpu_id=None):
data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id)),
'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev(gpu_id))}
initializer = mx.init.Uniform(0.07)
exe, params, _ = \
SGLD(sym=net, data_inputs=data_inputs,
X=X, Y=Y, X_test=X_test, Y_test=Y_test, total_iter_num=50000,
initializer=initializer,
learning_rate=1E-4,
# lr_scheduler=mx.lr_scheduler.FactorScheduler(100000, 0.5),
prior_precision=0.1,
burn_in_iter_num=1000,
thin_interval=10,
task='regression',
minibatch_size=minibatch_size, dev=dev(gpu_id))


def run_toy_DistilledSGLD(gpu_id=None):
exe, params, _ = SGLD(sym=net,
data_inputs=data_inputs,
X=X,
Y=Y,
X_test=X_test,
Y_test=Y_test,
total_iter_num=50000,
initializer=initializer,
learning_rate=1E-4,
# lr_scheduler=mx.lr_scheduler.FactorScheduler(100000, 0.5),
prior_precision=0.1,
burn_in_iter_num=1000,
thin_interval=10,
task='regression',
minibatch_size=minibatch_size,
dev=dev(gpu_id)) # disable=unbalanced-tuple-unpacking


def run_toy_DistilledSGLD(gpu_id):
"""Run DistilledSGLD on toy dataset"""
X, Y, X_test, Y_test = load_toy()
minibatch_size = 1
teacher_noise_precision = 1.0
@@ -288,6 +299,7 @@ def run_toy_DistilledSGLD(gpu_id=None):


def run_toy_HMC(gpu_id=None):
"""Run HMC on toy dataset"""
X, Y, X_test, Y_test = load_toy()
minibatch_size = Y.shape[0]
noise_precision = 1 / 9.0
@@ -302,6 +314,7 @@ def run_toy_HMC(gpu_id=None):


def run_synthetic_SGLD():
"""Run synthetic SGLD"""
theta1 = 0
theta2 = 1
sigma1 = numpy.sqrt(10)
@@ -322,14 +335,14 @@ def run_synthetic_SGLD():
grad = nd.empty((2,), mx.cpu())
samples = numpy.zeros((2, total_iter_num))
start = time.time()
for i in xrange(total_iter_num):
for i in range(total_iter_num):
if (i + 1) % 100000 == 0:
end = time.time()
print("Iter:%d, Time spent: %f" % (i + 1, end - start))
start = time.time()
ind = numpy.random.randint(0, X.shape[0])
synthetic_grad(X[ind], theta, sigma1, sigma2, sigmax, rescale_grad=
X.shape[0] / float(minibatch_size), grad=grad)
synthetic_grad(X[ind], theta, sigma1, sigma2, sigmax,
rescale_grad=X.shape[0] / float(minibatch_size), grad=grad)
updater('theta', grad, theta)
samples[:, i] = theta.asnumpy()
plt.hist2d(samples[0, :], samples[1, :], (200, 200), cmap=plt.cm.jet)
@@ -354,18 +367,18 @@ def run_synthetic_SGLD():
args = parser.parse_args()
training_num = args.training
if args.dataset == 1:
if 0 == args.algorithm:
if args.algorithm == 0:
run_mnist_SGD(training_num, gpu_id=args.gpu)
elif 1 == args.algorithm:
elif args.algorithm == 1:
run_mnist_SGLD(training_num, gpu_id=args.gpu)
else:
run_mnist_DistilledSGLD(training_num, gpu_id=args.gpu)
elif args.dataset == 0:
if 1 == args.algorithm:
if args.algorithm == 1:
run_toy_SGLD(gpu_id=args.gpu)
elif 2 == args.algorithm:
elif args.algorithm == 2:
run_toy_DistilledSGLD(gpu_id=args.gpu)
elif 3 == args.algorithm:
elif args.algorithm == 3:
run_toy_HMC(gpu_id=args.gpu)
else:
run_synthetic_SGLD()
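
The synthetic experiment in run_synthetic_SGLD estimates the posterior over (theta1, theta2) of a Gaussian mixture by calling synthetic_grad and then updater('theta', grad, theta) on every iteration. As a rough reference, the sketch below shows the textbook SGLD update such an updater is expected to perform; the function name and the exact noise scaling are assumptions made for illustration, not MXNet's optimizer code.

    import numpy as np

    def sgld_step(theta, grad, eps, rng=np.random):
        # One Stochastic Gradient Langevin Dynamics update on a NumPy array:
        # theta <- theta - (eps / 2) * grad + noise, with noise ~ N(0, eps),
        # where grad is the stochastic gradient of the negative log posterior.
        noise = rng.normal(0.0, np.sqrt(eps), size=theta.shape)
        return theta - 0.5 * eps * grad + noise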
5 changes: 3 additions & 2 deletions example/bayesian-methods/data_loader.py
@@ -14,14 +14,15 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

"""Create helper functions to load mnist dataset and toy dataset"""
from __future__ import print_function
import numpy
import os
import ssl
import numpy


def load_mnist(training_num=50000):
"""Load mnist dataset"""
data_path = os.path.join(os.path.dirname(os.path.realpath('__file__')), 'mnist.npz')
if not os.path.isfile(data_path):
from six.moves import urllib
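The visible part of load_mnist checks for a cached mnist.npz next to the script and downloads it on first use; the train/validation split happens outside the shown hunk. A stripped-down sketch of that download-if-missing pattern follows, with the URL left as a parameter; the helper name and the unverified-SSL detail (suggested by the ssl import) are assumptions, not a copy of the repository function.

    import os
    import ssl
    import numpy
    from six.moves import urllib

    def load_npz_dataset(url, filename='mnist.npz'):
        data_path = os.path.join(os.path.dirname(os.path.realpath('__file__')), filename)
        if not os.path.isfile(data_path):
            # Assumed reason for the ssl import: some mirrors need an
            # unverified context for the one-off download.
            context = ssl._create_unverified_context()
            payload = urllib.request.urlopen(url, context=context).read()
            with open(data_path, 'wb') as out:
                out.write(payload)
        return numpy.load(data_path)  # dict-like access to the stored arrays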