This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Commit

…to develop/add-higher-order/sinh-cosh
kshitij12345 committed Jul 11, 2019
2 parents a9ce8f3 + 68460a9 commit cb051fe
Showing 77 changed files with 1,219 additions and 396 deletions.
2 changes: 1 addition & 1 deletion 3rdparty/mkldnn
7 changes: 4 additions & 3 deletions benchmark/opperf/opperf.py
@@ -125,8 +125,9 @@ def main():
'output file.')

args = parser.parse_args()
logging.info(f"Running MXNet operator benchmarks with the following options: {args}")
assert not os.path.isfile(args.output_file), f"Output file {args.output_file} already exists."
logging.info("Running MXNet operator benchmarks with the following options: {args}".format(args=args))
assert not os.path.isfile(args.output_file),\
"Output file {output_file} already exists.".format(output_file=args.output_file)

# 2. RUN BENCHMARKS
ctx = _parse_mxnet_context(args.ctx)
@@ -140,7 +141,7 @@ def main():
# 4. Generate list of MXNet operators not covered in benchmarks
ops_not_covered = get_operators_with_no_benchmark(final_benchmark_results.keys())
for idx, op in enumerate(ops_not_covered):
print(f"{idx}. {op}")
print("{idx}. {op}".format(idx=idx, op=op))

return 0

4 changes: 2 additions & 2 deletions benchmark/opperf/utils/benchmark_utils.py
@@ -55,14 +55,14 @@ def _run_nd_operator_performance_test(op, inputs, run_backward, warmup, runs, kw

# Run Benchmarks
op_benchmark_result = {op.__name__: []}
logging.info(f"Begin Benchmark - {op.__name__}")
logging.info("Begin Benchmark - {name}".format(name=op.__name__))
for idx, kwargs in enumerate(kwargs_list):
_, profiler_output = benchmark_helper_func(op, runs, **kwargs)

# Add inputs used for profiling this operator into result
profiler_output["inputs"] = inputs[idx]
op_benchmark_result[op.__name__].append(profiler_output)
logging.info(f"Complete Benchmark - {op.__name__}")
logging.info("Complete Benchmark - {name}".format(name=op.__name__))
return op_benchmark_result


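
The two Python diffs above replace f-string literals with equivalent `str.format` calls, presumably so the benchmark scripts also run on interpreters older than Python 3.6, where f-strings are not available. A minimal sketch of the equivalence; the sample value is illustrative:

```python
# Illustrative only: both styles produce the same string.
args = "ctx=cpu, warmup=25, runs=100"

fstring_msg = f"Running MXNet operator benchmarks with the following options: {args}"
format_msg = "Running MXNet operator benchmarks with the following options: {args}".format(args=args)

assert fstring_msg == format_msg
```
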
16 changes: 8 additions & 8 deletions ci/docker/runtime_functions.sh
@@ -952,13 +952,6 @@ unittest_ubuntu_python3_quantization_gpu() {
nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_quantization_gpu.xml --verbose tests/python/quantization_gpu
}

unittest_ubuntu_cpu_scala() {
set -ex
scala_prepare
cd scala-package
mvn -B integration-test
}

unittest_centos7_cpu_scala() {
set -ex
cd /work/mxnet
@@ -1156,12 +1149,19 @@ integrationtest_ubuntu_cpu_dist_kvstore() {
../../tools/launch.py -n 3 --launcher local python test_server_profiling.py
}

integrationtest_ubuntu_cpu_scala() {
set -ex
scala_prepare
cd scala-package
mvn -B verify -DskipTests=false
}

integrationtest_ubuntu_gpu_scala() {
set -ex
scala_prepare
cd scala-package
export SCALA_TEST_ON_GPU=1
mvn -B integration-test -DskipTests=false
mvn -B verify -DskipTests=false
}

integrationtest_ubuntu_gpu_dist_kvstore() {
4 changes: 2 additions & 2 deletions ci/jenkins/Jenkins_steps.groovy
@@ -941,7 +941,7 @@ def test_unix_scala_cpu() {
ws('workspace/ut-scala-cpu') {
timeout(time: max_time, unit: 'MINUTES') {
utils.unpack_and_init('cpu', mx_lib, true)
utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_scala', false)
utils.docker_run('ubuntu_cpu', 'integrationtest_ubuntu_cpu_scala', false)
utils.publish_test_coverage()
}
}
@@ -955,7 +955,7 @@ def test_unix_scala_mkldnn_cpu(){
ws('workspace/ut-scala-mkldnn-cpu') {
timeout(time: max_time, unit: 'MINUTES') {
utils.unpack_and_init('mkldnn_cpu', mx_mkldnn_lib, true)
utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_scala', false)
utils.docker_run('ubuntu_cpu', 'integrationtest_ubuntu_cpu_scala', false)
utils.publish_test_coverage()
}
}
5 changes: 3 additions & 2 deletions docs/install/build_from_source.md
@@ -123,8 +123,7 @@ You can set the BLAS library explicitly by setting the BLAS variable to:

See the [cmake/ChooseBLAS.cmake](https://github.com/apache/incubator-mxnet/blob/master/cmake/ChooseBlas.cmake) file for the options.

Intel's MKL (Math Kernel Library) is one of the most powerful math libraries
https://software.intel.com/en-us/mkl
[Intel's MKL (Math Kernel Library)](https://software.intel.com/en-us/mkl) is one of the most powerful math libraries

It has following flavors:

@@ -144,6 +143,8 @@ shipped as a subrepo with MXNet source code (see 3rdparty/mkldnn or the [MKL-DNN
Since the full MKL library is almost always faster than any other BLAS library it's turned on by default,
however it needs to be downloaded and installed manually before doing `cmake` configuration.
Register and download on the [Intel performance libraries website](https://software.intel.com/en-us/performance-libraries).
You can also install MKL through [YUM](https://software.intel.com/en-us/articles/installing-intel-free-libs-and-python-yum-repo)
or [APT](https://software.intel.com/en-us/articles/installing-intel-free-libs-and-python-apt-repo) Repository.

Note: MKL is supported only for desktop builds and the framework itself supports the following
hardware:
49 changes: 47 additions & 2 deletions docs/tutorials/amp/amp_tutorial.md
@@ -17,7 +17,7 @@

# Using AMP (Automatic Mixed Precision) in MXNet

Training Deep Learning networks is a very computationally intensive task. Novel model architectures tend to have increasing number of layers and parameters, which slows down training. Fortunately, new generations of training hardware as well as software optimizations, make it a feasible task.
Training Deep Learning networks is a very computationally intensive task. Novel model architectures tend to have increasing number of layers and parameters, which slows down training. Fortunately, new generations of training hardware as well as software optimizations, make it a feasible task.

However, where most of the (both hardware and software) optimization opportunities exists is in exploiting lower precision (like FP16) to, for example, utilize Tensor Cores available on new Volta and Turing GPUs. While training in FP16 showed great success in image classification tasks, other more complicated neural networks typically stayed in FP32 due to difficulties in applying the FP16 training guidelines.

@@ -253,7 +253,10 @@ We got 60% speed increase from 3 additional lines of code!
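
For reference, the "3 additional lines" mentioned in the hunk context above are most likely the AMP initialization, trainer hookup, and loss-scaling calls introduced earlier in this tutorial. A hypothetical minimal training-loop sketch follows; the tiny `Dense` network, dummy data, and hyper-parameters are placeholders, and a GPU is assumed:

```python
import mxnet as mx
from mxnet import autograd, gluon
from mxnet.contrib import amp

amp.init()  # (1) patch NDArray/Symbol operators for mixed precision

ctx = mx.gpu(0)
net = gluon.nn.Dense(10)
net.initialize(ctx=ctx)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.01})
amp.init_trainer(trainer)  # (2) enable dynamic loss scaling in the trainer

loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
data = mx.nd.random.uniform(shape=(8, 16), ctx=ctx)
label = mx.nd.zeros((8,), ctx=ctx)

with autograd.record():
    loss = loss_fn(net(data), label)
    with amp.scale_loss(loss, trainer) as scaled_loss:  # (3) scale the loss before backward
        autograd.backward(scaled_loss)
trainer.step(data.shape[0])
```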

## Inference with AMP

To do inference with mixed precision for a trained model in FP32, you can use the conversion APIs: `amp.convert_model` for symbolic model and `amp.convert_hybrid_block` for gluon models. The conversion APIs will take the FP32 model as input and will return a mixed precision model, which can be used to run inference. Below, we demonstrate for a gluon model and a symbolic model: 1. Conversion from FP32 model to mixed precision model 2. Run inference on the mixed precision model.
To run inference in mixed precision with a model trained in FP32, you can use the conversion APIs: `amp.convert_model` for symbolic models and `amp.convert_hybrid_block` for Gluon models. The conversion APIs take the FP32 model as input and return a mixed precision model that can be used to run inference.
Below, we demonstrate the following for both a Gluon model and a symbolic model:
- Conversion from FP32 model to mixed precision model.
- Run inference on the mixed precision model.

```python
with mx.Context(mx.gpu(0)):
Expand Down Expand Up @@ -289,6 +292,48 @@ with mx.Context(mx.gpu(0)):
print("Conversion and Inference completed successfully")
```

You can also customize which operators run in FP16 versus FP32, or make certain operators run in FP32 only conditionally.
In addition, you can force-cast the parameters to FP16 wherever possible. Below is an example that demonstrates both of these use cases
for a symbolic model. You can do the same for a Gluon hybrid block with the `amp.convert_hybrid_block` API and its `cast_optional_params` flag, as shown in the sketch after the example below.

```python
with mx.Context(mx.gpu(0)):
# Below is an example of converting a symbolic model to a mixed precision model
# with only Convolution op being force casted to FP16.
dir_path = os.path.dirname(os.path.realpath(__file__))
model_path = os.path.join(dir_path, 'model')
if not os.path.isdir(model_path):
os.mkdir(model_path)
prefix, epoch = mx.test_utils.download_model("imagenet1k-resnet-18", dst_dir=model_path)
sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)

# All Convolution ops should run in FP16, SoftmaxOutput and FullyConnected should run in FP32
# cast_optional_params=True: Force cast params to FP16 wherever possible
result_sym, result_arg_params, result_aux_params = amp.convert_model(sym,
arg_params,
aux_params,
target_dtype_ops=["Convolution"],
fp32_ops=["SoftmaxOutput", "FullyConnected"],
cast_optional_params=True)

# Run dummy inference with the converted symbolic model
mod = mx.mod.Module(result_sym, data_names=["data"], label_names=["softmax_label"], context=mx.current_context())
mod.bind(data_shapes=[['data', (1, 3, 224, 224)]], label_shapes=[['softmax_label', (1,)]])
mod.set_params(result_arg_params, result_aux_params)
mod.forward(mx.io.DataBatch(data=[mx.nd.ones((1, 3, 224, 224))],
label=[mx.nd.ones((1,))]))
mod.get_outputs()[0].wait_to_read()

# Assert that the params for conv are in FP16, this is because cast_optional_params is set to True
assert mod._arg_params["conv0_weight"].dtype == np.float16
# FullyConnected params stay in FP32
assert mod._arg_params["fc1_bias"].dtype == np.float32

print("Conversion and Inference completed successfully")

# Serialize AMP model and save to disk
mod.save_checkpoint("amp_tutorial_model", 0, remove_amp_cast=False)
```
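
As noted above, the same customization is available for Gluon models through `amp.convert_hybrid_block`. Below is a minimal sketch rather than part of the original tutorial; it assumes a GPU is available, and the ResNet-18 model and input shape are illustrative choices:

```python
import mxnet as mx
from mxnet import gluon
from mxnet.contrib import amp

with mx.Context(mx.gpu(0)):
    # Build and hybridize a Gluon model (the model choice is illustrative).
    net = gluon.model_zoo.vision.resnet18_v1()
    net.collect_params().initialize(ctx=mx.current_context())
    net.hybridize()
    # One forward pass so the cached graph exists before conversion.
    net(mx.nd.ones((1, 3, 224, 224)))

    # Convert to mixed precision; cast_optional_params=True force-casts the
    # parameters to FP16 wherever possible, mirroring the symbolic example above.
    net_fp16 = amp.convert_hybrid_block(net, cast_optional_params=True)

    out = net_fp16(mx.nd.ones((1, 3, 224, 224)))
    out.wait_to_read()
    print("Gluon conversion and inference completed successfully")
```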


## Current limitations of AMP
4 changes: 3 additions & 1 deletion docs/tutorials/gluon/info_gan.md
@@ -339,9 +339,11 @@ with SummaryWriter(logdir='./logs/') as sw:
fake_image = generator(g_input)

sw.add_scalar(tag='Loss_D', value={'test':d_error_epoch.asscalar()/count}, global_step=counter)
sw.add_scalar(tag='Loss_G', value={'test':d_error_epoch.asscalar()/count}, global_step=counter)
sw.add_scalar(tag='Loss_G', value={'test':g_error_epoch.asscalar()/count}, global_step=counter)
sw.add_image(tag='data_image', image=((fake_image[0]+ 1.0) * 127.5).astype(np.uint8) , global_step=counter)
sw.flush()

counter += 1

discriminator.save_parameters("infogan_d_latest.params")
generator.save_parameters("infogan_g_latest.params")
2 changes: 1 addition & 1 deletion docs/tutorials/mkldnn/MKLDNN_README.md
@@ -214,7 +214,7 @@ With MKL BLAS, the performace is expected to furtherly improved with variable ra
You can redistribute not only dynamic libraries but also headers, examples and static libraries on accepting the license [Intel Simplified license](https://software.intel.com/en-us/license/intel-simplified-software-license).
Installing the full MKL installation enables MKL support for all operators under the linalg namespace.

1. Download and install the latest full MKL version following instructions on the [intel website.](https://software.intel.com/en-us/mkl)
1. Download and install the latest full MKL version following instructions on the [intel website.](https://software.intel.com/en-us/mkl) You can also install MKL through [YUM](https://software.intel.com/en-us/articles/installing-intel-free-libs-and-python-yum-repo) or [APT](https://software.intel.com/en-us/articles/installing-intel-free-libs-and-python-apt-repo) Repository.

2. Run `make -j ${nproc} USE_BLAS=mkl`

2 changes: 2 additions & 0 deletions docs/tutorials/mkldnn/operator_list.md
@@ -44,6 +44,8 @@ To help users understanding MKL-DNN backend better, the following table summariz
| **elemwise_add** | 1D-4D input | Y | Y | Y |
| **Concat** | 1D-4D input | Y | Y | Y |
| **slice** | 1D-4D input | N | Y | N |
| **Reshape** | 1D-4D input | N | Y | N |
| **Flatten** | 1D-4D input | N | Y | N |
| **Quantization** | 1D-4D input | N | N | Y |
| **Dequantization** | 1D-4D input | N | N | Y |
| **Requantization** | 1D-4D input | N | N | Y |
116 changes: 31 additions & 85 deletions perl-package/AI-MXNet/lib/AI/MXNet.pm
@@ -19,105 +19,51 @@ package AI::MXNet;
use v5.14.0;
use strict;
use warnings;
use AI::MXNet::NS 'global';
use AI::MXNet::Base;
use AI::MXNet::Callback;
use AI::MXNet::NDArray;
use AI::MXNet::Symbol;
use AI::MXNet::Callback 'callback';
use AI::MXNet::NDArray qw(nd ndarray);
use AI::MXNet::Context 'context';
use AI::MXNet::Symbol qw(sym symbol);
use AI::MXNet::Executor;
use AI::MXNet::Executor::Group;
use AI::MXNet::CudaModule;
use AI::MXNet::Random;
use AI::MXNet::Initializer;
use AI::MXNet::Optimizer;
use AI::MXNet::KVStore;
use AI::MXNet::Random qw(rnd random);
use AI::MXNet::Initializer qw(init initializer);
use AI::MXNet::Optimizer qw(optimizer opt);
use AI::MXNet::KVStore 'kv';
use AI::MXNet::KVStoreServer;
use AI::MXNet::IO;
use AI::MXNet::Metric;
use AI::MXNet::IO 'io';
use AI::MXNet::Metric 'metric';
use AI::MXNet::LRScheduler;
use AI::MXNet::Monitor;
use AI::MXNet::Monitor 'mon';
use AI::MXNet::Profiler;
use AI::MXNet::Module::Base;
use AI::MXNet::Module;
use AI::MXNet::Module qw(mod module);
use AI::MXNet::Module::Bucketing;
use AI::MXNet::RNN;
use AI::MXNet::Visualization;
use AI::MXNet::RecordIO;
use AI::MXNet::Image;
use AI::MXNet::Contrib;
use AI::MXNet::LinAlg;
use AI::MXNet::RNN 'rnn';
use AI::MXNet::Visualization 'viz';
use AI::MXNet::RecordIO 'recordio';
use AI::MXNet::Image qw(img image);
use AI::MXNet::Contrib 'contrib';
use AI::MXNet::LinAlg 'linalg';
use AI::MXNet::CachedOp;
use AI::MXNet::AutoGrad;
use AI::MXNet::Gluon;
use AI::MXNet::AutoGrad 'autograd';
use AI::MXNet::Gluon 'gluon';
use AI::MXNet::NDArray::Sparse;
use AI::MXNet::Symbol::Sparse;
use AI::MXNet::Engine;
use AI::MXNet::Engine 'engine';
our $VERSION = '1.4';

sub import
{
my ($class, $short_name) = @_;
if($short_name)
{
$short_name =~ s/[^\w:]//g;
if(length $short_name)
{
my $short_name_package =<<"EOP";
package $short_name;
no warnings 'redefine';
sub nd { 'AI::MXNet::NDArray' }
sub ndarray { 'AI::MXNet::NDArray' }
sub sym { 'AI::MXNet::Symbol' }
sub symbol { 'AI::MXNet::Symbol' }
sub init { 'AI::MXNet::Initializer' }
sub initializer { 'AI::MXNet::Initializer' }
sub optimizer { 'AI::MXNet::Optimizer' }
sub opt { 'AI::MXNet::Optimizer' }
sub rnd { 'AI::MXNet::Random' }
sub random { 'AI::MXNet::Random' }
sub Context { shift; AI::MXNet::Context->new(\@_) }
sub context { 'AI::MXNet::Context' }
sub cpu { AI::MXNet::Context->cpu(\$_[1]//0) }
sub cpu_pinned { AI::MXNet::Context->cpu_pinned(\$_[1]//0) }
sub gpu { AI::MXNet::Context->gpu(\$_[1]//0) }
sub kv { 'AI::MXNet::KVStore' }
sub recordio { 'AI::MXNet::RecordIO' }
sub io { 'AI::MXNet::IO' }
sub metric { 'AI::MXNet::Metric' }
sub mod { 'AI::MXNet::Module' }
sub module { 'AI::MXNet::Module' }
sub mon { 'AI::MXNet::Monitor' }
sub viz { 'AI::MXNet::Visualization' }
sub rnn { 'AI::MXNet::RNN' }
sub callback { 'AI::MXNet::Callback' }
sub img { 'AI::MXNet::Image' }
sub image { 'AI::MXNet::Image' }
sub contrib { 'AI::MXNet::Contrib' }
sub linalg { 'AI::MXNet::LinAlg' }
sub autograd { 'AI::MXNet::AutoGrad' }
sub engine { 'AI::MXNet::Engine' }
sub name { '$short_name' }
sub rtc { '$short_name' }
sub gluon { 'AI::MXNet::Gluon' }
sub CudaModule { shift; AI::MXNet::CudaModule->new(\@_) }
sub AttrScope { shift; AI::MXNet::Symbol::AttrScope->new(\@_) }
*AI::MXNet::Symbol::AttrScope::current = sub { \$${short_name}::AttrScope; };
\$${short_name}::AttrScope = AI::MXNet::Symbol::AttrScope->new;
sub Prefix { AI::MXNet::Symbol::Prefix->new(prefix => \$_[1]) }
*AI::MXNet::Symbol::NameManager::current = sub { \$${short_name}::NameManager; };
*AI::MXNet::Symbol::NameManager::set_current = sub { \$${short_name}::NameManager = \$_[1]; };
\$${short_name}::NameManager = AI::MXNet::Symbol::NameManager->new;
*AI::MXNet::Context::current_ctx = sub { \$${short_name}::Context; };
*AI::MXNet::Context::current_context = sub { \$${short_name}::Context; };
*AI::MXNet::Context::set_current = sub { \$${short_name}::Context = \$_[1]; };
\$${short_name}::Context = AI::MXNet::Context->new(device_type => 'cpu', device_id => 0);
package nd;
\@nd::ISA = ('AI::MXNet::NDArray');
1;
EOP
eval $short_name_package;
}
}
}
sub cpu { AI::MXNet::Context->cpu($_[1]//0) }
sub cpu_pinned { AI::MXNet::Context->cpu_pinned($_[1]//0) }
sub gpu { AI::MXNet::Context->gpu($_[1]//0) }
sub name { __PACKAGE__ }
sub rtc { __PACKAGE__ }
sub Prefix { AI::MXNet::Symbol::Prefix->new(prefix => $_[1]) }
our $AttrScope = AI::MXNet::Symbol::AttrScope->new;
our $NameManager = AI::MXNet::Symbol::NameManager->new;
our $Context = AI::MXNet::Context->new(device_type => 'cpu', device_id => 0);

1;
__END__
19 changes: 1 addition & 18 deletions perl-package/AI-MXNet/lib/AI/MXNet/AutoGrad.pm
@@ -18,29 +18,12 @@
package AI::MXNet::AutoGrad;
use strict;
use warnings;
use AI::MXNet::NS 'global';
use AI::MXNet::Base;
use AI::MXNet::Function::Parameters;
use Scalar::Util qw(blessed);
use Carp qw(confess);

sub import
{
my ($class, $short_name) = @_;
if($short_name)
{
$short_name =~ s/[^\w:]//g;
if(length $short_name)
{
my $short_name_package =<<"EOP";
package $short_name;
use parent 'AI::MXNet::AutoGrad';
1;
EOP
eval $short_name_package;
}
}
}

=head1 NAME
AI::MXNet::AutoGrad - Autograd for NDArray.
