This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Commit

…to develop/add-higher-order/sinh-cosh
kshitij12345 committed Jul 11, 2019
2 parents a9ce8f3 + 68460a9 commit cb051fe
Showing 77 changed files with 1,219 additions and 396 deletions.
2 changes: 1 addition & 1 deletion 3rdparty/mkldnn
7 changes: 4 additions & 3 deletions benchmark/opperf/opperf.py
@@ -125,8 +125,9 @@ def main():
'output file.')

args = parser.parse_args()
logging.info(f"Running MXNet operator benchmarks with the following options: {args}")
assert not os.path.isfile(args.output_file), f"Output file {args.output_file} already exists."
logging.info("Running MXNet operator benchmarks with the following options: {args}".format(args=args))
assert not os.path.isfile(args.output_file),\
"Output file {output_file} already exists.".format(output_file=args.output_file)

# 2. RUN BENCHMARKS
ctx = _parse_mxnet_context(args.ctx)
@@ -140,7 +141,7 @@ def main():
# 4. Generate list of MXNet operators not covered in benchmarks
ops_not_covered = get_operators_with_no_benchmark(final_benchmark_results.keys())
for idx, op in enumerate(ops_not_covered):
print(f"{idx}. {op}")
print("{idx}. {op}".format(idx=idx, op=op))

return 0

4 changes: 2 additions & 2 deletions benchmark/opperf/utils/benchmark_utils.py
@@ -55,14 +55,14 @@ def _run_nd_operator_performance_test(op, inputs, run_backward, warmup, runs, kw

# Run Benchmarks
op_benchmark_result = {op.__name__: []}
logging.info(f"Begin Benchmark - {op.__name__}")
logging.info("Begin Benchmark - {name}".format(name=op.__name__))
for idx, kwargs in enumerate(kwargs_list):
_, profiler_output = benchmark_helper_func(op, runs, **kwargs)

# Add inputs used for profiling this operator into result
profiler_output["inputs"] = inputs[idx]
op_benchmark_result[op.__name__].append(profiler_output)
logging.info(f"Complete Benchmark - {op.__name__}")
logging.info("Complete Benchmark - {name}".format(name=op.__name__))
return op_benchmark_result


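
The two Python diffs above replace f-string literals with equivalent `str.format` calls, presumably so the benchmark scripts also run on interpreters older than Python 3.6, where f-strings are not available. A minimal sketch of the equivalence; the sample value is illustrative:

```python
# Illustrative only: both styles produce the same string.
args = "ctx=cpu, warmup=25, runs=100"

fstring_msg = f"Running MXNet operator benchmarks with the following options: {args}"
format_msg = "Running MXNet operator benchmarks with the following options: {args}".format(args=args)

assert fstring_msg == format_msg
```
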
16 changes: 8 additions & 8 deletions ci/docker/runtime_functions.sh
@@ -952,13 +952,6 @@ unittest_ubuntu_python3_quantization_gpu() {
nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_quantization_gpu.xml --verbose tests/python/quantization_gpu
}

unittest_ubuntu_cpu_scala() {
set -ex
scala_prepare
cd scala-package
mvn -B integration-test
}

unittest_centos7_cpu_scala() {
set -ex
cd /work/mxnet
@@ -1156,12 +1149,19 @@ integrationtest_ubuntu_cpu_dist_kvstore() {
../../tools/launch.py -n 3 --launcher local python test_server_profiling.py
}

integrationtest_ubuntu_cpu_scala() {
set -ex
scala_prepare
cd scala-package
mvn -B verify -DskipTests=false
}

integrationtest_ubuntu_gpu_scala() {
set -ex
scala_prepare
cd scala-package
export SCALA_TEST_ON_GPU=1
mvn -B integration-test -DskipTests=false
mvn -B verify -DskipTests=false
}

integrationtest_ubuntu_gpu_dist_kvstore() {
4 changes: 2 additions & 2 deletions ci/jenkins/Jenkins_steps.groovy
@@ -941,7 +941,7 @@ def test_unix_scala_cpu() {
ws('workspace/ut-scala-cpu') {
timeout(time: max_time, unit: 'MINUTES') {
utils.unpack_and_init('cpu', mx_lib, true)
utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_scala', false)
utils.docker_run('ubuntu_cpu', 'integrationtest_ubuntu_cpu_scala', false)
utils.publish_test_coverage()
}
}
@@ -955,7 +955,7 @@ def test_unix_scala_mkldnn_cpu(){
ws('workspace/ut-scala-mkldnn-cpu') {
timeout(time: max_time, unit: 'MINUTES') {
utils.unpack_and_init('mkldnn_cpu', mx_mkldnn_lib, true)
utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_scala', false)
utils.docker_run('ubuntu_cpu', 'integrationtest_ubuntu_cpu_scala', false)
utils.publish_test_coverage()
}
}
5 changes: 3 additions & 2 deletions docs/install/build_from_source.md
@@ -123,8 +123,7 @@ You can set the BLAS library explicitly by setting the BLAS variable to:

See the [cmake/ChooseBLAS.cmake](https://github.com/apache/incubator-mxnet/blob/master/cmake/ChooseBlas.cmake) file for the options.

Intel's MKL (Math Kernel Library) is one of the most powerful math libraries
https://software.intel.com/en-us/mkl
[Intel's MKL (Math Kernel Library)](https://software.intel.com/en-us/mkl) is one of the most powerful math libraries

It has following flavors:

@@ -144,6 +143,8 @@ shipped as a subrepo with MXNet source code (see 3rdparty/mkldnn or the [MKL-DNN
Since the full MKL library is almost always faster than any other BLAS library it's turned on by default,
however it needs to be downloaded and installed manually before doing `cmake` configuration.
Register and download on the [Intel performance libraries website](https://software.intel.com/en-us/performance-libraries).
You can also install MKL through [YUM](https://software.intel.com/en-us/articles/installing-intel-free-libs-and-python-yum-repo)
or [APT](https://software.intel.com/en-us/articles/installing-intel-free-libs-and-python-apt-repo) Repository.

Note: MKL is supported only for desktop builds and the framework itself supports the following
hardware:
49 changes: 47 additions & 2 deletions docs/tutorials/amp/amp_tutorial.md
@@ -17,7 +17,7 @@

# Using AMP (Automatic Mixed Precision) in MXNet

Training Deep Learning networks is a very computationally intensive task. Novel model architectures tend to have increasing number of layers and parameters, which slows down training. Fortunately, new generations of training hardware as well as software optimizations, make it a feasible task.
Training Deep Learning networks is a very computationally intensive task. Novel model architectures tend to have increasing number of layers and parameters, which slows down training. Fortunately, new generations of training hardware as well as software optimizations, make it a feasible task.

However, where most of the (both hardware and software) optimization opportunities exists is in exploiting lower precision (like FP16) to, for example, utilize Tensor Cores available on new Volta and Turing GPUs. While training in FP16 showed great success in image classification tasks, other more complicated neural networks typically stayed in FP32 due to difficulties in applying the FP16 training guidelines.

@@ -253,7 +253,10 @@ We got 60% speed increase from 3 additional lines of code!
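
For reference, the "3 additional lines" mentioned in the hunk context above are most likely the AMP initialization, trainer hookup, and loss-scaling calls introduced earlier in this tutorial. A hypothetical minimal training-loop sketch follows; the tiny `Dense` network, dummy data, and hyper-parameters are placeholders, and a GPU is assumed:

```python
import mxnet as mx
from mxnet import autograd, gluon
from mxnet.contrib import amp

amp.init()  # (1) patch NDArray/Symbol operators for mixed precision

ctx = mx.gpu(0)
net = gluon.nn.Dense(10)
net.initialize(ctx=ctx)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.01})
amp.init_trainer(trainer)  # (2) enable dynamic loss scaling in the trainer

loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
data = mx.nd.random.uniform(shape=(8, 16), ctx=ctx)
label = mx.nd.zeros((8,), ctx=ctx)

with autograd.record():
    loss = loss_fn(net(data), label)
    with amp.scale_loss(loss, trainer) as scaled_loss:  # (3) scale the loss before backward
        autograd.backward(scaled_loss)
trainer.step(data.shape[0])
```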

## Inference with AMP

To do inference with mixed precision for a trained model in FP32, you can use the conversion APIs: `amp.convert_model` for symbolic model and `amp.convert_hybrid_block` for gluon models. The conversion APIs will take the FP32 model as input and will return a mixed precision model, which can be used to run inference. Below, we demonstrate for a gluon model and a symbolic model: 1. Conversion from FP32 model to mixed precision model 2. Run inference on the mixed precision model.
To run inference in mixed precision with a model trained in FP32, you can use the conversion APIs: `amp.convert_model` for symbolic models and `amp.convert_hybrid_block` for Gluon models. The conversion APIs take the FP32 model as input and return a mixed precision model that can be used to run inference.
Below, we demonstrate the following for both a Gluon model and a symbolic model:
- Conversion from FP32 model to mixed precision model.
- Run inference on the mixed precision model.

```python
with mx.Context(mx.gpu(0)):
Expand Down Expand Up @@ -289,6 +292,48 @@ with mx.Context(mx.gpu(0)):
print("Conversion and Inference completed successfully")
```

You can also customize which operators run in FP16 versus FP32, or make certain operators run in FP32 only conditionally.
In addition, you can force-cast the parameters to FP16 wherever possible. Below is an example that demonstrates both of these use cases
for a symbolic model. You can do the same for a Gluon hybrid block with the `amp.convert_hybrid_block` API and its `cast_optional_params` flag, as shown in the sketch after the example below.

```python
with mx.Context(mx.gpu(0)):
# Below is an example of converting a symbolic model to a mixed precision model
# with only Convolution op being force casted to FP16.
dir_path = os.path.dirname(os.path.realpath(__file__))
model_path = os.path.join(dir_path, 'model')
if not os.path.isdir(model_path):
os.mkdir(model_path)
prefix, epoch = mx.test_utils.download_model("imagenet1k-resnet-18", dst_dir=model_path)
sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)

# All Convolution ops should run in FP16, SoftmaxOutput and FullyConnected should run in FP32
# cast_optional_params=True: Force cast params to FP16 wherever possible
result_sym, result_arg_params, result_aux_params = amp.convert_model(sym,
arg_params,
aux_params,
target_dtype_ops=["Convolution"],
fp32_ops=["SoftmaxOutput", "FullyConnected"],
cast_optional_params=True)

# Run dummy inference with the converted symbolic model
mod = mx.mod.Module(result_sym, data_names=["data"], label_names=["softmax_label"], context=mx.current_context())
mod.bind(data_shapes=[['data', (1, 3, 224, 224)]], label_shapes=[['softmax_label', (1,)]])
mod.set_params(result_arg_params, result_aux_params)
mod.forward(mx.io.DataBatch(data=[mx.nd.ones((1, 3, 224, 224))],
label=[mx.nd.ones((1,))]))
mod.get_outputs()[0].wait_to_read()

# Assert that the params for conv are in FP16, this is because cast_optional_params is set to True
assert mod._arg_params["conv0_weight"].dtype == np.float16
# FullyConnected params stay in FP32
assert mod._arg_params["fc1_bias"].dtype == np.float32

print("Conversion and Inference completed successfully")

# Serialize AMP model and save to disk
mod.save_checkpoint("amp_tutorial_model", 0, remove_amp_cast=False)
```
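
As noted above, the same customization is available for Gluon models through `amp.convert_hybrid_block`. Below is a minimal sketch rather than part of the original tutorial; it assumes a GPU is available, and the ResNet-18 model and input shape are illustrative choices:

```python
import mxnet as mx
from mxnet import gluon
from mxnet.contrib import amp

with mx.Context(mx.gpu(0)):
    # Build and hybridize a Gluon model (the model choice is illustrative).
    net = gluon.model_zoo.vision.resnet18_v1()
    net.collect_params().initialize(ctx=mx.current_context())
    net.hybridize()
    # One forward pass so the cached graph exists before conversion.
    net(mx.nd.ones((1, 3, 224, 224)))

    # Convert to mixed precision; cast_optional_params=True force-casts the
    # parameters to FP16 wherever possible, mirroring the symbolic example above.
    net_fp16 = amp.convert_hybrid_block(net, cast_optional_params=True)

    out = net_fp16(mx.nd.ones((1, 3, 224, 224)))
    out.wait_to_read()
    print("Gluon conversion and inference completed successfully")
```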


## Current limitations of AMP
4 changes: 3 additions & 1 deletion docs/tutorials/gluon/info_gan.md
@@ -339,9 +339,11 @@ with SummaryWriter(logdir='./logs/') as sw:
fake_image = generator(g_input)

sw.add_scalar(tag='Loss_D', value={'test':d_error_epoch.asscalar()/count}, global_step=counter)
sw.add_scalar(tag='Loss_G', value={'test':d_error_epoch.asscalar()/count}, global_step=counter)
sw.add_scalar(tag='Loss_G', value={'test':g_error_epoch.asscalar()/count}, global_step=counter)
sw.add_image(tag='data_image', image=((fake_image[0]+ 1.0) * 127.5).astype(np.uint8) , global_step=counter)
sw.flush()

counter += 1

discriminator.save_parameters("infogan_d_latest.params")
generator.save_parameters("infogan_g_latest.params")
2 changes: 1 addition & 1 deletion docs/tutorials/mkldnn/MKLDNN_README.md
@@ -214,7 +214,7 @@ With MKL BLAS, the performace is expected to furtherly improved with variable ra
You can redistribute not only dynamic libraries but also headers, examples and static libraries on accepting the license [Intel Simplified license](https://software.intel.com/en-us/license/intel-simplified-software-license).
Installing the full MKL installation enables MKL support for all operators under the linalg namespace.

1. Download and install the latest full MKL version following instructions on the [intel website.](https://software.intel.com/en-us/mkl)
1. Download and install the latest full MKL version following instructions on the [intel website.](https://software.intel.com/en-us/mkl) You can also install MKL through [YUM](https://software.intel.com/en-us/articles/installing-intel-free-libs-and-python-yum-repo) or [APT](https://software.intel.com/en-us/articles/installing-intel-free-libs-and-python-apt-repo) Repository.

2. Run `make -j ${nproc} USE_BLAS=mkl`

2 changes: 2 additions & 0 deletions docs/tutorials/mkldnn/operator_list.md
@@ -44,6 +44,8 @@ To help users understanding MKL-DNN backend better, the following table summariz
| **elemwise_add** | 1D-4D input | Y | Y | Y |
| **Concat** | 1D-4D input | Y | Y | Y |
| **slice** | 1D-4D input | N | Y | N |
| **Reshape** | 1D-4D input | N | Y | N |
| **Flatten** | 1D-4D input | N | Y | N |
| **Quantization** | 1D-4D input | N | N | Y |
| **Dequantization** | 1D-4D input | N | N | Y |
| **Requantization** | 1D-4D input | N | N | Y |
116 changes: 31 additions & 85 deletions perl-package/AI-MXNet/lib/AI/MXNet.pm
@@ -19,105 +19,51 @@ package AI::MXNet;
use v5.14.0;
use strict;
use warnings;
use AI::MXNet::NS 'global';
use AI::MXNet::Base;
use AI::MXNet::Callback;
use AI::MXNet::NDArray;
use AI::MXNet::Symbol;
use AI::MXNet::Callback 'callback';
use AI::MXNet::NDArray qw(nd ndarray);
use AI::MXNet::Context 'context';
use AI::MXNet::Symbol qw(sym symbol);
use AI::MXNet::Executor;
use AI::MXNet::Executor::Group;
use AI::MXNet::CudaModule;
use AI::MXNet::Random;
use AI::MXNet::Initializer;
use AI::MXNet::Optimizer;
use AI::MXNet::KVStore;
use AI::MXNet::Random qw(rnd random);
use AI::MXNet::Initializer qw(init initializer);
use AI::MXNet::Optimizer qw(optimizer opt);
use AI::MXNet::KVStore 'kv';
use AI::MXNet::KVStoreServer;
use AI::MXNet::IO;
use AI::MXNet::Metric;
use AI::MXNet::IO 'io';
use AI::MXNet::Metric 'metric';
use AI::MXNet::LRScheduler;
use AI::MXNet::Monitor;
use AI::MXNet::Monitor 'mon';
use AI::MXNet::Profiler;
use AI::MXNet::Module::Base;
use AI::MXNet::Module;
use AI::MXNet::Module qw(mod module);
use AI::MXNet::Module::Bucketing;
use AI::MXNet::RNN;
use AI::MXNet::Visualization;
use AI::MXNet::RecordIO;
use AI::MXNet::Image;
use AI::MXNet::Contrib;
use AI::MXNet::LinAlg;
use AI::MXNet::RNN 'rnn';
use AI::MXNet::Visualization 'viz';
use AI::MXNet::RecordIO 'recordio';
use AI::MXNet::Image qw(img image);
use AI::MXNet::Contrib 'contrib';
use AI::MXNet::LinAlg 'linalg';
use AI::MXNet::CachedOp;
use AI::MXNet::AutoGrad;
use AI::MXNet::Gluon;
use AI::MXNet::AutoGrad 'autograd';
use AI::MXNet::Gluon 'gluon';
use AI::MXNet::NDArray::Sparse;
use AI::MXNet::Symbol::Sparse;
use AI::MXNet::Engine;
use AI::MXNet::Engine 'engine';
our $VERSION = '1.4';

sub import
{
my ($class, $short_name) = @_;
if($short_name)
{
$short_name =~ s/[^\w:]//g;
if(length $short_name)
{
my $short_name_package =<<"EOP";
package $short_name;
no warnings 'redefine';
sub nd { 'AI::MXNet::NDArray' }
sub ndarray { 'AI::MXNet::NDArray' }
sub sym { 'AI::MXNet::Symbol' }
sub symbol { 'AI::MXNet::Symbol' }
sub init { 'AI::MXNet::Initializer' }
sub initializer { 'AI::MXNet::Initializer' }
sub optimizer { 'AI::MXNet::Optimizer' }
sub opt { 'AI::MXNet::Optimizer' }
sub rnd { 'AI::MXNet::Random' }
sub random { 'AI::MXNet::Random' }
sub Context { shift; AI::MXNet::Context->new(\@_) }
sub context { 'AI::MXNet::Context' }
sub cpu { AI::MXNet::Context->cpu(\$_[1]//0) }
sub cpu_pinned { AI::MXNet::Context->cpu_pinned(\$_[1]//0) }
sub gpu { AI::MXNet::Context->gpu(\$_[1]//0) }
sub kv { 'AI::MXNet::KVStore' }
sub recordio { 'AI::MXNet::RecordIO' }
sub io { 'AI::MXNet::IO' }
sub metric { 'AI::MXNet::Metric' }
sub mod { 'AI::MXNet::Module' }
sub module { 'AI::MXNet::Module' }
sub mon { 'AI::MXNet::Monitor' }
sub viz { 'AI::MXNet::Visualization' }
sub rnn { 'AI::MXNet::RNN' }
sub callback { 'AI::MXNet::Callback' }
sub img { 'AI::MXNet::Image' }
sub image { 'AI::MXNet::Image' }
sub contrib { 'AI::MXNet::Contrib' }
sub linalg { 'AI::MXNet::LinAlg' }
sub autograd { 'AI::MXNet::AutoGrad' }
sub engine { 'AI::MXNet::Engine' }
sub name { '$short_name' }
sub rtc { '$short_name' }
sub gluon { 'AI::MXNet::Gluon' }
sub CudaModule { shift; AI::MXNet::CudaModule->new(\@_) }
sub AttrScope { shift; AI::MXNet::Symbol::AttrScope->new(\@_) }
*AI::MXNet::Symbol::AttrScope::current = sub { \$${short_name}::AttrScope; };
\$${short_name}::AttrScope = AI::MXNet::Symbol::AttrScope->new;
sub Prefix { AI::MXNet::Symbol::Prefix->new(prefix => \$_[1]) }
*AI::MXNet::Symbol::NameManager::current = sub { \$${short_name}::NameManager; };
*AI::MXNet::Symbol::NameManager::set_current = sub { \$${short_name}::NameManager = \$_[1]; };
\$${short_name}::NameManager = AI::MXNet::Symbol::NameManager->new;
*AI::MXNet::Context::current_ctx = sub { \$${short_name}::Context; };
*AI::MXNet::Context::current_context = sub { \$${short_name}::Context; };
*AI::MXNet::Context::set_current = sub { \$${short_name}::Context = \$_[1]; };
\$${short_name}::Context = AI::MXNet::Context->new(device_type => 'cpu', device_id => 0);
package nd;
\@nd::ISA = ('AI::MXNet::NDArray');
1;
EOP
eval $short_name_package;
}
}
}
sub cpu { AI::MXNet::Context->cpu($_[1]//0) }
sub cpu_pinned { AI::MXNet::Context->cpu_pinned($_[1]//0) }
sub gpu { AI::MXNet::Context->gpu($_[1]//0) }
sub name { __PACKAGE__ }
sub rtc { __PACKAGE__ }
sub Prefix { AI::MXNet::Symbol::Prefix->new(prefix => $_[1]) }
our $AttrScope = AI::MXNet::Symbol::AttrScope->new;
our $NameManager = AI::MXNet::Symbol::NameManager->new;
our $Context = AI::MXNet::Context->new(device_type => 'cpu', device_id => 0);

1;
__END__
19 changes: 1 addition & 18 deletions perl-package/AI-MXNet/lib/AI/MXNet/AutoGrad.pm
@@ -18,29 +18,12 @@
package AI::MXNet::AutoGrad;
use strict;
use warnings;
use AI::MXNet::NS 'global';
use AI::MXNet::Base;
use AI::MXNet::Function::Parameters;
use Scalar::Util qw(blessed);
use Carp qw(confess);

sub import
{
my ($class, $short_name) = @_;
if($short_name)
{
$short_name =~ s/[^\w:]//g;
if(length $short_name)
{
my $short_name_package =<<"EOP";
package $short_name;
use parent 'AI::MXNet::AutoGrad';
1;
EOP
eval $short_name_package;
}
}
}

=head1 NAME
AI::MXNet::AutoGrad - Autograd for NDArray.
